annotate auditok/core.py @ 177:2acbdbd18327

Implement a generator version for tokenize
author Amine Sehili <amine.sehili@gmail.com>
date Sat, 16 Mar 2019 18:28:23 +0100
parents 382f30f8dab5
children 11885f96acb2
rev   line source
amine@33 1 """
amine@33 2 This module gathers processing (i.e. tokenization) classes.
amine@33 3
amine@33 4 Class summary
amine@33 5 =============
amine@33 6
amine@33 7 .. autosummary::
amine@33 8
amine@33 9 StreamTokenizer
amine@33 10 """
amine@33 11
amine@2 12 from auditok.util import DataValidator
amine@96 13 from auditok.io import check_audio_data
amine@2 14
amine@81 15 __all__ = ["AudioRegion", "StreamTokenizer"]
amine@81 16
amine@81 17
amine@81 18 class AudioRegion(object):
amine@81 19
amine@81 20 def __init__(self, data, start, sampling_rate, sample_width, channels):
amine@81 21 """
amine@81 22 A class for detected audio events.
amine@81 23
amine@81 24 :Parameters:
amine@81 25
amine@81 26 data: bytes
amine@81 27 audio data
amine@81 28 start: float
amine@81 29 start time in seconds
amine@81 30 sampling_rate: int
amine@81 31 sampling rate of audio data
amine@81 32 sample_width: int
amine@81 33 number of bytes of one audio sample
amine@81 34 channels: int
amine@81 35 number of channels of audio data
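:Example:

A minimal usage sketch (hypothetical values: 16 kHz, 16-bit, mono audio,
one second of silence starting at 2.5 seconds):

.. code:: python

    region = AudioRegion(data=b"\x00" * 32000, start=2.5,
                         sampling_rate=16000, sample_width=2, channels=1)
    region.duration  # 1.0 second: len(data) / (16000 * 2 * 1)
    len(region)      # 1000 milliseconds
    region.end       # 3.5 seconds (start + duration)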
amine@81 36 """
amine@96 37 check_audio_data(data, sample_width, channels)
amine@81 38 self._data = data
amine@81 39 self._start = start
amine@81 40 self._sampling_rate = sampling_rate
amine@81 41 self._sample_width = sample_width
amine@81 42 self._channels = channels
amine@81 43
amine@81 44 @property
amine@81 45 def start(self):
amine@81 46 return self._start
amine@81 47
amine@81 48 @property
amine@81 49 def end(self):
amine@85 50 return self.start + self.duration
amine@81 51
amine@81 52 @property
amine@82 53 def duration(self):
amine@85 54 """
amine@85 55 Returns region duration in seconds.
amine@85 56 """
amine@85 57 return len(self._data) / (self.sampling_rate *
amine@85 58 self.sample_width *
amine@85 59 self.channels)
amine@82 60
amine@82 61 @property
amine@81 62 def sampling_rate(self):
amine@85 63 return self._sampling_rate
amine@81 64
amine@81 65 @property
amine@81 66 def sr(self):
amine@81 67 return self._sampling_rate
amine@81 68
amine@81 69 @property
amine@81 70 def sample_width(self):
amine@81 71 return self._sample_width
amine@81 72
amine@81 73 @property
amine@81 74 def sw(self):
amine@81 75 return self._sample_width
amine@81 76
amine@81 77 @property
amine@81 78 def channels(self):
amine@81 79 return self._channels
amine@81 80
amine@81 81 @property
amine@81 82 def ch(self):
amine@81 83 return self._channels
amine@2 84
amine@82 85 def __len__(self):
amine@85 86 """
amine@85 87 Returns region duration in milliseconds.
amine@85 88 """
amine@85 89 return round(self.duration * 1000)
amine@82 90
amine@83 91 def __bytes__(self):
amine@83 92 return self._data
amine@83 93
amine@83 94 def __repr__(self):
amine@83 95 return ('AudioRegion(data, start={:.3f}, end={:.3f}, '
amine@83 96 'sampling_rate={}, sample_width={}, channels={})'.format(self.start,
amine@83 97 self.end,
amine@83 98 self.sr,
amine@83 99 self.sw,
amine@83 100 self.ch))
amine@83 101
amine@83 102 def __str__(self):
amine@83 103 return 'AudioRegion(start={:.3f}, end={:.3f}, duration={:.3f})'.format(self.start,
amine@83 104 self.end,
amine@83 105 self.duration)
amine@83 106
amine@87 107 def __add__(self, other):
amine@87 108 """
amine@87 109 Concatenates this region and `other` and returns a new region.
amine@87 110 Both regions must have the same sampling rate, sample width
amine@87 111 and number of channels. If not, raises a `ValueError`.
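:Example:

A minimal sketch with two hypothetical regions that share the same audio
parameters; the result keeps the start time of the left operand:

.. code:: python

    region_1 = AudioRegion(b"\x00" * 8000, 0.0, 8000, 2, 1)  # 0.5 s
    region_2 = AudioRegion(b"\x00" * 8000, 0.5, 8000, 2, 1)  # 0.5 s
    region_3 = region_1 + region_2
    region_3.duration  # 1.0
    region_3.start     # 0.0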
amine@87 112 """
amine@87 113 if not isinstance(other, AudioRegion):
amine@87 114 raise TypeError('Can only concatenate AudioRegion, '
amine@87 115 'not "{}"'.format(type(other)))
amine@87 116 if other.sr != self.sr:
amine@87 117 raise ValueError('Can only concatenate AudioRegions of the same '
amine@87 118 'sampling rate ({} != {})'.format(self.sr,
amine@87 119 other.sr))
amine@87 120 if other.sw != self.sw:
amine@87 121 raise ValueError('Can only concatenate AudioRegions of the same '
amine@87 122 'sample width ({} != {})'.format(self.sw,
amine@87 123 other.sw))
amine@87 124 if other.ch != self.ch:
amine@87 125 raise ValueError('Can only concatenate AudioRegions of the same '
amine@87 126 'number of channels ({} != {})'.format(self.ch,
amine@87 127 other.ch))
amine@87 128 data = self._data + other._data
amine@87 129 return AudioRegion(data, self.start, self.sr, self.sw, self.ch)
amine@87 130
amine@87 131 def __radd__(self, other):
amine@87 132 """
amine@87 133 Concatenates `other` and this region. `other` should be an
amine@87 134 `AudioRegion` with the same audio parameters as this region
amine@87 135 but can exceptionally be `0` to make it possible to concatenate
amine@87 136 many regions with `sum`.
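:Example:

A minimal sketch, assuming `regions` is a list of `AudioRegion` objects
with identical audio parameters:

.. code:: python

    # `sum` starts with 0, which is why 0 is accepted as a left operand
    whole_region = sum(regions)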
amine@87 137 """
amine@87 138 if other == 0:
amine@87 139 return self
amine@87 140 return other.__add__(self)
amine@87 141
amine@2 142
amine@2 143 class StreamTokenizer():
amine@32 144 """
amine@32 145 Class for stream tokenizers. It implements a 4-state automaton scheme
amine@32 146 to extract sub-sequences of interest on the fly.
amine@67 147
amine@32 148 :Parameters:
amine@67 149
amine@5 150 `validator` :
amine@5 151 instance of `DataValidator` that implements the `is_valid` method.
amine@67 152
amine@5 153 `min_length` : *(int)*
amine@5 154 Minimum number of frames of a valid token. This includes all \
amine@5 155 tolerated non valid frames within the token.
amine@67 156
amine@5 157 `max_length` : *(int)*
amine@5 158 Maximum number of frames of a valid token. This includes all \
amine@5 159 tolerated non valid frames within the token.
amine@67 160
amine@5 161 `max_continuous_silence` : *(int)*
amine@5 162 Maximum number of consecutive non-valid frames within a token.
amine@5 163 Note that, within a valid token, there may be many tolerated \
amine@5 164 *silent* regions, each containing a number of non-valid frames up to \
amine@5 165 `max_continuous_silence`.
amine@67 166
amine@5 167 `init_min` : *(int, default=0)*
amine@5 168 Minimum number of consecutive valid frames that must be **initially** \
amine@5 169 gathered before any sequence of non-valid frames can be tolerated. This
amine@5 170 option is not always needed; it can be used to drop non-valid tokens as
amine@5 171 early as possible. **Default = 0** means that the option is
amine@5 172 ineffective by default.
amine@67 173
amine@5 174 `init_max_silence` : *(int, default=0)*
amine@5 175 Maximum number of tolerated consecutive non-valid frames if the \
amine@5 176 number of already gathered valid frames has not yet reached `init_min`.
amine@5 177 This argument is normally used together with `init_min`. **Default = 0**;
amine@5 178 by default this argument is not taken into consideration (see the last example below).
amine@67 179
amine@5 180 `mode` : *(int, default=0)*
amine@5 181 `mode` can be:
amine@67 182
amine@35 183 1. `StreamTokenizer.STRICT_MIN_LENGTH`:
amine@32 184 if token *i* is delivered because `max_length`
amine@32 185 is reached, and token *i+1* is immediately adjacent to
amine@32 186 token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
amine@32 187 at frame *k+1*) then accept token *i+1* only if it has a size of at
amine@32 188 least `min_length`. The default behavior is to accept token *i+1*
amine@32 189 even if it is shorter than `min_length` (given that the above conditions
amine@32 190 are fulfilled of course).
amine@67 191
amine@32 192 :Examples:
amine@67 193
amine@32 194 In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
amine@32 195 accepted although it is shorter than `min_length` (3), because it immediately
amine@32 196 follows the latest delivered token:
amine@67 197
amine@32 198 .. code:: python
amine@67 199
amine@32 200 from auditok import StreamTokenizer, StringDataSource, DataValidator
amine@67 201
amine@32 202 class UpperCaseChecker(DataValidator):
amine@32 203 def is_valid(self, frame):
amine@32 204 return frame.isupper()
amine@67 205
amine@67 206
amine@32 207 dsource = StringDataSource("aaaAAAABBbbb")
amine@32 208 tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
amine@32 209 min_length=3,
amine@32 210 max_length=4,
amine@32 211 max_continuous_silence=0)
amine@67 212
amine@32 213 tokenizer.tokenize(dsource)
amine@67 214
amine@32 215 :output:
amine@67 216
amine@32 217 .. code:: python
amine@67 218
amine@32 219 [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
amine@32 220
amine@32 221
amine@32 222 The following tokenizer will however reject the 'BB' token:
amine@67 223
amine@32 224 .. code:: python
amine@67 225
amine@32 226 dsource = StringDataSource("aaaAAAABBbbb")
amine@32 227 tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
amine@32 228 min_length=3, max_length=4,
amine@32 229 max_continuous_silence=0,
amine@32 230 mode=StreamTokenizer.STRICT_MIN_LENGTH)
amine@32 231 tokenizer.tokenize(dsource)
amine@67 232
amine@32 233 :output:
amine@67 234
amine@32 235 .. code:: python
amine@67 236
amine@32 237 [(['A', 'A', 'A', 'A'], 3, 6)]
amine@67 238
amine@67 239
amine@35 240 2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames
amine@32 241 from a token to be delivered if and only if it is not **truncated**.
amine@32 242 This can be a bit tricky. A token is actually delivered if:
amine@67 243
amine@32 244 - a. `max_continuous_silence` is reached
amine@67 245
amine@32 246 :or:
amine@67 247
amine@32 248 - b. Its length reaches `max_length`. This is called a **truncated** token
amine@67 249
amine@32 250 In the current implementation, a `StreamTokenizer`'s decision is only based on already seen
amine@32 251 data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
amine@32 252 frame (`max_length` is reached but `max_continuous_silence` not yet), any trailing
amine@32 253 silence will be kept because it can potentially be part of a valid token (if `max_length`
amine@32 254 were bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered
amine@32 255 token will not be considered as truncated but as the result of a *normal* end of detection
amine@32 256 (i.e. no more valid data). In that case the trailing silence can be removed if you use
amine@32 257 the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
amine@67 258
amine@32 259 :Example:
amine@67 260
amine@32 261 .. code:: python
amine@67 262
amine@32 263 tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
amine@32 264 max_length=6, max_continuous_silence=3,
amine@32 265 mode=StreamTokenizer.DROP_TRAILING_SILENCE)
amine@67 266
amine@32 267 dsource = StringDataSource("aaaAAAaaaBBbbbb")
amine@32 268 tokenizer.tokenize(dsource)
amine@67 269
amine@32 270 :output:
amine@67 271
amine@32 272 .. code:: python
amine@67 273
amine@32 274 [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
amine@67 275
amine@32 276 The first token is delivered with its trailing silence because it is truncated,
amine@32 277 while the second one has its trailing frames removed.
amine@67 278
amine@32 279 Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
amine@67 280
amine@32 281 .. code:: python
amine@67 282
amine@32 283 [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
amine@67 284
amine@67 285
amine@32 286 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
amine@32 287 use both options. That means: first remove trailing silence, then check if the
amine@32 288 token still has at least a length of `min_length`.
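:Example:

The following sketch illustrates `init_min` and `init_max_silence`, reusing the
`UpperCaseChecker` validator defined above. With `init_min=3` and
`init_max_silence=0`, a run of valid frames that is interrupted before 3 valid
frames have been gathered is dropped, so the short 'AA' burst produces no token:

.. code:: python

    dsource = StringDataSource("aAAaAAAa")
    tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                min_length=1, max_length=8,
                                max_continuous_silence=0,
                                init_min=3, init_max_silence=0)
    tokenizer.tokenize(dsource)

:output:

.. code:: python

    [(['A', 'A', 'A'], 4, 6)]

With the default `init_min=0`, the same input would also yield the token
`(['A', 'A'], 1, 2)`.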
amine@32 289 """
amine@67 290
amine@32 291 SILENCE = 0
amine@32 292 POSSIBLE_SILENCE = 1
amine@67 293 POSSIBLE_NOISE = 2
amine@32 294 NOISE = 3
amine@67 295
amine@32 296 STRICT_MIN_LENGTH = 2
amine@32 297 DROP_TRAILING_SILENCE = 4
amine@32 298 # alias
amine@32 299 DROP_TAILING_SILENCE = 4
amine@67 300
amine@67 301 def __init__(self, validator,
amine@32 302 min_length, max_length, max_continuous_silence,
amine@32 303 init_min=0, init_max_silence=0,
amine@32 304 mode=0):
amine@67 305
amine@2 306 if not isinstance(validator, DataValidator):
amine@2 307 raise TypeError("'validator' must be an instance of 'DataValidator'")
amine@67 308
amine@2 309 if max_length <= 0:
amine@2 310 raise ValueError("'max_length' must be > 0 (value={0})".format(max_length))
amine@67 311
amine@2 312 if min_length <= 0 or min_length > max_length:
amine@35 313 raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))
amine@67 314
amine@2 315 if max_continuous_silence >= max_length:
amine@35 316 raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))
amine@67 317
amine@5 318 if init_min >= max_length:
amine@35 319 raise ValueError("'init_min' must be < 'max_length' (value={0})".format(max_continuous_silence))
amine@67 320
amine@2 321 self.validator = validator
amine@2 322 self.min_length = min_length
amine@2 323 self.max_length = max_length
amine@2 324 self.max_continuous_silence = max_continuous_silence
amine@2 325 self.init_min = init_min
amine@2 326 self.init_max_silent = init_max_silence
amine@67 327
amine@2 328 self._mode = None
amine@2 329 self.set_mode(mode)
amine@2 330 self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
amine@67 331 self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
amine@67 332
amine@2 333 self._deliver = None
amine@2 334 self._tokens = None
amine@2 335 self._state = None
amine@2 336 self._data = None
amine@2 337 self._contiguous_token = False
amine@67 338
amine@2 339 self._init_count = 0
amine@2 340 self._silence_length = 0
amine@2 341 self._start_frame = 0
amine@2 342 self._current_frame = 0
amine@67 343
amine@2 344 def set_mode(self, mode):
amine@177 345 # TODO: use properties and make these deprecated
amine@2 346 """
amine@32 347 :Parameters:
amine@67 348
amine@32 349 `mode` : *(int)*
amine@32 350 New mode, must be one of:
amine@67 351
amine@67 352
amine@33 353 - `StreamTokenizer.STRICT_MIN_LENGTH`
amine@67 354
amine@33 355 - `StreamTokenizer.DROP_TRAILING_SILENCE`
amine@67 356
amine@33 357 - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`
amine@67 358
amine@177 359 - `0` TODO: this mode should have a name
amine@67 360
amine@3 361 See `StreamTokenizer.__init__` for more information about the mode.
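:Example:

.. code:: python

    # a possible call that combines both options
    tokenizer.set_mode(StreamTokenizer.STRICT_MIN_LENGTH |
                       StreamTokenizer.DROP_TRAILING_SILENCE)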
amine@2 362 """
amine@67 363
amine@32 364 if mode not in [self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE,
amine@67 365 self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0]:
amine@67 366
amine@2 367 raise ValueError("Wrong value for mode")
amine@67 368
amine@2 369 self._mode = mode
amine@2 370 self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
amine@67 371 self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
amine@67 372
amine@2 373 def get_mode(self):
amine@2 374 """
amine@2 375 Return the current mode. To check whether a specific mode is activated, use
amine@2 376 the bitwise 'and' operator `&`. Example:
amine@67 377
amine@32 378 .. code:: python
amine@67 379
amine@2 380 if mode & self.STRICT_MIN_LENGTH != 0:
amine@32 381 do_something()
amine@2 382 """
amine@2 383 return self._mode
amine@67 384
amine@2 385 def _reinitialize(self):
amine@2 386 self._contiguous_token = False
amine@2 387 self._data = []
amine@2 388 self._tokens = []
amine@2 389 self._state = self.SILENCE
amine@2 390 self._current_frame = -1
amine@2 391 self._deliver = self._append_token
amine@67 392
amine@177 393 def tokenize(self, data_source, callback=None, generator=False):
amine@2 394 """
amine@2 395 Read data from `data_source`, one frame at a time, and process the read frames in
amine@2 396 order to detect sequences of frames that make up valid tokens.
amine@67 397
amine@32 398 :Parameters:
amine@47 399 `data_source` : instance of the :class:`DataSource` class that implements a `read` method.
amine@32 400 'read' should return a slice of signal, i.e. a frame (of whatever \
amine@32 401 type as long as it can be processed by the validator) and None if \
amine@32 402 there is no more signal.
amine@67 403
amine@32 404 `callback` : an optional 3-argument function.
amine@32 405 If a `callback` function is given, it will be called each time a valid token
amine@32 406 is found.
amine@67 407 
amine@177 408 `generator` : *(bool, default=False)*
amine@177 409 If True, return a generator that yields tokens as they are found, instead of a list.
amine@67 410 
amine@32 411 :Returns:
amine@32 412 A list of tokens if `callback` is None and `generator` is False; a generator
amine@32 413 of tokens if `generator` is True; None otherwise. Each token is a tuple with the following elements:
amine@67 411
amine@32 412 .. code:: python
amine@67 413
amine@32 414 (data, start, end)
amine@67 415
amine@32 416 where `data` is a list of read frames, `start` is the index of the first frame in the
amine@32 417 original data and `end` is the index of the last frame.
amine@67 418
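:Example:

A minimal sketch of the three ways of consuming tokens, reusing the
`UpperCaseChecker` and `StringDataSource` from the class docstring examples:

.. code:: python

    tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                min_length=3, max_length=4,
                                max_continuous_silence=0)

    # 1. get all tokens at once, as a list
    tokens = tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"))

    # 2. iterate lazily over tokens as they are found
    for data, start, end in tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"),
                                               generator=True):
        print(data, start, end)

    # 3. pass a callback that is called once per token (tokenize returns None)
    def on_token(data, start, end):
        print(data, start, end)

    tokenizer.tokenize(StringDataSource("aaaAAAABBbbb"), callback=on_token)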
amine@2 419 """
amine@177 420 token_gen = self._iter_tokens(data_source)
amine@177 421 if callback:
amine@177 422 for token in token_gen:
amine@177 423 callback(*token)
amine@177 424 return
amine@177 425 if generator:
amine@177 426 return token_gen
amine@177 427 return list(token_gen)
amine@67 428
amine@177 429 def _iter_tokens(self, data_source):
amine@2 430 self._reinitialize()
amine@2 431 while True:
amine@67 432 frame = data_source.read()
amine@177 433 self._current_frame += 1
amine@47 434 if frame is None:
amine@177 435 token = self._post_process()
amine@177 436 if token is not None:
amine@177 437 yield token
amine@2 438 break
amine@177 439 token = self._process(frame)
amine@177 440 if token is not None:
amine@177 441 yield token
amine@67 442
amine@2 443 def _process(self, frame):
amine@67 444
amine@2 445 frame_is_valid = self.validator.is_valid(frame)
amine@67 446
amine@2 447 if self._state == self.SILENCE:
amine@67 448
amine@2 449 if frame_is_valid:
amine@2 450 # seems we got a valid frame after a silence
amine@2 451 self._init_count = 1
amine@2 452 self._silence_length = 0
amine@2 453 self._start_frame = self._current_frame
amine@2 454 self._data.append(frame)
amine@67 455
amine@67 456 if self._init_count >= self.init_min:
amine@2 457 self._state = self.NOISE
amine@2 458 if len(self._data) >= self.max_length:
amine@177 459 return self._process_end_of_detection(True)
amine@2 460 else:
amine@2 461 self._state = self.POSSIBLE_NOISE
amine@67 462
amine@2 463 elif self._state == self.POSSIBLE_NOISE:
amine@67 464
amine@2 465 if frame_is_valid:
amine@2 466 self._silence_length = 0
amine@2 467 self._init_count += 1
amine@2 468 self._data.append(frame)
amine@67 469 if self._init_count >= self.init_min:
amine@2 470 self._state = self.NOISE
amine@2 471 if len(self._data) >= self.max_length:
amine@177 472 return self._process_end_of_detection(True)
amine@67 473
amine@67 474 else:
amine@2 475 self._silence_length += 1
amine@2 476 if self._silence_length > self.init_max_silent or \
amine@67 477 len(self._data) + 1 >= self.max_length:
amine@2 478 # either init_max_silence or max_length is reached
amine@2 479 # before init_min valid frames are gathered; back to silence
amine@2 480 self._data = []
amine@2 481 self._state = self.SILENCE
amine@2 482 else:
amine@2 483 self._data.append(frame)
amine@67 484
amine@2 485 elif self._state == self.NOISE:
amine@67 486
amine@2 487 if frame_is_valid:
amine@2 488 self._data.append(frame)
amine@2 489 if len(self._data) >= self.max_length:
amine@177 490 return self._process_end_of_detection(True)
amine@67 491
amine@67 492 elif self.max_continuous_silence <= 0:
amine@2 493 # max_continuous_silence (0) is reached at this frame; the token ends here
amine@2 494 # and will be delivered if long enough, or if contiguous with a truncated
amine@2 495 # token and STRICT_MIN_LENGTH is not set
amine@2 495 self._state = self.SILENCE
amine@177 496 return self._process_end_of_detection()
amine@2 497 else:
amine@2 498 # this is the first silent frame following a valid one
amine@2 499 # and it is tolerated
amine@2 500 self._silence_length = 1
amine@2 501 self._data.append(frame)
amine@2 502 self._state = self.POSSIBLE_SILENCE
amine@2 503 if len(self._data) == self.max_length:
amine@177 504 return self._process_end_of_detection(True)
amine@67 505 # don't reset _silence_length because we still
amine@2 506 # need to know the total number of silent frames
amine@67 507
amine@2 508 elif self._state == self.POSSIBLE_SILENCE:
amine@67 509
amine@2 510 if frame_is_valid:
amine@2 511 self._data.append(frame)
amine@2 512 self._silence_length = 0
amine@2 513 self._state = self.NOISE
amine@2 514 if len(self._data) >= self.max_length:
amine@177 515 return self._process_end_of_detection(True)
amine@67 516
amine@2 517 else:
amine@2 518 if self._silence_length >= self.max_continuous_silence:
amine@177 519 self._state = self.SILENCE
amine@2 520 if self._silence_length < len(self._data):
amine@67 521 # deliver only if the gathered frames aren't all silent
amine@177 522 return self._process_end_of_detection()
amine@177 523 self._data = []
amine@2 524 self._silence_length = 0
amine@2 525 else:
amine@2 526 self._data.append(frame)
amine@2 527 self._silence_length += 1
amine@2 528 if len(self._data) >= self.max_length:
amine@177 529 return self._process_end_of_detection(True)
amine@67 530 # don't reset _silence_length because we still
amine@2 531 # need to know the total number of silent frames
amine@67 532
amine@2 533 def _post_process(self):
amine@2 534 if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
amine@2 535 if len(self._data) > 0 and len(self._data) > self._silence_length:
amine@177 536 return self._process_end_of_detection()
amine@67 537
amine@2 538 def _process_end_of_detection(self, truncated=False):
amine@67 539
amine@3 540 if not truncated and self._drop_tailing_silence and self._silence_length > 0:
amine@2 541 # happens if max_continuous_silence is reached
amine@2 542 # or max_length is reached at a silent frame
amine@2 543 self._data = self._data[0: - self._silence_length]
amine@67 544
amine@2 545 if (len(self._data) >= self.min_length) or \
amine@67 546 (len(self._data) > 0 and
amine@67 547 not self._strict_min_length and self._contiguous_token):
amine@67 548
amine@177 549 start_frame = self._start_frame
amine@177 550 end_frame = self._start_frame + len(self._data) - 1
amine@177 551 data = self._data
amine@177 552 self._data = []
amine@177 553 token = (data, start_frame, end_frame)
amine@67 554
amine@2 555 if truncated:
amine@2 556 # next token (if any) will start at _current_frame + 1
amine@2 557 self._start_frame = self._current_frame + 1
amine@2 558 # remember that it is contiguous with the just delivered one
amine@2 559 self._contiguous_token = True
amine@2 560 else:
amine@2 561 self._contiguous_token = False
amine@177 562 return token
amine@2 563 else:
amine@67 564 self._contiguous_token = False
amine@67 565
amine@2 566 self._data = []
amine@67 567
amine@2 568 def _append_token(self, data, start, end):
amine@177 569 self._tokens.append((data, start, end))