changeset 5:252d698ae642
Version 0.1.3, bug and typo fixes
author    Amine Sehili <amine.sehili@gmail.com>
date      Wed, 23 Sep 2015 11:26:58 +0200
parents   31c97510b16b
children  61aa25f0eb31
files     CHANGELOG INSTALL auditok/__init__.py auditok/core.py auditok/dataset.py auditok/io.py auditok/util.py quickstart.rst tests/test_AudioDataSourceFactory.py tests/test_StreamTokenizer.py
diffstat  10 files changed, 230 insertions(+), 228 deletions(-)
--- a/CHANGELOG	Tue Sep 22 11:12:11 2015 +0200
+++ b/CHANGELOG	Wed Sep 23 11:26:58 2015 +0200
@@ -1,6 +1,13 @@
 auditok Changelog
 =================
 
+Version 0.1.3
+-------------
+
+- Bug fix: util.AudioEnergyValidator._formats = {1: 'b' , 2: 'h', 4: 'i'} instead of {1: 'B' , 2: 'H', 4: 'I'}
+- Typo fix in code: StreamTokenizer.DROP_TRAILING_SILENCE renamed StreamTokenizer.DROP_TAILING_SILENCE
+- Documentation updated
+
 Version 0.1.2
 -------------
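The first changelog entry concerns the type codes used to decode raw PCM samples. A minimal standalone sketch (not part of the changeset) of why the signed codes matter: 16-bit PCM silence oscillates around 0, and decoding it with the unsigned codes shifts small negative samples up to the top of the unsigned range, inflating the computed energy. The byte values below are illustrative and assume a little-endian platform:

    from array import array

    # Four quiet 16-bit little-endian samples: -1, 1, -2, 2
    raw = b"\xff\xff\x01\x00\xfe\xff\x02\x00"

    signed = array('h', raw)     # correct decoding: [-1, 1, -2, 2]
    unsigned = array('H', raw)   # buggy decoding:   [65535, 1, 65534, 2]

    def energy(samples):
        # Mean power of the frame
        return sum(x * x for x in samples) / float(len(samples))

    print(energy(signed))    # 2.5 -- essentially silence
    print(energy(unsigned))  # ~2.1e9 -- reads as a very loud signal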
--- a/INSTALL	Tue Sep 22 11:12:11 2015 +0200
+++ b/INSTALL	Wed Sep 23 11:26:58 2015 +0200
@@ -1,3 +1,3 @@
 
-pip install auditok
+python setup.py install
 
--- a/auditok/__init__.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/auditok/__init__.py	Wed Sep 23 11:26:58 2015 +0200
@@ -490,6 +490,6 @@
 from util import *
 import dataset
 
-__version__ = "0.1.2"
+__version__ = "0.1.3"
--- a/auditok/core.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/auditok/core.py	Wed Sep 23 11:26:58 2015 +0200
@@ -12,148 +12,7 @@
 
 class StreamTokenizer():
 
-    """
-    Class for stream tokenizers. It implements a 4-state automata scheme
-    for interesting sub-sequences extraction.
-
-    **Parameters:**
-
-    `validator` :
-        instance of `DataValidator` that implements `is_valid` method.
-
-    `min_length` : *(int)*
-        Minimum number of frames of a valid token. This includes all \
-        tolerated non valid frames within the token.
-
-    `max_length` : *(int)*
-        Maximum number of frames of a valid token. This includes all \
-        tolerated non valid frames within the token.
-
-    `max_continuous_silence` : *(int)*
-        Maximum number of consecutive non-valid frames within a token.
-        Note that, within a valid token, there may be many tolerated \
-        *silent* regions that contain each a number of non valid frames up to \
-        `max_continuous_silence`
-
-    `init_min` : *(int, default=0)*
-        Minimum number of consecutive valid frames that must be **initially** \
-        gathered before any sequence of non valid frames can be tolerated. This
-        option is not always needed, it can be used to drop non-valid tokens as
-        early as possible. **Default = 0** means that the option is by default
-        ineffective.
-
-    `init_max_silence` : *(int, default=0)*
-        Maximum number of tolerated consecutive non-valid frames if the \
-        number already gathered valid frames has not yet reached 'init_min'.
-        This arguement is normally used if `init_min` is used. **Default = 0**,
-        by default this argument is not taken into consideration.
-
-    `mode` : *(int, default=0)*
-        `mode` can be:
-
-    1. `StreamTokenizer.STRICT_MIN_LENGTH`:
-    if token *i* is delivered because `max_length`
-    is reatched, and token *i+1* is immedialtely adjacent to
-    token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
-    at frame *k+1*) then accept toekn *i+1* only of it has a size of at
-    least `min_length`. The default behavior is to accept toekn *i+1*
-    event if it is shorter than `min_length` (given that the above conditions
-    are fullfilled of course).
-
-    ** Example **
-
-    In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
-    accepted although it is shorter than `min_length` (3), because it immediatly
-    follows the latest delivered token:
-
-    .. code:: python
-
-        from auditok import StreamTokenizer, StringDataSource, DataValidator
-
-        class UpperCaseChecker(DataValidator):
-            def is_valid(self, frame):
-                return frame.isupper()
-
-        dsource = StringDataSource("aaaAAAABBbbb")
-        tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-                                    min_length=3, max_length=4, max_continuous_silence=0)
-
-        tokenizer.tokenize(dsource)
-
-
-    output:
-
-    .. code:: python
-
-        [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
-
-    The following toknizer will however reject the 'BB' token
-
-        dsource = StringDataSource("aaaAAAABBbbb")
-        tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-                                    min_length=3, max_length=4, max_continuous_silence=0,
-                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)
-        tokenizer.tokenize(dsource)
-
-    output:
-
-    .. code:: python
-
-        [(['A', 'A', 'A', 'A'], 3, 6)]
-
-
-    2. `StreamTokenizer.DROP_TAILING_SILENCE`: drop all tailing non-valid frames
-    from a token to be delivered if and only if it is not **truncated**.
-    This can be a bit tricky. A token is actually delivered if:
-
-    a. `max_continuous_silence` is reached
-
-    OR
-
-    b. Its length reaches `max_length`. This is called a **truncated** token
-
-    In the current implementation, a `StreamTokenizer`'s decision is only based on seen
-    data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
-    frame (`max_length` is reached but `max_continuous_silence` not yet) any tailing
-    silence will be kept because it can potentilly be part of valid token (if `max_length`
-    was bigger). But if `max_continuous_silence` is reched before `max_length`, the delivered
-    token will not be considered as truncted but a result of *normal* end of detection
-    (i.e. no more valid data). In that case the tailing silence can be removed if you use
-    the `StreamTokenizer.DROP_TAILING_SILENCE` mode.
-
-    Take the following example:
-
-    .. code:: python
-
-        tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
-                                    max_length=6, max_continuous_silence=3,
-                                    mode=StreamTokenizer.DROP_TAILING_SILENCE)
-
-        dsource = StringDataSource("aaaAAAaaaBBbbbb")
-        tokenizer.tokenize(dsource)
-
-    output:
-
-    .. code:: python
-
-        [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
-
-    The first troken is delivered with its tailing silence because it is truncated
-    while the second one has its tailing frames removed.
-
-    Without `StreamTokenizer.DROP_TAILING_SILENCE` the output whould be:
-
-    .. code:: python
-
-        [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
-
-
-
-    3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE`:
-    use both options. That means: first remove tailing silence, then ckeck if the
-    token still has at least a length of `min_length`.
-
-    """
 
     SILENCE = 0
     POSSIBLE_SILENCE = 1
@@ -167,6 +26,144 @@
                  min_length, max_length, max_continuous_silence,
                  init_min=0, init_max_silence=0, mode=0):
 
+        """
+        Class for stream tokenizers. It implements a 4-state automaton scheme
+        for interesting sub-sequences extraction.
+
+        **Parameters:**
+
+        `validator` :
+            instance of `DataValidator` that implements `is_valid` method.
+
+        `min_length` : *(int)*
+            Minimum number of frames of a valid token. This includes all \
+            tolerated non valid frames within the token.
+
+        `max_length` : *(int)*
+            Maximum number of frames of a valid token. This includes all \
+            tolerated non valid frames within the token.
+
+        `max_continuous_silence` : *(int)*
+            Maximum number of consecutive non-valid frames within a token.
+            Note that, within a valid token, there may be many tolerated \
+            *silent* regions that contain each a number of non valid frames up to \
+            `max_continuous_silence`
+
+        `init_min` : *(int, default=0)*
+            Minimum number of consecutive valid frames that must be **initially** \
+            gathered before any sequence of non valid frames can be tolerated. This
+            option is not always needed, it can be used to drop non-valid tokens as
+            early as possible. **Default = 0** means that the option is by default
+            ineffective.
+
+        `init_max_silence` : *(int, default=0)*
+            Maximum number of tolerated consecutive non-valid frames if the \
+            number of already gathered valid frames has not yet reached 'init_min'.
+            This argument is normally used if `init_min` is used. **Default = 0**,
+            by default this argument is not taken into consideration.
+
+        `mode` : *(int, default=0)*
+            `mode` can be:
+
+        1. `StreamTokenizer.STRICT_MIN_LENGTH`:
+        if token *i* is delivered because `max_length`
+        is reached, and token *i+1* is immediately adjacent to
+        token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
+        at frame *k+1*) then accept token *i+1* only if it has a size of at
+        least `min_length`. The default behavior is to accept token *i+1*
+        even if it is shorter than `min_length` (given that the above conditions
+        are fulfilled of course).
+
+        **Example:**
+
+        In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
+        accepted although it is shorter than `min_length` (3), because it immediately
+        follows the latest delivered token:
+
+            from auditok import StreamTokenizer, StringDataSource, DataValidator
+
+            class UpperCaseChecker(DataValidator):
+                def is_valid(self, frame):
+                    return frame.isupper()
+
+            dsource = StringDataSource("aaaAAAABBbbb")
+            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+                                        min_length=3, max_length=4, max_continuous_silence=0)
+
+            tokenizer.tokenize(dsource)
+
+        output:
+
+            #!python
+            [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
+
+        The following tokenizer will however reject the 'BB' token:
+
+            #!python
+            dsource = StringDataSource("aaaAAAABBbbb")
+            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
+                                        min_length=3, max_length=4, max_continuous_silence=0,
+                                        mode=StreamTokenizer.STRICT_MIN_LENGTH)
+            tokenizer.tokenize(dsource)
+
+        output:
+
+            #!python
+            [(['A', 'A', 'A', 'A'], 3, 6)]
+
+        2. `StreamTokenizer.DROP_TAILING_SILENCE`: drop all tailing non-valid frames
+        from a token to be delivered if and only if it is not **truncated**.
+        This can be a bit tricky. A token is actually delivered if:
+
+        - a. `max_continuous_silence` is reached
+
+        OR
+
+        - b. Its length reaches `max_length`. This is called a **truncated** token
+
+        In the current implementation, a `StreamTokenizer`'s decision is only based on seen
+        data and on incoming data. Thus, if a token is truncated at a non-valid but tolerated
+        frame (`max_length` is reached but `max_continuous_silence` not yet) any tailing
+        silence will be kept because it can potentially be part of a valid token (if `max_length`
+        was bigger). But if `max_continuous_silence` is reached before `max_length`, the delivered
+        token will not be considered as truncated but a result of *normal* end of detection
+        (i.e. no more valid data). In that case the tailing silence can be removed if you use
+        the `StreamTokenizer.DROP_TAILING_SILENCE` mode.
+
+        Take the following example:
+
+            #!python
+            tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
+                                        max_length=6, max_continuous_silence=3,
+                                        mode=StreamTokenizer.DROP_TAILING_SILENCE)
+
+            dsource = StringDataSource("aaaAAAaaaBBbbbb")
+            tokenizer.tokenize(dsource)
+
+        output:
+
+            #!python
+            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
+
+        The first token is delivered with its tailing silence because it is truncated
+        while the second one has its tailing frames removed.
+
+        Without `StreamTokenizer.DROP_TAILING_SILENCE` the output would be:
+
+            #!python
+            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]
+
+        3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE`:
+        use both options. That means: first remove tailing silence, then check if the
+        token still has at least a length of `min_length`.
+        """
@@ -183,9 +180,11 @@
         if max_continuous_silence >= max_length:
             raise ValueError("'max_continuous_silence' must be < \
 'max_length' (value={0})".format(max_continuous_silence))
+
+        if init_min >= max_length:
+            raise ValueError("'init_min' must be < \
+'max_length' (value={0})".format(init_min))
 
-        # init_min must be shorter than max_length
-
         self.validator = validator
         self.min_length = min_length
         self.max_length = max_length
@@ -215,11 +214,15 @@
         `mode` : *(int)*
             New mode, must be one of:
 
+
-        a. `StreamTokenizer.STRICT_MIN_LENGTH`
-        b. `StreamTokenizer.DROP_TAILING_SILENCE`
-        c. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE`
-        d. 0
+        - a. `StreamTokenizer.STRICT_MIN_LENGTH`
+
+        - b. `StreamTokenizer.DROP_TAILING_SILENCE`
+
+        - c. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE`
+
+        - d. `0`
 
         See `StreamTokenizer.__init__` for more information about the mode.
         """
@@ -239,8 +242,7 @@
         Return the current mode. To check whether a specific mode is activated use
         the bitwise 'and' operator `&`. Example:
 
-        .. code:: python
-
+            #!python
             if mode & self.STRICT_MIN_LENGTH != 0:
                ...
         """
@@ -275,16 +277,14 @@
 
         **Returns:**
 
-        A list of tokens if `callback` is None. Each token is tuple with the following elemnts:
+        A list of tokens if `callback` is None. Each token is a tuple with the following elements:
 
-        .. code:: python
-
+            #!python
             (data, start, end)
 
         where `data` is a list of read frames, `start`: index of the first frame in the
         original data and `end` : index of the last frame.
-
         """
 
         self._reinitialize()
--- a/auditok/dataset.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/auditok/dataset.py	Wed Sep 23 11:26:58 2015 +0200
@@ -13,8 +13,10 @@
 
 one_to_six_arabic_16000_mono_bc_noise = "{cd}{sep}data{sep}1to6arabic_\
 16000_mono_bc_noise.wav".format(cd=_current_dir, sep=os.path.sep)
+"""A wave file that contains a pronunciation of Arabic numbers from 1 to 6"""
 
 was_der_mensch_saet_mono_44100_lead_tail_silence = "{cd}{sep}data{sep}was_\
 der_mensch_saet_das_wird_er_vielfach_ernten_44100Hz_mono_lead_tail_\
 silence.wav".format(cd=_current_dir, sep=os.path.sep)
+""" A wave file that contains a sentence between long leading and tailing periods of silence"""
\ No newline at end of file
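A quick way to open one of the bundled files documented above. This is a hypothetical snippet, not part of the changeset; it assumes the getter-style accessors that `AudioSource` exposes elsewhere in this changeset:

    from auditok import dataset
    from auditok.io import WaveAudioSource

    # Open the noisy 1-to-6 recording referenced by the new docstring
    asource = WaveAudioSource(filename=dataset.one_to_six_arabic_16000_mono_bc_noise)
    asource.open()
    print(asource.get_sampling_rate())   # expected 16000, per the file name
    asource.close()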
--- a/auditok/io.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/auditok/io.py	Wed Sep 23 11:26:58 2015 +0200
@@ -33,8 +33,8 @@
     """
 
-    Parameters
-    ----------
+    **Parameters:**
+
     `sampling_rate` *(int)* :
         Number of samples per second of audio stream. Default = 16000.
 
@@ -76,13 +76,13 @@
         """
         Read and return `size` audio samples at most.
 
-        Parameters
-        ----------
+        **Parameters:**
+
         `size` : *(int)* :
             the number of samples to read.
 
-        Returns
-        --------
+        **Returns:**
+
         Audio data as a string of length 'N' * 'smaple_width' * 'channels', where 'N' is:
 
         `size` if `size` < 'left_samples'
 
@@ -133,8 +133,8 @@
     def set_position(self, position):
         """ Move to an absolute position
 
-        Parameters
-        ----------
+        **Parameters:**
+
         `position` : *(int)*
             number of samples to skip from the start of the stream
         """
@@ -143,8 +143,8 @@
     def set_time_position(self, time_position):
         """ Move to an absolute position expressed in seconds
 
-        Parameters
-        ----------
+        **Parameters:**
+
         `time_position` : *(float)*
             seconds to skip from the start of the stream
         """
@@ -209,8 +209,8 @@
     def set_data(self, data_buffer):
         """ Set new data for this audio stream.
 
-        Parameters
-        ----------
+        **Parameters:**
+
         `data_buffer` : a string buffer with a length multiple of (sample_width * channels)
         """
@@ -223,8 +223,8 @@
     def append_data(self, data_buffer):
         """ Append data to this audio stream
 
-        Parameters
-        ----------
+        **Parameters:**
+
         `data_buffer` : a string buffer with a length multiple of (sample_width * channels)
 
@@ -275,8 +275,8 @@
     def __init__(self, filename):
         """
-        Parameters
-        ----------
+        **Parameters:**
+
         `filename` :
             path to a valid wave file
 
@@ -413,13 +413,13 @@
     Create an `AudioSource` object using the audio file specified by `filename`.
     The appropriate `AudioSource` class is guessed from file's extension.
 
-    Parameters
-    ----------
+    **Parameters:**
+
     `filename` :
         path to an audio file
 
-    Returns
-    -------
+    **Returns:**
+
     an `AudioSource` object that reads data from the given file.
 
@@ -434,13 +434,13 @@
     """
     Return a `PyAudioPlayer` that can play data from `audio_source`.
 
-    Parameters
-    ----------
+    **Parameters:**
+
     `audio_source` : an `AudioSource` object.
 
-    Returns
-    -------
+    **Returns:**
+
     `PyAudioPlayer` that has the same sampling rate, sample width and number of channels
     as `audio_source`.
     """
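Taken together, the reformatted docstrings describe a small file and playback API. A usage sketch based only on the signatures shown in this diff: "some_recording.wav" is a hypothetical path, and the player's `play` method is an assumption drawn from `PyAudioPlayer`'s description rather than from the diff itself:

    from auditok.io import from_file, player_for

    asource = from_file("some_recording.wav")   # AudioSource guessed from extension
    asource.open()
    asource.set_time_position(1.5)              # skip the first 1.5 seconds
    data = asource.read(8000)                   # read up to 8000 samples

    player = player_for(asource)                # same rate, width and channels
    player.play(data)                           # assumed PyAudioPlayer method
    asource.close()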
--- a/auditok/util.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/auditok/util.py	Wed Sep 23 11:26:58 2015 +0200
@@ -517,18 +517,15 @@
 
     @staticmethod
     def _convert(signal, sample_width):
-        return numpy.array(numpy.frombuffer(signal,
-                           dtype=AudioEnergyValidator._formats[sample_width]),
-                           dtype=numpy.float64)
-
+        return numpy.array(numpy.frombuffer(signal, dtype=AudioEnergyValidator._formats[sample_width]), dtype=numpy.float64)
 
     @staticmethod
-    def _siganl_energy(signal):
-        return float(numpy.dot(signal, signal)) / len(signal)
+    def _signal_energy(signal):
+        return float(numpy.dot(signal, signal)) / len(signal)
 
     @staticmethod
     def _signal_log_energy(signal):
-        energy = AudioEnergyValidator._siganl_energy(signal)
+        energy = AudioEnergyValidator._signal_energy(signal)
         if energy <= 0:
             return -200
         return 10. * numpy.log10(energy)
@@ -536,22 +533,22 @@
 
     else:
 
-        _formats = {1: 'B' , 2: 'H', 4: 'I'}
+        _formats = {1: 'b' , 2: 'h', 4: 'i'}
 
         @staticmethod
         def _convert(signal, sample_width):
-            array("d", array(AudioEnergyValidator._formats[sample_width], signal))
+            return array("d", array(AudioEnergyValidator._formats[sample_width], signal))
 
         @staticmethod
-        def _siganl_energy(signal):
-            energy = 0.
-            for a in signal:
-                energy += a * a
-            return energy / len(signal)
+        def _signal_energy(signal):
+            energy = 0.
+            for a in signal:
+                energy += a * a
+            return energy / len(signal)
 
         @staticmethod
         def _signal_log_energy(signal):
-            energy = AudioEnergyValidator._siganl_energy(signal)
+            energy = AudioEnergyValidator._signal_energy(signal)
             if energy <= 0:
                 return -200
             return 10. * math.log10(energy)
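Both branches now compute the same quantity: the mean power of a frame, dot(s, s) / N, converted to decibels. A self-contained numpy check of that formula, including the -200 dB floor the diff returns for all-zero frames (the sample values are illustrative):

    import numpy

    def signal_log_energy(signal):
        # Mean power of the frame, then converted to dB: 10 * log10(energy)
        energy = float(numpy.dot(signal, signal)) / len(signal)
        if energy <= 0:
            return -200  # floor used in the diff for silent (all-zero) frames
        return 10. * numpy.log10(energy)

    frame = numpy.array([100., -120., 80., 95.], dtype=numpy.float64)
    print(signal_log_energy(frame))            # ~40 dB
    print(signal_log_energy(numpy.zeros(4)))   # -200 floor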
--- a/quickstart.rst	Tue Sep 22 11:12:11 2015 +0200
+++ b/quickstart.rst	Wed Sep 23 11:26:58 2015 +0200
@@ -54,7 +54,7 @@
 - Define a fixed-length block_size (i.e. analysis window)
 - Allow overlap between two consecutive analysis windows (hop_size < block_size). This can be very important if your validator use the **spectral** information of audio data instead of raw audio samples.
 - Limit the amount (i.e. duration) of read data (very useful when reading data from the microphone)
-- Record and rewind data (also useful if you read data from the microphone and you want to process it many times offline and/or save it)
+- Record and rewind data (also useful if you read data from the microphone and you want to process it many times off-line and/or save it)
 
 Last but not least, the current version has only one audio window validator based on
@@ -84,7 +84,7 @@
 
 We want to extract sub-sequences of characters that have:
 
-- A minimu length of 1 (`min_length` = 1)
+- A minimum length of 1 (`min_length` = 1)
 - A maximum length of 9999 (`max_length` = 9999)
 - Zero consecutive lower case characters within them (`max_continuous_silence` = 0)
 
@@ -144,7 +144,7 @@
 
 Notice the tailing lower case letters "dd" and "ee"
 at the end of the two tokens. The default behavior of `StreamTokenizer` is to keep the *tailing
-silence* if it does'nt exceed `max_continuous_silence`. This can be changed
+silence* if it doesn't exceed `max_continuous_silence`. This can be changed
 using the `DROP_TAILING_SILENCE` mode (see next example).
 
 Remove tailing silence
@@ -403,7 +403,7 @@
 is at most 1 silent window.
 
 Still with this configuration we can get the tokenizer detect that noise as a valid event
-(if it actually contains 3 consecutive noisy frames). To circummvent this we use an enough
+(if it actually contains 3 consecutive noisy frames). To circumvent this we use a
 large analysis window (here of 100 ms) to ensure that the brief noise be surrounded by a much
 longer silence and hence the energy of the overall analysis window will be below 50.
 
@@ -459,12 +459,12 @@
 Online audio signal processing
 ------------------------------
 
-In the next example, audio data is directely acquired from the built-in microphone.
+In the next example, audio data is directly acquired from the built-in microphone.
 The `tokenize` method is passed a callback function so that audio activities
 are delivered as soon as they are detected. Each detected activity is played back using
 the build-in audio output device.
 
-As mentionned before , Signal energy is strongly related to many factors such
+As mentioned before, signal energy is strongly related to many factors such as
 microphone sensitivity, background noise (including noise inherent to the hardware),
 distance and your operating system sound settings. Try a lower `energy_threshold`
 if your noise does not seem to be detected and a higher threshold if you notice
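The quickstart changes above refer to microphone capture driven by an energy validator. A hedged sketch of that setup, following the quickstart's own API; the parameter values are illustrative, not prescribed by the diff, and the frame durations in the comments assume the factory's default ~10 ms analysis window:

    from auditok import ADSFactory, AudioEnergyValidator, StreamTokenizer

    # Record from the built-in microphone for at most 10 seconds
    asource = ADSFactory.ads(record=True, max_time=10)

    validator = AudioEnergyValidator(sample_width=asource.get_sample_width(),
                                     energy_threshold=50)

    # With ~10 ms frames: events of 200 ms to 2.5 s, tolerating 300 ms of silence
    tokenizer = StreamTokenizer(validator=validator, min_length=20,
                                max_length=250, max_continuous_silence=30)

    def echo(data, start, end):
        # Called as soon as each activity is detected
        print("Acoustic activity at frames {0}-{1}".format(start, end))

    asource.open()
    tokenizer.tokenize(asource, callback=echo)
    asource.close()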
--- a/tests/test_AudioDataSourceFactory.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/tests/test_AudioDataSourceFactory.py	Wed Sep 23 11:26:58 2015 +0200
@@ -7,7 +7,6 @@
 import unittest
 from auditok import dataset, ADSFactory, BufferAudioSource, WaveAudioSource
 import wave
-from Crypto.Cipher.AES import block_size
 
 class TestADSFactoryFileAudioSource(unittest.TestCase):
--- a/tests/test_StreamTokenizer.py	Tue Sep 22 11:12:11 2015 +0200
+++ b/tests/test_StreamTokenizer.py	Wed Sep 23 11:26:58 2015 +0200
@@ -62,7 +62,7 @@
 
-        # A valid token is considered iff the tokenizer encounters
+        # A valid token is considered as such iff the tokenizer encounters
         # at least valid frames (init_min = 3) between witch there
         # are at most 0 consecutive non valid frames (init_max_silence = 0)
         # The tokenizer will only rely on the other parameters
@@ -74,13 +74,10 @@
                                     init_max_silence = 0, mode=0)
 
-        #data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
-        #                                                  ^       ^     ^   ^
-        #                                                  18      26    32  36
         data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
-        #                                                      ^           ^  ^   ^
-        #                                                      18          30 33  37
+        #                                                 ^           ^  ^   ^
+        #                                                 18          30 33  37
 
         tokens = tokenizer.tokenize(data_source)
@@ -116,8 +113,8 @@
         data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
-        #                                     ^          ^  ^           ^   ^   ^
-        #                                     5          16 19          31  35  39
+        #                                    ^          ^  ^           ^   ^   ^
+        #                                    5          16 19          31  35  39
 
         tokens = tokenizer.tokenize(data_source)
         self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
@@ -166,8 +163,8 @@
         data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
-        #                                 ^            ^   ^         ^
-        #                                 1            14  18        28
+        #                                ^            ^   ^         ^
+        #                                1            14  18        28
 
         tokens = tokenizer.tokenize(data_source)
@@ -215,8 +212,8 @@
         data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
-        #                                 ^              ^             ^            ^
-        #                                 1              16            30           45
+        #                                ^              ^             ^            ^
+        #                                1              16            30           45
 
         tokens = tokenizer.tokenize(data_source)
@@ -251,8 +248,8 @@
         data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
-        #                                                  ^   ^^   ^    ^   ^     ^   ^
-        #                                                  18  2223 27   32  36    42  46
+        #                                                 ^   ^^   ^    ^   ^     ^   ^
+        #                                                 18  2223 27   32  36    42  46
 
         tokens = tokenizer.tokenize(data_source)
@@ -309,8 +306,8 @@
                                     init_max_silence = 3, mode=0)
 
         data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
-        #                                   ^   ^ ^    ^ ^        ^
-        #                                   3   7 9    14 17      25
+        #                                  ^   ^ ^    ^  ^        ^
+        #                                  3   7 9    14 17       25
 
         tokens = tokenizer.tokenize(data_source)
@@ -354,9 +351,9 @@
                                     init_max_silence = 3, mode=0)
 
         data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
-        #                                   ^        ^^ ^ ^        ^
-        #                                   3        12131517      26
-        #                                            (12 13 15 17)
+        #                                  ^        ^^ ^ ^         ^
+        #                                  3        12131517       26
+        #                                           (12 13 15 17)
 
         tokens = tokenizer.tokenize(data_source)
@@ -403,8 +400,8 @@
                                     init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH)
 
         data_source = StringDataSource("aaAAAAAAAAAAAA")
-        #                                  ^      ^
-        #                                  2      9
+        #                                 ^      ^
+        #                                 2      9
 
         tokens = tokenizer.tokenize(data_source)
@@ -428,8 +425,8 @@
                                     init_max_silence = 3, mode=StreamTokenizer.DROP_TAILING_SILENCE)
 
         data_source = StringDataSource("aaAAAAAaaaaa")
-        #                                  ^   ^
-        #                                  2   6
+        #                                 ^   ^
+        #                                 2   6
 
         tokens = tokenizer.tokenize(data_source)
@@ -453,8 +450,8 @@
                                     init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE)
 
         data_source = StringDataSource("aaAAAAAAAAAAAAaa")
-        #                                  ^     ^
-        #                                  2     8
+        #                                 ^     ^
+        #                                 2     8
 
         tokens = tokenizer.tokenize(data_source)
@@ -489,8 +486,8 @@
                                     init_max_silence = 3, mode=0)
 
         data_source = StringDataSource("aaAAAAAAAAAAAAa")
-        #                                  ^      ^^   ^
-        #                                  2      910  14
+        #                                 ^      ^^   ^
+        #                                 2      910  14
 
         tokenizer.tokenize(data_source, callback=callback)
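The realigned indices in the first hunk above can be reproduced directly. A hedged re-run of that test case's configuration: `init_min=3` and `init_max_silence=0` come from the hunk's comments, while the `min_length`, `max_length` and `max_continuous_silence` values below are illustrative assumptions chosen so that the output matches the marked 18-30 and 33-37 spans:

    from auditok import StreamTokenizer, StringDataSource, DataValidator

    class UpperCaseChecker(DataValidator):
        def is_valid(self, frame):
            return frame.isupper()

    tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                min_length=5, max_length=20,
                                max_continuous_silence=4,
                                init_min=3, init_max_silence=0)

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
    for data, start, end in tokenizer.tokenize(data_source):
        print("".join(data), start, end)
    # Expected spans: 18-30 and 33-37, matching the realigned comments;
    # the isolated 'A's before frame 18 never satisfy init_min=3 with
    # init_max_silence=0, so no token starts there.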