# HG changeset patch # User Amine Sehili # Date 1570560493 -3600 # Node ID 7259b1eb9329457167a17e972c5305567b6bdfeb # Parent 5af0974b344619a35e8d3f63fcea17522c0b981e Refactor StreamTokenizer - Remove unused code - accept a callable validator - update doc diff -r 5af0974b3446 -r 7259b1eb9329 auditok/core.py --- a/auditok/core.py Mon Oct 07 20:58:23 2019 +0100 +++ b/auditok/core.py Tue Oct 08 19:48:13 2019 +0100 @@ -129,9 +129,7 @@ params["channels"] = input.ch input = bytes(input) try: - source = AudioReader( - input, block_dur=analysis_window, **params - ) + source = AudioReader(input, block_dur=analysis_window, **params) except TooSamllBlockDuration as exc: err_msg = "Too small 'analysis_windows' ({0}) for sampling rate " err_msg += "({1}). Analysis windows should at least be 1/{1} to " @@ -763,7 +761,8 @@ :Parameters: `validator` : - instance of `DataValidator` that implements `is_valid` method. + A callable or an instance of DataValidator that implements + the `is_valid` method. `min_length` : *(int)* Minimum number of frames of a valid token. This includes all \ @@ -795,7 +794,11 @@ `mode` : *(int, default=0)* `mode` can be: - 1. `StreamTokenizer.STRICT_MIN_LENGTH`: + 1. `StreamTokenizer.NORMAL`: + Do not drop trailing silence, and accept a token shorter than + `min_length` if it is the continuation of the latest delivered token. + + 2. `StreamTokenizer.STRICT_MIN_LENGTH`: if token *i* is delivered because `max_length` is reached, and token *i+1* is immediately adjacent to token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts @@ -852,7 +855,7 @@ [(['A', 'A', 'A', 'A'], 3, 6)] - 2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing non-valid frames + 3. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames from a token to be delivered if and only if it is not **truncated**. This can be a bit tricky. A token is actually delivered if: @@ -898,7 +901,7 @@ [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)] - 3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: + 4. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: use both options. That means: first remove trailing silence, then check if the token still has at least a length of `min_length`.
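A quick sketch of how these flags combine once this changeset is applied; the parameter values and input mirror the combined-mode test added further down in this patch, and the lambda relies on the callable-validator support introduced here (a hedged illustration, not part of the changeset):

.. code:: python

    from auditok import StreamTokenizer, StringDataSource

    # STRICT_MIN_LENGTH and DROP_TRAILING_SILENCE are bit flags and may be
    # OR-ed together; NORMAL (0) is the default behaviour of point 1.
    tokenizer = StreamTokenizer(
        lambda frame: frame == "A",  # callable validator, new in this patch
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH
        | StreamTokenizer.DROP_TRAILING_SILENCE,
    )
    tokens = tokenizer.tokenize(StringDataSource("aaAAAAAAAAAAAAaa"))
    for data, start, end in tokens:
        # The corresponding test expects a single token: "AAAAAAAA", 2, 9.
        print("".join(data), start, end)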
""" @@ -907,11 +910,9 @@ POSSIBLE_SILENCE = 1 POSSIBLE_NOISE = 2 NOISE = 3 - + NORMAL = 0 STRICT_MIN_LENGTH = 2 DROP_TRAILING_SILENCE = 4 - # alias - DROP_TAILING_SILENCE = 4 def __init__( self, @@ -923,10 +924,13 @@ init_max_silence=0, mode=0, ): - - if not isinstance(validator, DataValidator): + if callable(validator): + self._is_valid = validator + elif isinstance(validator, DataValidator): + self._is_valid = validator.is_valid + else: raise TypeError( - "'validator' must be an instance of 'DataValidator'" + "'validator' must be a callable or an instance of DataValidator" ) if max_length <= 0: @@ -961,67 +965,30 @@ self.max_continuous_silence = max_continuous_silence self.init_min = init_min self.init_max_silent = init_max_silence - - self._mode = None - self.set_mode(mode) - self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 - self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 - + self._set_mode(mode) self._deliver = None self._tokens = None self._state = None self._data = None self._contiguous_token = False - self._init_count = 0 self._silence_length = 0 self._start_frame = 0 self._current_frame = 0 - def set_mode(self, mode): - # TODO: use properties and make these deprecated - """ - :Parameters: - - `mode` : *(int)* - New mode, must be one of: - - - - `StreamTokenizer.STRICT_MIN_LENGTH` - - - `StreamTokenizer.DROP_TRAILING_SILENCE` - - - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE` - - - `0` TODO: this mode should have a name - - See `StreamTokenizer.__init__` for more information about the mode. - """ - + def _set_mode(self, mode): + strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH + strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE if not mode in [ - self.STRICT_MIN_LENGTH, - self.DROP_TRAILING_SILENCE, - self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, - 0, + StreamTokenizer.NORMAL, + StreamTokenizer.STRICT_MIN_LENGTH, + StreamTokenizer.DROP_TRAILING_SILENCE, + strict_min_and_drop_trailing, ]: - raise ValueError("Wrong value for mode") - self._mode = mode self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0 - self._drop_tailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 - - def get_mode(self): - """ - Return the current mode. To check whether a specific mode is activated use - the bitwise 'and' operator `&`. Example: - - .. code:: python - - if mode & self.STRICT_MIN_LENGTH != 0: - do_something() - """ - return self._mode + self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0 def _reinitialize(self): self._contiguous_token = False @@ -1056,7 +1023,6 @@ where `data` is a list of read frames, `start`: index of the first frame in the original data and `end` : index of the last frame. 
- """ token_gen = self._iter_tokens(data_source) if callback: @@ -1083,7 +1049,7 @@ def _process(self, frame): - frame_is_valid = self.validator.is_valid(frame) + frame_is_valid = self._is_valid(frame) if self._state == self.SILENCE: @@ -1182,7 +1148,7 @@ if ( not truncated - and self._drop_tailing_silence + and self._drop_trailing_silence and self._silence_length > 0 ): # happens if max_continuous_silence is reached diff -r 5af0974b3446 -r 7259b1eb9329 tests/test_StreamTokenizer.py --- a/tests/test_StreamTokenizer.py Mon Oct 07 20:58:23 2019 +0100 +++ b/tests/test_StreamTokenizer.py Tue Oct 08 19:48:13 2019 +0100 @@ -1,500 +1,1021 @@ -''' +""" @author: Amine Sehili September 2015 -''' +""" import unittest from auditok import StreamTokenizer, StringDataSource, DataValidator class AValidator(DataValidator): - def is_valid(self, frame): return frame == "A" class TestStreamTokenizerInitParams(unittest.TestCase): - - def setUp(self): self.A_validator = AValidator() - + # Completely deactivate init_min and init_max_silence # The tokenizer will only rely on the other parameters # Note that if init_min = 0, the value of init_max_silence # will have no effect def test_init_min_0_init_max_silence_0(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, - max_continuous_silence=4, init_min = 0, - init_max_silence = 0, mode=0) - - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=20, + max_continuous_silence=4, + init_min=0, + init_max_silence=0, + mode=0, + ) + data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") # ^ ^ ^ ^ # 2 16 20 27 tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 2, + msg="wrong number of tokens, expected: 2, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2 = tokens[0], tokens[1] - + # tok1[0]: data # tok1[1]: start frame (included) # tok1[2]: end frame (included) - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AaaaAaAaaAaAaaaa", - msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format(data)) - self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) - self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AaaaAaAaaAaAaaaa", + msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format( + data + ), + ) + self.assertEqual( + start, + 1, + msg="wrong start frame for token 1, expected: 1, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 16, + msg="wrong end frame for token 1, expected: 16, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAAAAA", - msg="wrong data for token 1, expected: 'AAAAAAAA', found: {0} ".format(data)) - self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0} ".format(start)) - self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0} ".format(end)) - - - + self.assertEqual( + data, + "AAAAAAAA", + msg="wrong data for token 1, expected: 'AAAAAAAA', found: {0} ".format( + data + ), + ) + self.assertEqual( + start, + 20, + msg="wrong start frame for token 2, expected: 20, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 
27, + msg="wrong end frame for token 2, expected: 27, found: {0} ".format( + end + ), + ) + # A valid token is considered as such if and only if the tokenizer # encounters at least 3 valid frames (init_min = 3) between which there # are at most 0 consecutive non-valid frames (init_max_silence = 0). # In other words, a valid token must start with 3 valid frames def test_init_min_3_init_max_silence_0(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, - max_continuous_silence=4, init_min = 3, - init_max_silence = 0, mode=0) - - - - data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA") + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=20, + max_continuous_silence=4, + init_min=3, + init_max_silence=0, + mode=0, + ) + + data_source = StringDataSource( + "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA" + ) # ^ ^ ^ ^ # 18 30 33 37 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 2, + msg="wrong number of tokens, expected: 2, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2 = tokens[0], tokens[1] - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAAAAAAaaaa", - msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data)) - self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start)) - self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AAAAAAAAAaaaa", + msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 18, + msg="wrong start frame for token 1, expected: 18, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 30, + msg="wrong end frame for token 1, expected: 30, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0} ".format(start)) - self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAA", + msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 33, + msg="wrong start frame for token 2, expected: 33, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 37, + msg="wrong end frame for token 2, expected: 37, found: {0} ".format( + end + ), + ) + # A valid token is considered as such if and only if the tokenizer # encounters at least 3 valid frames (init_min = 3) between which there # are at most 2 consecutive non-valid frames (init_max_silence = 2) def test_init_min_3_init_max_silence_2(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, - max_continuous_silence=4, init_min = 3, - init_max_silence = 2, mode=0) - - - data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA") + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=20, + max_continuous_silence=4, + init_min=3, + init_max_silence=2, + mode=0, + ) + + data_source = StringDataSource( + 
"aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA" + ) # ^ ^ ^ ^ ^ ^ # 5 16 19 31 35 39 tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 3, + msg="wrong number of tokens, expected: 3, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AaAaaAaAaaaa", - msg="wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' ".format(data)) - self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0} ".format(start)) - self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AaAaaAaAaaaa", + msg="wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 5, + msg="wrong start frame for token 1, expected: 5, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 16, + msg="wrong end frame for token 1, expected: 16, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAAAAAAaaaa", - msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data)) - self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0} ".format(start)) - self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0} ".format(end)) - - - data = ''.join(tok3[0]) + self.assertEqual( + data, + "AAAAAAAAAaaaa", + msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 19, + msg="wrong start frame for token 2, expected: 19, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 31, + msg="wrong end frame for token 2, expected: 31, found: {0} ".format( + end + ), + ) + + data = "".join(tok3[0]) start = tok3[1] end = tok3[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 35, msg="wrong start frame for token 2, expected: 35, found: {0} ".format(start)) - self.assertEqual(end, 39, msg="wrong end frame for token 2, expected: 39, found: {0} ".format(end)) - - - + self.assertEqual( + data, + "AAAAA", + msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 35, + msg="wrong start frame for token 2, expected: 35, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 39, + msg="wrong end frame for token 2, expected: 39, found: {0} ".format( + end + ), + ) + + class TestStreamTokenizerMinMaxLength(unittest.TestCase): - def setUp(self): self.A_validator = AValidator() - - + def test_min_length_6_init_max_length_20(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 6, max_length=20, - max_continuous_silence=2, init_min = 3, - init_max_silence = 3, mode=0) - - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=6, + max_length=20, + max_continuous_silence=2, + init_min=3, + init_max_silence=3, + mode=0, + ) + data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") # ^ ^ ^ ^ # 1 14 18 28 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} 
".format(len(tokens))) + + self.assertEqual( + len(tokens), + 2, + msg="wrong number of tokens, expected: 2, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2 = tokens[0], tokens[1] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AaaaAaAaaAaAaa", - msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}' ".format(data)) - self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) - self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AaaaAaAaaAaAaa", + msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 1, + msg="wrong start frame for token 1, expected: 1, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 14, + msg="wrong end frame for token 1, expected: 14, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAAAAAAaa", - msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}' ".format(data)) - self.assertEqual(start, 18, msg="wrong start frame for token 2, expected: 18, found: {0} ".format(start)) - self.assertEqual(end, 28, msg="wrong end frame for token 2, expected: 28, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAAAAAAaa", + msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 18, + msg="wrong start frame for token 2, expected: 18, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 28, + msg="wrong end frame for token 2, expected: 28, found: {0} ".format( + end + ), + ) + def test_min_length_1_init_max_length_1(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 1, max_length=1, - max_continuous_silence=0, init_min = 0, - init_max_silence = 0, mode=0) - - - data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=1, + max_length=1, + max_continuous_silence=0, + init_min=0, + init_max_silence=0, + mode=0, + ) + + data_source = StringDataSource( + "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA" + ) + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens))) - - + + self.assertEqual( + len(tokens), + 21, + msg="wrong number of tokens, expected: 21, found: {0} ".format( + len(tokens) + ), + ) + def test_min_length_10_init_max_length_20(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 10, max_length=20, - max_continuous_silence=4, init_min = 3, - init_max_silence = 3, mode=0) - - - data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA") + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=10, + max_length=20, + max_continuous_silence=4, + init_min=3, + init_max_silence=3, + mode=0, + ) + + data_source = StringDataSource( + "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA" + ) # ^ ^ ^ ^ # 1 16 30 45 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 2, + msg="wrong number of tokens, expected: 2, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2 = tokens[0], tokens[1] - - - data = 
''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AaaaAaAaaAaAaaaa", - msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format(data)) - self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) - self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AaaaAaAaaAaAaaaa", + msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 1, + msg="wrong start frame for token 1, expected: 1, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 16, + msg="wrong end frame for token 1, expected: 16, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAAaaAAaaAAA", - msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 30, msg="wrong start frame for token 2, expected: 30, found: {0} ".format(start)) - self.assertEqual(end, 43, msg="wrong end frame for token 2, expected: 43, found: {0} ".format(end)) - - - + self.assertEqual( + data, + "AAAAAaaAAaaAAA", + msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 30, + msg="wrong start frame for token 2, expected: 30, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 43, + msg="wrong end frame for token 2, expected: 43, found: {0} ".format( + end + ), + ) + def test_min_length_4_init_max_length_5(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 4, max_length=5, - max_continuous_silence=4, init_min = 3, - init_max_silence = 3, mode=0) - - - data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa") + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=4, + max_length=5, + max_continuous_silence=4, + init_min=3, + init_max_silence=3, + mode=0, + ) + + data_source = StringDataSource( + "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa" + ) # ^ ^^ ^ ^ ^ ^ ^ # 18 2223 27 32 36 42 46 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 4, + msg="wrong number of tokens, expected: 4, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start)) - self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AAAAA", + msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 18, + msg="wrong start frame for token 1, expected: 18, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 22, + msg="wrong end frame for token 1, expected: 22, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAaa", - msg="wrong data for token 1, expected: 'AAAaa', found: '{0}' 
".format(data)) - self.assertEqual(start, 23, msg="wrong start frame for token 1, expected: 23, found: {0} ".format(start)) - self.assertEqual(end, 27, msg="wrong end frame for token 1, expected: 27, found: {0} ".format(end)) - - - data = ''.join(tok3[0]) + self.assertEqual( + data, + "AAAaa", + msg="wrong data for token 1, expected: 'AAAaa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 23, + msg="wrong start frame for token 1, expected: 23, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 27, + msg="wrong end frame for token 1, expected: 27, found: {0} ".format( + end + ), + ) + + data = "".join(tok3[0]) start = tok3[1] end = tok3[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 32, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) - self.assertEqual(end, 36, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end)) - - - data = ''.join(tok4[0]) + self.assertEqual( + data, + "AAAAA", + msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 32, + msg="wrong start frame for token 1, expected: 1, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 36, + msg="wrong end frame for token 1, expected: 7, found: {0} ".format( + end + ), + ) + + data = "".join(tok4[0]) start = tok4[1] end = tok4[2] - self.assertEqual(data, "AAaaA", - msg="wrong data for token 2, expected: 'AAaaA', found: '{0}' ".format(data)) - self.assertEqual(start, 42, msg="wrong start frame for token 2, expected: 17, found: {0} ".format(start)) - self.assertEqual(end, 46, msg="wrong end frame for token 2, expected: 22, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAaaA", + msg="wrong data for token 2, expected: 'AAaaA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 42, + msg="wrong start frame for token 2, expected: 17, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 46, + msg="wrong end frame for token 2, expected: 22, found: {0} ".format( + end + ), + ) + + class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase): - def setUp(self): self.A_validator = AValidator() - - + def test_min_5_max_10_max_continuous_silence_0(self): - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10, - max_continuous_silence=0, init_min = 3, - init_max_silence = 3, mode=0) - + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=10, + max_continuous_silence=0, + init_min=3, + init_max_silence=3, + mode=0, + ) + data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") # ^ ^ ^ ^ ^ ^ # 3 7 9 14 17 25 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 3, + msg="wrong number of tokens, expected: 3, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start)) - self.assertEqual(end, 7, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + 
data, + "AAAAA", + msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 3, + msg="wrong start frame for token 1, expected: 3, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 7, + msg="wrong end frame for token 1, expected: 7, found: {0} ".format( + end + ), + ) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAAAAA", - msg="wrong data for token 1, expected: 'AAAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 9, msg="wrong start frame for token 1, expected: 9, found: {0} ".format(start)) - self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end)) - - - data = ''.join(tok3[0]) + self.assertEqual( + data, + "AAAAAA", + msg="wrong data for token 1, expected: 'AAAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 9, + msg="wrong start frame for token 1, expected: 9, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 14, + msg="wrong end frame for token 1, expected: 14, found: {0} ".format( + end + ), + ) + + data = "".join(tok3[0]) start = tok3[1] end = tok3[2] - self.assertEqual(data, "AAAAAAAAA", - msg="wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 17, msg="wrong start frame for token 1, expected: 17, found: {0} ".format(start)) - self.assertEqual(end, 25, msg="wrong end frame for token 1, expected: 25, found: {0} ".format(end)) - - - - + self.assertEqual( + data, + "AAAAAAAAA", + msg="wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 17, + msg="wrong start frame for token 1, expected: 17, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 25, + msg="wrong end frame for token 1, expected: 25, found: {0} ".format( + end + ), + ) + def test_min_5_max_10_max_continuous_silence_1(self): - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10, - max_continuous_silence=1, init_min = 3, - init_max_silence = 3, mode=0) - + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=10, + max_continuous_silence=1, + init_min=3, + init_max_silence=3, + mode=0, + ) + data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") # ^ ^^ ^ ^ ^ # 3 12131517 26 # (12 13 15 17) - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 3, + msg="wrong number of tokens, expected: 3, found: {0} ".format( + len(tokens) + ), + ) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAAaAAAA", - msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start)) - self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 10, found: {0} ".format(end)) - - - data = ''.join(tok2[0]) + self.assertEqual( + data, + "AAAAAaAAAA", + msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 3, + msg="wrong start frame for token 1, expected: 3, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 12, + msg="wrong end frame for token 1, expected: 10, found: {0} ".format( + end + ), + 
) + + data = "".join(tok2[0]) start = tok2[1] end = tok2[2] - self.assertEqual(data, "AAa", - msg="wrong data for token 1, expected: 'AAa', found: '{0}' ".format(data)) - self.assertEqual(start, 13, msg="wrong start frame for token 1, expected: 9, found: {0} ".format(start)) - self.assertEqual(end, 15, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end)) - - - data = ''.join(tok3[0]) + self.assertEqual( + data, + "AAa", + msg="wrong data for token 1, expected: 'AAa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 13, + msg="wrong start frame for token 1, expected: 9, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 15, + msg="wrong end frame for token 1, expected: 14, found: {0} ".format( + end + ), + ) + + data = "".join(tok3[0]) start = tok3[1] end = tok3[2] - self.assertEqual(data, "AAAAAAAAAa", - msg="wrong data for token 1, expected: 'AAAAAAAAAa', found: '{0}' ".format(data)) - self.assertEqual(start, 17, msg="wrong start frame for token 1, expected: 17, found: {0} ".format(start)) - self.assertEqual(end, 26, msg="wrong end frame for token 1, expected: 26, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAAAAAAa", + msg="wrong data for token 1, expected: 'AAAAAAAAAa', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 17, + msg="wrong start frame for token 1, expected: 17, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 26, + msg="wrong end frame for token 1, expected: 26, found: {0} ".format( + end + ), + ) + + class TestStreamTokenizerModes(unittest.TestCase): - def setUp(self): self.A_validator = AValidator() - + def test_STRICT_MIN_LENGTH(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8, - max_continuous_silence=3, init_min = 3, - init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH) - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=8, + max_continuous_silence=3, + init_min=3, + init_max_silence=3, + mode=StreamTokenizer.STRICT_MIN_LENGTH, + ) + data_source = StringDataSource("aaAAAAAAAAAAAA") # ^ ^ # 2 9 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 1, + msg="wrong number of tokens, expected: 1, found: {0} ".format( + len(tokens) + ), + ) tok1 = tokens[0] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAAAAA", - msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start)) - self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAAAAA", + msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 2, + msg="wrong start frame for token 1, expected: 2, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 9, + msg="wrong end frame for token 1, expected: 9, found: {0} ".format( + end + ), + ) + def test_DROP_TAILING_SILENCE(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10, - max_continuous_silence=2, init_min = 3, - init_max_silence = 3, mode=StreamTokenizer.DROP_TAILING_SILENCE) - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=10, 
+ max_continuous_silence=2, + init_min=3, + init_max_silence=3, + mode=StreamTokenizer.DROP_TRAILING_SILENCE, + ) + data_source = StringDataSource("aaAAAAAaaaaa") # ^ ^ # 2 6 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 1, + msg="wrong number of tokens, expected: 1, found: {0} ".format( + len(tokens) + ), + ) tok1 = tokens[0] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAA", - msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start)) - self.assertEqual(end, 6, msg="wrong end frame for token 1, expected: 6, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAA", + msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 2, + msg="wrong start frame for token 1, expected: 2, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 6, + msg="wrong end frame for token 1, expected: 6, found: {0} ".format( + end + ), + ) + def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8, - max_continuous_silence=3, init_min = 3, - init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE) - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=8, + max_continuous_silence=3, + init_min=3, + init_max_silence=3, + mode=StreamTokenizer.STRICT_MIN_LENGTH + | StreamTokenizer.DROP_TRAILING_SILENCE, + ) + data_source = StringDataSource("aaAAAAAAAAAAAAaa") # ^ ^ # 2 8 - + tokens = tokenizer.tokenize(data_source) - - self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens))) + + self.assertEqual( + len(tokens), + 1, + msg="wrong number of tokens, expected: 1, found: {0} ".format( + len(tokens) + ), + ) tok1 = tokens[0] - - - data = ''.join(tok1[0]) + + data = "".join(tok1[0]) start = tok1[1] end = tok1[2] - self.assertEqual(data, "AAAAAAAA", - msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data)) - self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start)) - self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end)) - - + self.assertEqual( + data, + "AAAAAAAA", + msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format( + data + ), + ) + self.assertEqual( + start, + 2, + msg="wrong start frame for token 1, expected: 2, found: {0} ".format( + start + ), + ) + self.assertEqual( + end, + 9, + msg="wrong end frame for token 1, expected: 9, found: {0} ".format( + end + ), + ) + + class TestStreamTokenizerCallback(unittest.TestCase): - def setUp(self): self.A_validator = AValidator() - + def test_callback(self): - + tokens = [] - + def callback(data, start, end): tokens.append((data, start, end)) - - - tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8, - max_continuous_silence=3, init_min = 3, - init_max_silence = 3, mode=0) - + + tokenizer = StreamTokenizer( + self.A_validator, + min_length=5, + max_length=8, + max_continuous_silence=3, + init_min=3, + init_max_silence=3, + mode=0, + ) + data_source = StringDataSource("aaAAAAAAAAAAAAa") # ^ ^^ ^ # 2 
910 14 - + tokenizer.tokenize(data_source, callback=callback) - - self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens))) - + + self.assertEqual( + len(tokens), + 2, + msg="wrong number of tokens, expected: 1, found: {0} ".format( + len(tokens) + ), + ) if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] + # import sys;sys.argv = ['', 'Test.testName'] unittest.main()
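A hedged usage sketch of the callback path exercised by `test_callback` above; the parameter values and input mirror that test, and `on_token` is an illustrative name, not part of the patch:

.. code:: python

    from auditok import StreamTokenizer, StringDataSource

    def on_token(data, start, end):
        # Called once per delivered token instead of collecting a list.
        print("token [{0}-{1}]: {2}".format(start, end, "".join(data)))

    tokenizer = StreamTokenizer(
        lambda frame: frame == "A",
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
    )
    # The test expects two tokens for this input (frames 2-9 and 10-14).
    tokenizer.tokenize(StringDataSource("aaAAAAAAAAAAAAa"), callback=on_token)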