'''
Unit tests for auditok.StreamTokenizer.

@author: Amine Sehili
September 2015
'''

import unittest
from auditok import StreamTokenizer, StringDataSource, DataValidator


class AValidator(DataValidator):
    """Validator that accepts only the frame "A"."""

    def is_valid(self, frame):
        return frame == "A"


class TestStreamTokenizerInitParams(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    # Completely deactivate init_min and init_max_silence.
    # The tokenizer will only rely on the other parameters.
    # Note that if init_min = 0, the value of init_max_silence
    # will have no effect.
    def test_init_min_0_init_max_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=0,
                                    init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        # expected tokens: frames 1..16 and frames 20..27
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        # tok[0]: data
        # tok[1]: start frame (included)
        # tok[2]: end frame (included)

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAAAA', found: {0} ".format(data))
        self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0} ".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0} ".format(end))

    # A token is considered valid iff the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there
    # are at most 0 consecutive non-valid frames (init_max_silence = 0).
    # In other words, a valid token must start with 3 valid frames.
    def test_init_min_3_init_max_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
        # expected tokens: frames 18..30 and frames 33..37
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start))
        self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 2, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0} ".format(start))
        self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0} ".format(end))

    # A token is considered valid iff the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there
    # are at most 2 consecutive non-valid frames (init_max_silence = 2).
    def test_init_min_3_init_max_silence_2(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=2, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
        # expected tokens: frames 5..16, 19..31 and 35..39
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{0}' ".format(data))
        self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0} ".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data))
        self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0} ".format(start))
        self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0} ".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 35, msg="wrong start frame for token 3, expected: 35, found: {0} ".format(start))
        self.assertEqual(end, 39, msg="wrong end frame for token 3, expected: 39, found: {0} ".format(end))


class TestStreamTokenizerMinMaxLength(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_min_length_6_init_max_length_20(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=6, max_length=20,
                                    max_continuous_silence=2, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        # expected tokens: frames 1..14 and frames 18..28
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}' ".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}' ".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 2, expected: 18, found: {0} ".format(start))
        self.assertEqual(end, 28, msg="wrong end frame for token 2, expected: 28, found: {0} ".format(end))

    def test_min_length_1_init_max_length_1(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=1, max_length=1,
                                    max_continuous_silence=0, init_min=0,
                                    init_max_silence=0, mode=0)

        data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")

        tokens = tokenizer.tokenize(data_source)

        # every single valid frame becomes a one-frame token
        self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens)))

    def test_min_length_10_init_max_length_20(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=10, max_length=20,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
        # expected tokens: frames 1..16 and frames 30..43
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAaaAAaaAAA",
                         msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 30, msg="wrong start frame for token 2, expected: 30, found: {0} ".format(start))
        self.assertEqual(end, 43, msg="wrong end frame for token 2, expected: 43, found: {0} ".format(end))

    def test_min_length_4_init_max_length_5(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=4, max_length=5,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
        # expected tokens: frames 18..22, 23..27, 32..36 and 42..46
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0} ".format(len(tokens)))
        tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start))
        self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAaa",
                         msg="wrong data for token 2, expected: 'AAAaa', found: '{0}' ".format(data))
        self.assertEqual(start, 23, msg="wrong start frame for token 2, expected: 23, found: {0} ".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0} ".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 32, msg="wrong start frame for token 3, expected: 32, found: {0} ".format(start))
        self.assertEqual(end, 36, msg="wrong end frame for token 3, expected: 36, found: {0} ".format(end))

        data = ''.join(tok4[0])
        start = tok4[1]
        end = tok4[2]
        self.assertEqual(data, "AAaaA",
                         msg="wrong data for token 4, expected: 'AAaaA', found: '{0}' ".format(data))
        self.assertEqual(start, 42, msg="wrong start frame for token 4, expected: 42, found: {0} ".format(start))
        self.assertEqual(end, 46, msg="wrong end frame for token 4, expected: 46, found: {0} ".format(end))


class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_min_5_max_10_max_continuous_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=0, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # expected tokens: frames 3..7, 9..14 and 17..25
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start))
        self.assertEqual(end, 7, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 9, msg="wrong start frame for token 2, expected: 9, found: {0} ".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 2, expected: 14, found: {0} ".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAA",
                         msg="wrong data for token 3, expected: 'AAAAAAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0} ".format(start))
        self.assertEqual(end, 25, msg="wrong end frame for token 3, expected: 25, found: {0} ".format(end))

    def test_min_5_max_10_max_continuous_silence_1(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=1, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # expected tokens: frames 3..12, 13..15 and 17..26
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAaAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start))
        self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 12, found: {0} ".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAa",
                         msg="wrong data for token 2, expected: 'AAa', found: '{0}' ".format(data))
        self.assertEqual(start, 13, msg="wrong start frame for token 2, expected: 13, found: {0} ".format(start))
        self.assertEqual(end, 15, msg="wrong end frame for token 2, expected: 15, found: {0} ".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAAa",
                         msg="wrong data for token 3, expected: 'AAAAAAAAAa', found: '{0}' ".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0} ".format(start))
        self.assertEqual(end, 26, msg="wrong end frame for token 3, expected: 26, found: {0} ".format(end))


class TestStreamTokenizerModes(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_STRICT_MIN_LENGTH(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3, mode=StreamTokenizer.STRICT_MIN_LENGTH)

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        # expected token: frames 2..9; the remainder (< min_length) is dropped
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end))

    def test_DROP_TAILING_SILENCE(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=2, init_min=3,
                                    init_max_silence=3, mode=StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAaaaaa")
        # expected token: frames 2..6 (trailing silence dropped)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
        self.assertEqual(end, 6, msg="wrong end frame for token 1, expected: 6, found: {0} ".format(end))

    def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        # expected token: frames 2..9
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end))


class TestStreamTokenizerCallback(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):

        tokens = []

        def callback(data, start, end):
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        # expected tokens delivered to the callback: frames 2..9 and 10..14
        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))


if __name__ == "__main__":
    unittest.main()