amine@297: """ amine@2: @author: Amine Sehili amine@2: September 2015 amine@2: amine@297: """ amine@2: amine@2: import unittest amine@2: from auditok import StreamTokenizer, StringDataSource, DataValidator amine@2: amine@2: amine@2: class AValidator(DataValidator): amine@2: def is_valid(self, frame): amine@2: return frame == "A" amine@2: amine@2: amine@2: class TestStreamTokenizerInitParams(unittest.TestCase): amine@2: def setUp(self): amine@2: self.A_validator = AValidator() amine@297: amine@2: # Completely deactivate init_min and init_max_silence amine@2: # The tokenizer will only rely on the other parameters amine@2: # Note that if init_min = 0, the value of init_max_silence amine@2: # will have no effect amine@2: def test_init_min_0_init_max_silence_0(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=20, amine@297: max_continuous_silence=4, amine@297: init_min=0, amine@297: init_max_silence=0, amine@297: mode=0, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") amine@19: # ^ ^ ^ ^ amine@19: # 2 16 20 27 amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 2, amine@297: msg="wrong number of tokens, expected: 2, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@2: # tok1[0]: data amine@2: # tok1[1]: start frame (included) amine@2: # tok1[2]: end frame (included) amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AaaaAaAaaAaAaaaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " amine@334: "found: {0} " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 1, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 1, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 16, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 16, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAA', found: {0} " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 20, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 20, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 27, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 27, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@5: # A valid token is considered as so iff the tokenizer encounters amine@2: # at least valid frames (init_min = 3) between witch there amine@2: # are at most 0 consecutive non valid frames (init_max_silence = 0) amine@2: # The tokenizer will only rely on the other parameters amine@2: # In other words, a valid token must start with 3 valid frames amine@2: def test_init_min_3_init_max_silence_0(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=20, amine@297: max_continuous_silence=4, amine@297: init_min=3, amine@297: init_max_silence=0, amine@297: mode=0, amine@297: ) amine@297: amine@297: data_source = StringDataSource( amine@297: "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA" amine@297: ) amine@334: # ^ ^ ^ ^ amine@334: # 18 30 33 37 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 2, amine@297: msg="wrong number of tokens, expected: 2, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAAAaaaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAAAaaaa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 18, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 18, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 30, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 30, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 33, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 33, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 37, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 37, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@2: # A valid token is considered iff the tokenizer encounters amine@2: # at least valid frames (init_min = 3) between witch there amine@2: # are at most 2 consecutive non valid frames (init_max_silence = 2) amine@2: def test_init_min_3_init_max_silence_2(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=20, amine@297: max_continuous_silence=4, amine@297: init_min=3, amine@297: init_max_silence=2, amine@297: mode=0, amine@297: ) amine@297: amine@297: data_source = StringDataSource( amine@297: "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA" amine@297: ) amine@334: # ^ ^ ^ ^ ^ ^ amine@334: # 5 16 19 31 35 39 amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 3, amine@297: msg="wrong number of tokens, expected: 3, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AaAaaAaAaaaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 5, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 5, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 16, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 16, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAAAaaaa", amine@334: msg=( amine@334: "wrong data for token 2, expected: 'AAAAAAAAAaaaa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 19, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 19, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 31, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 31, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok3[0]) amine@2: start = tok3[1] amine@2: end = tok3[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 3, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 35, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 35, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 39, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 39, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: amine@2: class TestStreamTokenizerMinMaxLength(unittest.TestCase): amine@2: def setUp(self): amine@2: self.A_validator = AValidator() amine@297: amine@2: def test_min_length_6_init_max_length_20(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=6, amine@297: max_length=20, amine@297: max_continuous_silence=2, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") amine@5: # ^ ^ ^ ^ amine@5: # 1 14 18 28 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 2, amine@297: msg="wrong number of tokens, expected: 2, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AaaaAaAaaAaAaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 1, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 1, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 14, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 14, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAAAaa", amine@334: msg=( amine@334: "wrong data for token 2, expected: 'AAAAAAAAAaa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 18, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 18, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 28, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 28, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@2: def test_min_length_1_init_max_length_1(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=1, amine@297: max_length=1, amine@297: max_continuous_silence=0, amine@297: init_min=0, amine@297: init_max_silence=0, amine@297: mode=0, amine@297: ) amine@297: amine@297: data_source = StringDataSource( amine@297: "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA" amine@297: ) amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 21, amine@297: msg="wrong number of tokens, expected: 21, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@297: amine@2: def test_min_length_10_init_max_length_20(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=10, amine@297: max_length=20, amine@297: max_continuous_silence=4, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@297: data_source = StringDataSource( amine@297: "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA" amine@297: ) amine@334: # ^ ^ ^ ^ amine@334: # 1 16 30 45 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 2, amine@297: msg="wrong number of tokens, expected: 2, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AaaaAaAaaAaAaaaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 1, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 1, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 16, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 16, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAaaAAaaAAA", amine@334: msg=( amine@334: "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 30, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 30, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 43, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 43, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@2: def test_min_length_4_init_max_length_5(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=4, amine@297: max_length=5, amine@297: max_continuous_silence=4, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@297: data_source = StringDataSource( amine@297: "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa" amine@297: ) amine@334: # ^ ^^ ^ ^ ^ ^ ^ amine@334: # 18 2223 27 32 36 42 46 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 4, amine@297: msg="wrong number of tokens, expected: 4, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 18, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 18, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 22, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 22, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAaa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAaa', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 23, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 23, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 27, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 27, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok3[0]) amine@2: start = tok3[1] amine@2: end = tok3[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 32, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 1, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 36, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 7, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok4[0]) amine@2: start = tok4[1] amine@2: end = tok4[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAaaA", amine@334: msg=( amine@334: "wrong data for token 2, expected: 'AAaaA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 42, amine@334: msg=( amine@334: "wrong start frame for token 2, expected: 17, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 46, amine@334: msg=( amine@334: "wrong end frame for token 2, expected: 22, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: amine@2: class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase): amine@2: def setUp(self): amine@2: self.A_validator = AValidator() amine@297: amine@2: def test_min_5_max_10_max_continuous_silence_0(self): amine@2: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=10, amine@297: max_continuous_silence=0, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") amine@5: # ^ ^ ^ ^ ^ ^ amine@5: # 3 7 9 14 17 25 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 3, amine@297: msg="wrong number of tokens, expected: 3, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 3, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 3, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 7, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 7, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 9, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 9, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 14, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 14, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok3[0]) amine@2: start = tok3[1] amine@2: end = tok3[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 17, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 17, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 25, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 25, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@2: def test_min_5_max_10_max_continuous_silence_1(self): amine@2: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=10, amine@297: max_continuous_silence=1, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") amine@5: # ^ ^^ ^ ^ ^ amine@5: # 3 12131517 26 amine@5: # (12 13 15 17) amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 3, amine@297: msg="wrong number of tokens, expected: 3, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAaAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAaAAAA', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 3, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 3, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 12, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 10, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok2[0]) amine@2: start = tok2[1] amine@2: end = tok2[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAa', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 13, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 9, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 15, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 14, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: data = "".join(tok3[0]) amine@2: start = tok3[1] amine@2: end = tok3[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAAAa", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAAAa', " amine@334: "found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 17, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 17, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 26, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 26, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: amine@2: class TestStreamTokenizerModes(unittest.TestCase): amine@2: def setUp(self): amine@2: self.A_validator = AValidator() amine@297: amine@2: def test_STRICT_MIN_LENGTH(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=8, amine@297: max_continuous_silence=3, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=StreamTokenizer.STRICT_MIN_LENGTH, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaAAAAAAAAAAAA") amine@5: # ^ ^ amine@5: # 2 9 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 1, amine@297: msg="wrong number of tokens, expected: 1, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1 = tokens[0] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 2, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 2, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 9, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 9, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@3: def test_DROP_TAILING_SILENCE(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=10, amine@297: max_continuous_silence=2, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=StreamTokenizer.DROP_TRAILING_SILENCE, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaAAAAAaaaaa") amine@5: # ^ ^ amine@5: # 2 6 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 1, amine@297: msg="wrong number of tokens, expected: 1, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1 = tokens[0] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 2, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 2, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 6, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 6, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@3: def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=8, amine@297: max_continuous_silence=3, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=StreamTokenizer.STRICT_MIN_LENGTH amine@297: | StreamTokenizer.DROP_TRAILING_SILENCE, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaAAAAAAAAAAAAaa") amine@5: # ^ ^ amine@5: # 2 8 amine@297: amine@2: tokens = tokenizer.tokenize(data_source) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 1, amine@297: msg="wrong number of tokens, expected: 1, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: tok1 = tokens[0] amine@297: amine@297: data = "".join(tok1[0]) amine@2: start = tok1[1] amine@2: end = tok1[2] amine@297: self.assertEqual( amine@297: data, amine@297: "AAAAAAAA", amine@334: msg=( amine@334: "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' " amine@334: ).format(data), amine@297: ) amine@297: self.assertEqual( amine@297: start, amine@297: 2, amine@334: msg=( amine@334: "wrong start frame for token 1, expected: 2, found: {0} " amine@334: ).format(start), amine@297: ) amine@297: self.assertEqual( amine@297: end, amine@297: 9, amine@334: msg=( amine@334: "wrong end frame for token 1, expected: 9, found: {0} " amine@334: ).format(end), amine@297: ) amine@297: amine@297: amine@2: class TestStreamTokenizerCallback(unittest.TestCase): amine@2: def setUp(self): amine@2: self.A_validator = AValidator() amine@297: amine@2: def test_callback(self): amine@297: amine@2: tokens = [] amine@297: amine@2: def callback(data, start, end): amine@2: tokens.append((data, start, end)) amine@297: amine@297: tokenizer = StreamTokenizer( amine@297: self.A_validator, amine@297: min_length=5, amine@297: max_length=8, amine@297: max_continuous_silence=3, amine@297: init_min=3, amine@297: init_max_silence=3, amine@297: mode=0, amine@297: ) amine@297: amine@2: data_source = StringDataSource("aaAAAAAAAAAAAAa") amine@5: # ^ ^^ ^ amine@5: # 2 910 14 amine@297: amine@2: tokenizer.tokenize(data_source, callback=callback) amine@297: amine@297: self.assertEqual( amine@297: len(tokens), amine@297: 2, amine@297: msg="wrong number of tokens, expected: 1, found: {0} ".format( amine@297: len(tokens) amine@297: ), amine@297: ) amine@2: amine@2: amine@2: if __name__ == "__main__": amine@2: unittest.main()