amine@403: import os amine@403: amine@400: import pytest amine@403: amine@403: from auditok import DataValidator, StreamTokenizer, StringDataSource amine@2: amine@2: amine@2: class AValidator(DataValidator): amine@2: def is_valid(self, frame): amine@2: return frame == "A" amine@2: amine@2: amine@400: @pytest.fixture amine@400: def validator(): amine@400: return AValidator() amine@297: amine@297: amine@400: def test_init_min_0_init_max_silence_0(validator): amine@400: tokenizer = StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=20, amine@400: max_continuous_silence=4, amine@400: init_min=0, amine@400: init_max_silence=0, amine@400: mode=0, amine@400: ) amine@297: amine@400: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") amine@400: # ^ ^ ^ ^ amine@400: # 2 16 20 27 amine@400: tokens = tokenizer.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 2 amine@403: ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens)) amine@400: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AaaaAaAaaAaAaaaa" amine@403: ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 1 amine@403: ), "wrong start frame for token 1, expected: 1, found: {}".format(start) amine@403: assert ( amine@403: end == 16 amine@403: ), "wrong end frame for token 1, expected: 16, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAAAAA" amine@403: ), "wrong data for token 2, expected: 'AAAAAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 20 amine@403: ), "wrong start frame for token 2, expected: 20, found: {}".format(start) amine@403: assert ( amine@403: end == 27 amine@403: ), "wrong end frame for token 2, expected: 27, found: {}".format(end) amine@297: amine@297: amine@400: def test_init_min_3_init_max_silence_0(validator): amine@400: tokenizer = StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=20, amine@400: max_continuous_silence=4, amine@400: init_min=3, amine@400: init_max_silence=0, amine@400: mode=0, amine@400: ) amine@297: amine@400: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA") amine@400: # ^ ^ ^ ^ amine@400: # 18 30 33 37 amine@297: amine@400: tokens = tokenizer.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 2 amine@403: ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens)) amine@400: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAAAAAAaaaa" amine@403: ), "wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 18 amine@403: ), "wrong start frame for token 1, expected: 18, found: {}".format(start) amine@403: assert ( amine@403: end == 30 amine@403: ), "wrong end frame for token 1, expected: 30, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 2, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 33 amine@403: ), "wrong start frame for token 2, expected: 33, found: {}".format(start) amine@403: assert ( amine@403: end == 37 amine@403: ), "wrong end frame for token 2, expected: 37, found: {}".format(end) amine@297: amine@297: amine@400: def test_init_min_3_init_max_silence_2(validator): amine@400: tokenizer = StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=20, amine@400: max_continuous_silence=4, amine@400: init_min=3, amine@400: init_max_silence=2, amine@400: mode=0, amine@400: ) amine@297: amine@400: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA") amine@400: # ^ ^ ^ ^ ^ ^ amine@400: # 5 16 19 31 35 39 amine@400: tokens = tokenizer.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 3 amine@403: ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens)) amine@400: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AaAaaAaAaaaa" amine@403: ), "wrong data for token 1, expected: 'AaAaaAaA', found: {}".format(data) amine@400: assert ( amine@400: start == 5 amine@403: ), "wrong start frame for token 1, expected: 5, found: {}".format(start) amine@403: assert ( amine@403: end == 16 amine@403: ), "wrong end frame for token 1, expected: 16, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAAAAAAaaaa" amine@403: ), "wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 19 amine@403: ), "wrong start frame for token 2, expected: 19, found: {}".format(start) amine@403: assert ( amine@403: end == 31 amine@403: ), "wrong end frame for token 2, expected: 31, found: {}".format(end) amine@297: amine@400: data = "".join(tok3[0]) amine@400: start = tok3[1] amine@400: end = tok3[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 35 amine@403: ), "wrong start frame for token 3, expected: 35, found: {}".format(start) amine@403: assert ( amine@403: end == 39 amine@403: ), "wrong end frame for token 3, expected: 39, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_min_max_length(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=6, amine@400: max_length=20, amine@400: max_continuous_silence=2, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_min_length_6_init_max_length_20(tokenizer_min_max_length): amine@400: data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") amine@400: # ^ ^ ^ ^ amine@400: # 1 14 18 28 amine@297: amine@400: tokens = tokenizer_min_max_length.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 2 amine@403: ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens)) amine@400: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AaaaAaAaaAaAaa" amine@403: ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 1 amine@403: ), "wrong start frame for token 1, expected: 1, found: {}".format(start) amine@403: assert ( amine@403: end == 14 amine@403: ), "wrong end frame for token 1, expected: 14, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAAAAAAaa" amine@403: ), "wrong data for token 2, expected: 'AAAAAAAAAaa', found: {}".format(data) amine@400: assert ( amine@400: start == 18 amine@403: ), "wrong start frame for token 2, expected: 18, found: {}".format(start) amine@403: assert ( amine@403: end == 28 amine@403: ), "wrong end frame for token 2, expected: 28, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_min_max_length_1_1(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=1, amine@400: max_length=1, amine@400: max_continuous_silence=0, amine@400: init_min=0, amine@400: init_max_silence=0, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_min_length_1_init_max_length_1(tokenizer_min_max_length_1_1): amine@400: data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") amine@297: amine@400: tokens = tokenizer_min_max_length_1_1.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 21 amine@403: ), "wrong number of tokens, expected: 21, found: {}".format(len(tokens)) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_min_max_length_10_20(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=10, amine@400: max_length=20, amine@400: max_continuous_silence=4, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_min_length_10_init_max_length_20(tokenizer_min_max_length_10_20): amine@400: data_source = StringDataSource( amine@400: "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA" amine@400: ) amine@400: # ^ ^ ^ ^ amine@400: # 1 16 30 45 amine@297: amine@400: tokens = tokenizer_min_max_length_10_20.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 2 amine@403: ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens)) amine@400: tok1, tok2 = tokens[0], tokens[1] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AaaaAaAaaAaAaaaa" amine@403: ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 1 amine@403: ), "wrong start frame for token 1, expected: 1, found: {}".format(start) amine@403: assert ( amine@403: end == 16 amine@403: ), "wrong end frame for token 1, expected: 16, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAAaaAAaaAAA" amine@403: ), "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: {}".format( amine@403: data amine@403: ) amine@400: assert ( amine@400: start == 30 amine@403: ), "wrong start frame for token 2, expected: 30, found: {}".format(start) amine@403: assert ( amine@403: end == 43 amine@403: ), "wrong end frame for token 2, expected: 43, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_min_max_length_4_5(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=4, amine@400: max_length=5, amine@400: max_continuous_silence=4, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_min_length_4_init_max_length_5(tokenizer_min_max_length_4_5): amine@400: data_source = StringDataSource( amine@400: "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa" amine@400: ) amine@400: # ^ ^^ ^ ^ ^ ^ ^ amine@400: # 18 2223 27 32 36 42 46 amine@297: amine@400: tokens = tokenizer_min_max_length_4_5.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 4 amine@403: ), "wrong number of tokens, expected: 4, found: {}".format(len(tokens)) amine@400: tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 18 amine@403: ), "wrong start frame for token 1, expected: 18, found: {}".format(start) amine@403: assert ( amine@403: end == 22 amine@403: ), "wrong end frame for token 1, expected: 22, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAaa" amine@403: ), "wrong data for token 2, expected: 'AAAaa', found: {}".format(data) amine@400: assert ( amine@400: start == 23 amine@403: ), "wrong start frame for token 2, expected: 23, found: {}".format(start) amine@403: assert ( amine@403: end == 27 amine@403: ), "wrong end frame for token 2, expected: 27, found: {}".format(end) amine@297: amine@400: data = "".join(tok3[0]) amine@400: start = tok3[1] amine@400: end = tok3[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 32 amine@403: ), "wrong start frame for token 3, expected: 32, found: {}".format(start) amine@403: assert ( amine@403: end == 36 amine@403: ), "wrong end frame for token 3, expected: 36, found: {}".format(end) amine@297: amine@400: data = "".join(tok4[0]) amine@400: start = tok4[1] amine@400: end = tok4[2] amine@400: assert ( amine@400: data == "AAaaA" amine@403: ), "wrong data for token 4, expected: 'AAaaA', found: {}".format(data) amine@400: assert ( amine@400: start == 42 amine@403: ), "wrong start frame for token 4, expected: 42, found: {}".format(start) amine@403: assert ( amine@403: end == 46 amine@403: ), "wrong end frame for token 4, expected: 46, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_max_continuous_silence_0(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=10, amine@400: max_continuous_silence=0, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@2: amine@297: amine@400: def test_min_5_max_10_max_continuous_silence_0( amine@400: tokenizer_max_continuous_silence_0, amine@400: ): amine@400: data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") amine@400: # ^ ^ ^ ^ ^ ^ amine@400: # 3 7 9 14 17 25 amine@297: amine@400: tokens = tokenizer_max_continuous_silence_0.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 3 amine@403: ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens)) amine@400: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 3 amine@403: ), "wrong start frame for token 1, expected: 3, found: {}".format(start) amine@403: assert ( amine@403: end == 7 amine@403: ), "wrong end frame for token 1, expected: 7, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAAAAA" amine@403: ), "wrong data for token 2, expected: 'AAAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 9 amine@403: ), "wrong start frame for token 2, expected: 9, found: {}".format(start) amine@403: assert ( amine@403: end == 14 amine@403: ), "wrong end frame for token 2, expected: 14, found: {}".format(end) amine@297: amine@400: data = "".join(tok3[0]) amine@400: start = tok3[1] amine@400: end = tok3[2] amine@400: assert ( amine@400: data == "AAAAAAAAA" amine@403: ), "wrong data for token 3, expected: 'AAAAAAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 17 amine@403: ), "wrong start frame for token 3, expected: 17, found: {}".format(start) amine@403: assert ( amine@403: end == 25 amine@403: ), "wrong end frame for token 3, expected: 25, found: {}".format(end) amine@297: amine@2: amine@400: @pytest.fixture amine@400: def tokenizer_max_continuous_silence_1(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=10, amine@400: max_continuous_silence=1, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_min_5_max_10_max_continuous_silence_1( amine@400: tokenizer_max_continuous_silence_1, amine@400: ): amine@400: data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") amine@400: # ^ ^^ ^ ^ ^ amine@400: # 3 12131517 26 amine@400: # (12 13 15 17) amine@297: amine@400: tokens = tokenizer_max_continuous_silence_1.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 3 amine@403: ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens)) amine@400: tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAAaAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAAaAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 3 amine@403: ), "wrong start frame for token 1, expected: 3, found: {}".format(start) amine@403: assert ( amine@403: end == 12 amine@403: ), "wrong end frame for token 1, expected: 12, found: {}".format(end) amine@297: amine@400: data = "".join(tok2[0]) amine@400: start = tok2[1] amine@400: end = tok2[2] amine@400: assert ( amine@400: data == "AAa" amine@403: ), "wrong data for token 2, expected: 'AAa', found: {}".format(data) amine@400: assert ( amine@400: start == 13 amine@403: ), "wrong start frame for token 2, expected: 13, found: {}".format(start) amine@403: assert ( amine@403: end == 15 amine@403: ), "wrong end frame for token 2, expected: 15, found: {}".format(end) amine@297: amine@400: data = "".join(tok3[0]) amine@400: start = tok3[1] amine@400: end = tok3[2] amine@400: assert ( amine@400: data == "AAAAAAAAAa" amine@403: ), "wrong data for token 3, expected: 'AAAAAAAAAa', found: {}".format(data) amine@400: assert ( amine@400: start == 17 amine@403: ), "wrong start frame for token 3, expected: 17, found: {}".format(start) amine@403: assert ( amine@403: end == 26 amine@403: ), "wrong end frame for token 3, expected: 26, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_strict_min_length(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=8, amine@400: max_continuous_silence=3, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=StreamTokenizer.STRICT_MIN_LENGTH, amine@400: ) amine@297: amine@297: amine@400: def test_STRICT_MIN_LENGTH(tokenizer_strict_min_length): amine@400: data_source = StringDataSource("aaAAAAAAAAAAAA") amine@400: # ^ ^ amine@400: # 2 9 amine@297: amine@400: tokens = tokenizer_strict_min_length.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 1 amine@403: ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens)) amine@400: tok1 = tokens[0] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 2 amine@403: ), "wrong start frame for token 1, expected: 2, found: {}".format(start) amine@403: assert ( amine@403: end == 9 amine@403: ), "wrong end frame for token 1, expected: 9, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_drop_trailing_silence(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=10, amine@400: max_continuous_silence=2, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=StreamTokenizer.DROP_TRAILING_SILENCE, amine@400: ) amine@297: amine@297: amine@400: def test_DROP_TAILING_SILENCE(tokenizer_drop_trailing_silence): amine@400: data_source = StringDataSource("aaAAAAAaaaaa") amine@400: # ^ ^ amine@400: # 2 6 amine@297: amine@400: tokens = tokenizer_drop_trailing_silence.tokenize(data_source) amine@297: amine@400: assert ( amine@400: len(tokens) == 1 amine@403: ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens)) amine@400: tok1 = tokens[0] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 2 amine@403: ), "wrong start frame for token 1, expected: 2, found: {}".format(start) amine@403: assert ( amine@403: end == 6 amine@403: ), "wrong end frame for token 1, expected: 6, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_strict_min_and_drop_trailing_silence(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=8, amine@400: max_continuous_silence=3, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=StreamTokenizer.STRICT_MIN_LENGTH amine@400: | StreamTokenizer.DROP_TRAILING_SILENCE, amine@400: ) amine@297: amine@297: amine@400: def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE( amine@400: tokenizer_strict_min_and_drop_trailing_silence, amine@400: ): amine@400: data_source = StringDataSource("aaAAAAAAAAAAAAaa") amine@400: # ^ ^ amine@400: # 2 8 amine@297: amine@400: tokens = tokenizer_strict_min_and_drop_trailing_silence.tokenize( amine@400: data_source amine@400: ) amine@297: amine@400: assert ( amine@400: len(tokens) == 1 amine@403: ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens)) amine@400: tok1 = tokens[0] amine@297: amine@400: data = "".join(tok1[0]) amine@400: start = tok1[1] amine@400: end = tok1[2] amine@400: assert ( amine@400: data == "AAAAAAAA" amine@403: ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data) amine@400: assert ( amine@400: start == 2 amine@403: ), "wrong start frame for token 1, expected: 2, found: {}".format(start) amine@403: assert ( amine@403: end == 9 amine@403: ), "wrong end frame for token 1, expected: 9, found: {}".format(end) amine@297: amine@297: amine@400: @pytest.fixture amine@400: def tokenizer_callback(validator): amine@400: return StreamTokenizer( amine@400: validator, amine@400: min_length=5, amine@400: max_length=8, amine@400: max_continuous_silence=3, amine@400: init_min=3, amine@400: init_max_silence=3, amine@400: mode=0, amine@400: ) amine@297: amine@297: amine@400: def test_callback(tokenizer_callback): amine@400: tokens = [] amine@297: amine@400: def callback(data, start, end): amine@400: tokens.append((data, start, end)) amine@297: amine@400: data_source = StringDataSource("aaAAAAAAAAAAAAa") amine@400: # ^ ^^ ^ amine@400: # 2 910 14 amine@297: amine@400: tokenizer_callback.tokenize(data_source, callback=callback) amine@2: amine@400: assert ( amine@400: len(tokens) == 2 amine@403: ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))