import pytest

from auditok import StreamTokenizer, StringDataSource, DataValidator


class AValidator(DataValidator):
    def is_valid(self, frame):
        return frame == "A"


@pytest.fixture
def validator():
    return AValidator()


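# Tokens are unpacked below as (data, start, end): the accepted frames plus the
# first and last frame indices. With init_min=0 and init_max_silence=0, a token
# may start at the very first valid frame.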
def test_init_min_0_init_max_silence_0(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
    # expected token boundaries: frames 1..16 and 20..27
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{data}'"
    assert (
        start == 1
    ), f"wrong start frame for token 1, expected: 1, found: {start}"
    assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAA"
    ), f"wrong data for token 2, expected: 'AAAAAAAA', found: '{data}'"
    assert (
        start == 20
    ), f"wrong start frame for token 2, expected: 20, found: {start}"
    assert end == 27, f"wrong end frame for token 2, expected: 27, found: {end}"


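# With init_min=3 and init_max_silence=0, a token only starts once 3
# consecutive valid frames are seen, so the sparse leading "A" frames are
# skipped.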
def test_init_min_3_init_max_silence_0(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
    # expected token boundaries: frames 18..30 and 33..37

    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), f"wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{data}'"
    assert (
        start == 18
    ), f"wrong start frame for token 1, expected: 18, found: {start}"
    assert end == 30, f"wrong end frame for token 1, expected: 30, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 2, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 33
    ), f"wrong start frame for token 2, expected: 33, found: {start}"
    assert end == 37, f"wrong end frame for token 2, expected: 37, found: {end}"


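# Same as above but with init_max_silence=2: up to 2 silent frames are
# tolerated while collecting the 3 initiating valid frames, so an earlier
# token is detected.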
def test_init_min_3_init_max_silence_2(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=2,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
    # expected token boundaries: frames 5..16, 19..31 and 35..39
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaAaaAaAaaaa"
    ), f"wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{data}'"
    assert (
        start == 5
    ), f"wrong start frame for token 1, expected: 5, found: {start}"
    assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), f"wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{data}'"
    assert (
        start == 19
    ), f"wrong start frame for token 2, expected: 19, found: {start}"
    assert end == 31, f"wrong end frame for token 2, expected: 31, found: {end}"

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 3, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 35
    ), f"wrong start frame for token 3, expected: 35, found: {start}"
    assert end == 39, f"wrong end frame for token 3, expected: 39, found: {end}"


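# min_length=6 with max_continuous_silence=2: tokens end after at most 2
# trailing silent frames.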
@pytest.fixture
def tokenizer_min_max_length(validator):
    return StreamTokenizer(
        validator,
        min_length=6,
        max_length=20,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_6_init_max_length_20(tokenizer_min_max_length):
    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
    # expected token boundaries: frames 1..14 and 18..28

    tokens = tokenizer_min_max_length.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaa"
    ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{data}'"
    assert (
        start == 1
    ), f"wrong start frame for token 1, expected: 1, found: {start}"
    assert end == 14, f"wrong end frame for token 1, expected: 14, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaa"
    ), f"wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{data}'"
    assert (
        start == 18
    ), f"wrong start frame for token 2, expected: 18, found: {start}"
    assert end == 28, f"wrong end frame for token 2, expected: 28, found: {end}"


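# Degenerate configuration min_length=max_length=1 with no silence allowed:
# every valid frame becomes its own token (the input below contains 21 "A"s).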
@pytest.fixture
def tokenizer_min_max_length_1_1(validator):
    return StreamTokenizer(
        validator,
        min_length=1,
        max_length=1,
        max_continuous_silence=0,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )


def test_min_length_1_init_max_length_1(tokenizer_min_max_length_1_1):
    data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")

    tokens = tokenizer_min_max_length_1_1.tokenize(data_source)

    assert (
        len(tokens) == 21
    ), f"wrong number of tokens, expected: 21, found: {len(tokens)}"


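# min_length=10 discards shorter candidate tokens; max_length=20 caps the
# token size.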
@pytest.fixture
def tokenizer_min_max_length_10_20(validator):
    return StreamTokenizer(
        validator,
        min_length=10,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_10_init_max_length_20(tokenizer_min_max_length_10_20):
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
    )
    # expected token boundaries: frames 1..16 and 30..43

    tokens = tokenizer_min_max_length_10_20.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{data}'"
    assert (
        start == 1
    ), f"wrong start frame for token 1, expected: 1, found: {start}"
    assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAaaAAaaAAA"
    ), f"wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{data}'"
    assert (
        start == 30
    ), f"wrong start frame for token 2, expected: 30, found: {start}"
    assert end == 43, f"wrong end frame for token 2, expected: 43, found: {end}"


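# min_length=4 and max_length=5: a long run of valid frames is split into
# consecutive tokens of at most 5 frames.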
@pytest.fixture
def tokenizer_min_max_length_4_5(validator):
    return StreamTokenizer(
        validator,
        min_length=4,
        max_length=5,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_4_init_max_length_5(tokenizer_min_max_length_4_5):
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
    )
    # expected token boundaries: frames 18..22, 23..27, 32..36 and 42..46

    tokens = tokenizer_min_max_length_4_5.tokenize(data_source)

    assert (
        len(tokens) == 4
    ), f"wrong number of tokens, expected: 4, found: {len(tokens)}"
    tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 18
    ), f"wrong start frame for token 1, expected: 18, found: {start}"
    assert end == 22, f"wrong end frame for token 1, expected: 22, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAaa"
    ), f"wrong data for token 2, expected: 'AAAaa', found: '{data}'"
    assert (
        start == 23
    ), f"wrong start frame for token 2, expected: 23, found: {start}"
    assert end == 27, f"wrong end frame for token 2, expected: 27, found: {end}"

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 3, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 32
    ), f"wrong start frame for token 3, expected: 32, found: {start}"
    assert end == 36, f"wrong end frame for token 3, expected: 36, found: {end}"

    data = "".join(tok4[0])
    start = tok4[1]
    end = tok4[2]
    assert (
        data == "AAaaA"
    ), f"wrong data for token 4, expected: 'AAaaA', found: '{data}'"
    assert (
        start == 42
    ), f"wrong start frame for token 4, expected: 42, found: {start}"
    assert end == 46, f"wrong end frame for token 4, expected: 46, found: {end}"


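# max_continuous_silence=0: a single silent frame terminates the current token.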
@pytest.fixture
def tokenizer_max_continuous_silence_0(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=0,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_0(
    tokenizer_max_continuous_silence_0,
):
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    # expected token boundaries: frames 3..7, 9..14 and 17..25

    tokens = tokenizer_max_continuous_silence_0.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 3
    ), f"wrong start frame for token 1, expected: 3, found: {start}"
    assert end == 7, f"wrong end frame for token 1, expected: 7, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAA"
    ), f"wrong data for token 2, expected: 'AAAAAA', found: '{data}'"
    assert (
        start == 9
    ), f"wrong start frame for token 2, expected: 9, found: {start}"
    assert end == 14, f"wrong end frame for token 2, expected: 14, found: {end}"

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAA"
    ), f"wrong data for token 3, expected: 'AAAAAAAAA', found: '{data}'"
    assert (
        start == 17
    ), f"wrong start frame for token 3, expected: 17, found: {start}"
    assert end == 25, f"wrong end frame for token 3, expected: 25, found: {end}"


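# max_continuous_silence=1: one silent frame is tolerated (and kept) inside or
# at the end of a token.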
@pytest.fixture
def tokenizer_max_continuous_silence_1(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=1,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_1(
    tokenizer_max_continuous_silence_1,
):
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    # expected token boundaries: frames 3..12, 13..15 and 17..26

    tokens = tokenizer_max_continuous_silence_1.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAaAAAA"
    ), f"wrong data for token 1, expected: 'AAAAAaAAAA', found: '{data}'"
    assert (
        start == 3
    ), f"wrong start frame for token 1, expected: 3, found: {start}"
    assert end == 12, f"wrong end frame for token 1, expected: 12, found: {end}"

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAa"
    ), f"wrong data for token 2, expected: 'AAa', found: '{data}'"
    assert (
        start == 13
    ), f"wrong start frame for token 2, expected: 13, found: {start}"
    assert end == 15, f"wrong end frame for token 2, expected: 15, found: {end}"

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAAa"
    ), f"wrong data for token 3, expected: 'AAAAAAAAAa', found: '{data}'"
    assert (
        start == 17
    ), f"wrong start frame for token 3, expected: 17, found: {start}"
    assert end == 26, f"wrong end frame for token 3, expected: 26, found: {end}"


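# STRICT_MIN_LENGTH mode: after a token is cut at max_length, the leftover
# frames are only reported if they reach min_length on their own (here they
# do not, so a single token is expected).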
@pytest.fixture
def tokenizer_strict_min_length(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH,
    )


def test_STRICT_MIN_LENGTH(tokenizer_strict_min_length):
    data_source = StringDataSource("aaAAAAAAAAAAAA")
    # expected token boundaries: frames 2..9

    tokens = tokenizer_strict_min_length.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), f"wrong data for token 1, expected: 'AAAAAAAA', found: '{data}'"
    assert (
        start == 2
    ), f"wrong start frame for token 1, expected: 2, found: {start}"
    assert end == 9, f"wrong end frame for token 1, expected: 9, found: {end}"


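# DROP_TRAILING_SILENCE mode: silent frames at the end of a token are dropped
# instead of being kept.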
@pytest.fixture
def tokenizer_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_DROP_TRAILING_SILENCE(tokenizer_drop_trailing_silence):
    data_source = StringDataSource("aaAAAAAaaaaa")
    # expected token boundaries: frames 2..6

    tokens = tokenizer_drop_trailing_silence.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
    assert (
        start == 2
    ), f"wrong start frame for token 1, expected: 2, found: {start}"
    assert end == 6, f"wrong end frame for token 1, expected: 6, found: {end}"


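# Both flags combined: the token is cut at max_length=8, the short leftover
# run is discarded, and trailing silence is dropped.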
@pytest.fixture
def tokenizer_strict_min_and_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH
        | StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(
    tokenizer_strict_min_and_drop_trailing_silence,
):
    data_source = StringDataSource("aaAAAAAAAAAAAAaa")
    # expected token boundaries: frames 2..9

    tokens = tokenizer_strict_min_and_drop_trailing_silence.tokenize(
        data_source
    )

    assert (
        len(tokens) == 1
    ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), f"wrong data for token 1, expected: 'AAAAAAAA', found: '{data}'"
    assert (
        start == 2
    ), f"wrong start frame for token 1, expected: 2, found: {start}"
    assert end == 9, f"wrong end frame for token 1, expected: 9, found: {end}"


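# tokenize() also accepts a callback; each detected token is delivered to it
# as (data, start, end).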
@pytest.fixture
def tokenizer_callback(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_callback(tokenizer_callback):
    tokens = []

    def callback(data, start, end):
        tokens.append((data, start, end))

    data_source = StringDataSource("aaAAAAAAAAAAAAa")
    # expected token boundaries: frames 2..9 and 10..14

    tokenizer_callback.tokenize(data_source, callback=callback)

    assert (
        len(tokens) == 2
    ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
|