import os

import pytest

from auditok import DataValidator, StreamTokenizer, StringDataSource


class AValidator(DataValidator):
    def is_valid(self, frame):
        return frame == "A"


@pytest.fixture
def validator():
    return AValidator()


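# Each token produced by StreamTokenizer.tokenize() is handled below as a
# (data, start, end) triple: the list of accepted frames followed by the
# start and end frame indices. With AValidator, "A" frames are valid and
# lowercase "a" frames count as silence.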
def test_init_min_0_init_max_silence_0(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
    # expected token boundaries: (1, 16), (20, 27)
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 2, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 20
    ), "wrong start frame for token 2, expected: 20, found: {}".format(start)
    assert (
        end == 27
    ), "wrong end frame for token 2, expected: 27, found: {}".format(end)


def test_init_min_3_init_max_silence_0(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
    # expected token boundaries: (18, 30), (33, 37)

    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), "wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: {}".format(
        data
    )
    assert (
        start == 18
    ), "wrong start frame for token 1, expected: 18, found: {}".format(start)
    assert (
        end == 30
    ), "wrong end frame for token 1, expected: 30, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 2, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 33
    ), "wrong start frame for token 2, expected: 33, found: {}".format(start)
    assert (
        end == 37
    ), "wrong end frame for token 2, expected: 37, found: {}".format(end)


def test_init_min_3_init_max_silence_2(validator):
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=2,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
    # expected token boundaries: (5, 16), (19, 31), (35, 39)
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 5
    ), "wrong start frame for token 1, expected: 5, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), "wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: {}".format(
        data
    )
    assert (
        start == 19
    ), "wrong start frame for token 2, expected: 19, found: {}".format(start)
    assert (
        end == 31
    ), "wrong end frame for token 2, expected: 31, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 35
    ), "wrong start frame for token 3, expected: 35, found: {}".format(start)
    assert (
        end == 39
    ), "wrong end frame for token 3, expected: 39, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length(validator):
    return StreamTokenizer(
        validator,
        min_length=6,
        max_length=20,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_6_init_max_length_20(tokenizer_min_max_length):
    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
    # expected token boundaries: (1, 14), (18, 28)

    tokens = tokenizer_min_max_length.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 14
    ), "wrong end frame for token 1, expected: 14, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaa"
    ), "wrong data for token 2, expected: 'AAAAAAAAAaa', found: {}".format(data)
    assert (
        start == 18
    ), "wrong start frame for token 2, expected: 18, found: {}".format(start)
    assert (
        end == 28
    ), "wrong end frame for token 2, expected: 28, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length_1_1(validator):
    return StreamTokenizer(
        validator,
        min_length=1,
        max_length=1,
        max_continuous_silence=0,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )


def test_min_length_1_init_max_length_1(tokenizer_min_max_length_1_1):
    data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")

    tokens = tokenizer_min_max_length_1_1.tokenize(data_source)

    assert (
        len(tokens) == 21
    ), "wrong number of tokens, expected: 21, found: {}".format(len(tokens))


@pytest.fixture
def tokenizer_min_max_length_10_20(validator):
    return StreamTokenizer(
        validator,
        min_length=10,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_10_init_max_length_20(tokenizer_min_max_length_10_20):
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
    )
    # expected token boundaries: (1, 16), (30, 43)

    tokens = tokenizer_min_max_length_10_20.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAaaAAaaAAA"
    ), "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: {}".format(
        data
    )
    assert (
        start == 30
    ), "wrong start frame for token 2, expected: 30, found: {}".format(start)
    assert (
        end == 43
    ), "wrong end frame for token 2, expected: 43, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length_4_5(validator):
    return StreamTokenizer(
        validator,
        min_length=4,
        max_length=5,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_4_init_max_length_5(tokenizer_min_max_length_4_5):
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
    )
    # expected token boundaries: (18, 22), (23, 27), (32, 36), (42, 46)

    tokens = tokenizer_min_max_length_4_5.tokenize(data_source)

    assert (
        len(tokens) == 4
    ), "wrong number of tokens, expected: 4, found: {}".format(len(tokens))
    tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 18
    ), "wrong start frame for token 1, expected: 18, found: {}".format(start)
    assert (
        end == 22
    ), "wrong end frame for token 1, expected: 22, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAaa"
    ), "wrong data for token 2, expected: 'AAAaa', found: {}".format(data)
    assert (
        start == 23
    ), "wrong start frame for token 2, expected: 23, found: {}".format(start)
    assert (
        end == 27
    ), "wrong end frame for token 2, expected: 27, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 32
    ), "wrong start frame for token 3, expected: 32, found: {}".format(start)
    assert (
        end == 36
    ), "wrong end frame for token 3, expected: 36, found: {}".format(end)

    data = "".join(tok4[0])
    start = tok4[1]
    end = tok4[2]
    assert (
        data == "AAaaA"
    ), "wrong data for token 4, expected: 'AAaaA', found: {}".format(data)
    assert (
        start == 42
    ), "wrong start frame for token 4, expected: 42, found: {}".format(start)
    assert (
        end == 46
    ), "wrong end frame for token 4, expected: 46, found: {}".format(end)


@pytest.fixture
def tokenizer_max_continuous_silence_0(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=0,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_0(
    tokenizer_max_continuous_silence_0,
):
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    # expected token boundaries: (3, 7), (9, 14), (17, 25)

    tokens = tokenizer_max_continuous_silence_0.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 3
    ), "wrong start frame for token 1, expected: 3, found: {}".format(start)
    assert (
        end == 7
    ), "wrong end frame for token 1, expected: 7, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAA"
    ), "wrong data for token 2, expected: 'AAAAAA', found: {}".format(data)
    assert (
        start == 9
    ), "wrong start frame for token 2, expected: 9, found: {}".format(start)
    assert (
        end == 14
    ), "wrong end frame for token 2, expected: 14, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAA"
    ), "wrong data for token 3, expected: 'AAAAAAAAA', found: {}".format(data)
    assert (
        start == 17
    ), "wrong start frame for token 3, expected: 17, found: {}".format(start)
    assert (
        end == 25
    ), "wrong end frame for token 3, expected: 25, found: {}".format(end)


@pytest.fixture
def tokenizer_max_continuous_silence_1(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=1,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_1(
    tokenizer_max_continuous_silence_1,
):
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    # expected token boundaries: (3, 12), (13, 15), (17, 26)

    tokens = tokenizer_max_continuous_silence_1.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAaAAAA"
    ), "wrong data for token 1, expected: 'AAAAAaAAAA', found: {}".format(data)
    assert (
        start == 3
    ), "wrong start frame for token 1, expected: 3, found: {}".format(start)
    assert (
        end == 12
    ), "wrong end frame for token 1, expected: 12, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAa"
    ), "wrong data for token 2, expected: 'AAa', found: {}".format(data)
    assert (
        start == 13
    ), "wrong start frame for token 2, expected: 13, found: {}".format(start)
    assert (
        end == 15
    ), "wrong end frame for token 2, expected: 15, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAAa"
    ), "wrong data for token 3, expected: 'AAAAAAAAAa', found: {}".format(data)
    assert (
        start == 17
    ), "wrong start frame for token 3, expected: 17, found: {}".format(start)
    assert (
        end == 26
    ), "wrong end frame for token 3, expected: 26, found: {}".format(end)


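# STRICT_MIN_LENGTH, as exercised below: when a token is cut because
# max_length is reached, the immediately following frames are only delivered
# as a new token if they also satisfy min_length (here the 4 "A" frames left
# after the 8-frame token are dropped, so a single token is expected).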
@pytest.fixture
def tokenizer_strict_min_length(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH,
    )


def test_STRICT_MIN_LENGTH(tokenizer_strict_min_length):
    data_source = StringDataSource("aaAAAAAAAAAAAA")
    # expected token boundaries: (2, 9)

    tokens = tokenizer_strict_min_length.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 9
    ), "wrong end frame for token 1, expected: 9, found: {}".format(end)


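# DROP_TRAILING_SILENCE, as exercised below: non-valid frames that would
# otherwise remain at the end of a token (within max_continuous_silence)
# are trimmed, so the expected token ends at the last "A" frame.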
@pytest.fixture
def tokenizer_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_DROP_TRAILING_SILENCE(tokenizer_drop_trailing_silence):
    data_source = StringDataSource("aaAAAAAaaaaa")
    # expected token boundaries: (2, 6)

    tokens = tokenizer_drop_trailing_silence.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 6
    ), "wrong end frame for token 1, expected: 6, found: {}".format(end)


@pytest.fixture
def tokenizer_strict_min_and_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH
        | StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(
    tokenizer_strict_min_and_drop_trailing_silence,
):
    data_source = StringDataSource("aaAAAAAAAAAAAAaa")
    # expected token boundaries: (2, 9)

    tokens = tokenizer_strict_min_and_drop_trailing_silence.tokenize(
        data_source
    )

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 9
    ), "wrong end frame for token 1, expected: 9, found: {}".format(end)


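# tokenize() also accepts a callback; each token is then passed to it as
# (data, start, end), which is what the test below collects and counts.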
@pytest.fixture
def tokenizer_callback(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_callback(tokenizer_callback):
    tokens = []

    def callback(data, start, end):
        tokens.append((data, start, end))

    data_source = StringDataSource("aaAAAAAAAAAAAAa")
    # expected token boundaries: (2, 9), (10, 14)

    tokenizer_callback.tokenize(data_source, callback=callback)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))