amine@297
|
1 """
|
amine@2
|
2 @author: Amine Sehili <amine.sehili@gmail.com>
|
amine@2
|
3 September 2015
|
amine@2
|
4
|
amine@297
|
5 """
|
amine@2
|
6
|
amine@2
|
7 import unittest
|
amine@2
|
8 from auditok import StreamTokenizer, StringDataSource, DataValidator
|
amine@2
|
9
|
amine@2
|
10
|
amine@2
|
class AValidator(DataValidator):
    """A DataValidator that accepts only the frame "A" as valid."""

    # the single frame value considered valid by this validator
    _VALID_FRAME = "A"

    def is_valid(self, frame):
        """Return True if `frame` is the valid frame, False otherwise."""
        return frame == self._VALID_FRAME
|
amine@2
|
14
|
amine@2
|
15
|
amine@2
|
class TestStreamTokenizerInitParams(unittest.TestCase):
    """Tests for StreamTokenizer's `init_min` and `init_max_silence` parameters."""

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_num_tokens(self, tokens, expected):
        """Check the number of tokens returned by the tokenizer."""
        self.assertEqual(
            len(tokens),
            expected,
            msg="wrong number of tokens, expected: {0}, found: {1}".format(
                expected, len(tokens)
            ),
        )

    def _assert_token(
        self, token, index, expected_data, expected_start, expected_end
    ):
        """Check one token against its expected content and position.

        `token` is a (frames, start_frame, end_frame) tuple; start and end
        frames are both included. `index` is the 1-based token number used
        in failure messages.
        """
        data = "".join(token[0])
        self.assertEqual(
            data,
            expected_data,
            msg="wrong data for token {0}, expected: '{1}', found: '{2}'".format(
                index, expected_data, data
            ),
        )
        self.assertEqual(
            token[1],
            expected_start,
            msg="wrong start frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_start, token[1]
            ),
        )
        self.assertEqual(
            token[2],
            expected_end,
            msg="wrong end frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_end, token[2]
            ),
        )

    # Completely deactivate init_min and init_max_silence.
    # The tokenizer will only rely on the other parameters.
    # Note that if init_min == 0, the value of init_max_silence
    # has no effect.
    def test_init_min_0_init_max_silence_0(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        # expected tokens: frames 1..16 and frames 20..27
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 2)
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaaaa", 1, 16)
        self._assert_token(tokens[1], 2, "AAAAAAAA", 20, 27)

    # A token is considered valid only if the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there are
    # at most 0 consecutive non-valid frames (init_max_silence = 0).
    # In other words, a valid token must start with 3 valid frames.
    def test_init_min_3_init_max_silence_0(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA"
        )
        # expected tokens: frames 18..30 and frames 33..37
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 2)
        self._assert_token(tokens[0], 1, "AAAAAAAAAaaaa", 18, 30)
        self._assert_token(tokens[1], 2, "AAAAA", 33, 37)

    # A token is considered valid only if the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there are
    # at most 2 consecutive non-valid frames (init_max_silence = 2).
    def test_init_min_3_init_max_silence_2(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=2,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA"
        )
        # expected tokens: frames 5..16, 19..31 and 35..39
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 3)
        self._assert_token(tokens[0], 1, "AaAaaAaAaaaa", 5, 16)
        self._assert_token(tokens[1], 2, "AAAAAAAAAaaaa", 19, 31)
        self._assert_token(tokens[2], 3, "AAAAA", 35, 39)
|
amine@297
|
293
|
amine@297
|
294
|
amine@2
|
class TestStreamTokenizerMinMaxLength(unittest.TestCase):
    """Tests for StreamTokenizer's `min_length` and `max_length` parameters."""

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_num_tokens(self, tokens, expected):
        """Check the number of tokens returned by the tokenizer."""
        self.assertEqual(
            len(tokens),
            expected,
            msg="wrong number of tokens, expected: {0}, found: {1}".format(
                expected, len(tokens)
            ),
        )

    def _assert_token(
        self, token, index, expected_data, expected_start, expected_end
    ):
        """Check one token against its expected content and position.

        `token` is a (frames, start_frame, end_frame) tuple; start and end
        frames are both included. `index` is the 1-based token number used
        in failure messages.
        """
        data = "".join(token[0])
        self.assertEqual(
            data,
            expected_data,
            msg="wrong data for token {0}, expected: '{1}', found: '{2}'".format(
                index, expected_data, data
            ),
        )
        self.assertEqual(
            token[1],
            expected_start,
            msg="wrong start frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_start, token[1]
            ),
        )
        self.assertEqual(
            token[2],
            expected_end,
            msg="wrong end frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_end, token[2]
            ),
        )

    def test_min_length_6_init_max_length_20(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=6,
            max_length=20,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        # expected tokens: frames 1..14 and frames 18..28
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 2)
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaa", 1, 14)
        self._assert_token(tokens[1], 2, "AAAAAAAAAaa", 18, 28)

    def test_min_length_1_init_max_length_1(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=1,
            max_length=1,
            max_continuous_silence=0,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA"
        )
        # with min_length == max_length == 1, every single valid frame
        # becomes a token of its own (the input contains 21 'A's)
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 21)

    def test_min_length_10_init_max_length_20(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=10,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
        )
        # expected tokens: frames 1..16 and frames 30..43
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 2)
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaaaa", 1, 16)
        self._assert_token(tokens[1], 2, "AAAAAaaAAaaAAA", 30, 43)

    def test_min_length_4_init_max_length_5(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=4,
            max_length=5,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
        )
        # expected tokens: frames 18..22, 23..27, 32..36 and 42..46
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 4)
        self._assert_token(tokens[0], 1, "AAAAA", 18, 22)
        self._assert_token(tokens[1], 2, "AAAaa", 23, 27)
        self._assert_token(tokens[2], 3, "AAAAA", 32, 36)
        self._assert_token(tokens[3], 4, "AAaaA", 42, 46)
|
amine@297
|
609
|
amine@297
|
610
|
amine@2
|
class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):
    """Tests for StreamTokenizer's `max_continuous_silence` parameter."""

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_num_tokens(self, tokens, expected):
        """Check the number of tokens returned by the tokenizer."""
        self.assertEqual(
            len(tokens),
            expected,
            msg="wrong number of tokens, expected: {0}, found: {1}".format(
                expected, len(tokens)
            ),
        )

    def _assert_token(
        self, token, index, expected_data, expected_start, expected_end
    ):
        """Check one token against its expected content and position.

        `token` is a (frames, start_frame, end_frame) tuple; start and end
        frames are both included. `index` is the 1-based token number used
        in failure messages.
        """
        data = "".join(token[0])
        self.assertEqual(
            data,
            expected_data,
            msg="wrong data for token {0}, expected: '{1}', found: '{2}'".format(
                index, expected_data, data
            ),
        )
        self.assertEqual(
            token[1],
            expected_start,
            msg="wrong start frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_start, token[1]
            ),
        )
        self.assertEqual(
            token[2],
            expected_end,
            msg="wrong end frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_end, token[2]
            ),
        )

    def test_min_5_max_10_max_continuous_silence_0(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=0,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # with no trailing silence allowed, every run of 'A's is cut at
        # the first 'a': expected tokens at frames 3..7, 9..14 and 17..25
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 3)
        self._assert_token(tokens[0], 1, "AAAAA", 3, 7)
        self._assert_token(tokens[1], 2, "AAAAAA", 9, 14)
        self._assert_token(tokens[2], 3, "AAAAAAAAA", 17, 25)

    def test_min_5_max_10_max_continuous_silence_1(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=1,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # one non-valid frame may be kept inside/after a token:
        # expected tokens at frames 3..12, 13..15 and 17..26
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 3)
        self._assert_token(tokens[0], 1, "AAAAAaAAAA", 3, 12)
        self._assert_token(tokens[1], 2, "AAa", 13, 15)
        self._assert_token(tokens[2], 3, "AAAAAAAAAa", 17, 26)
|
amine@297
|
819
|
amine@297
|
820
|
amine@2
|
class TestStreamTokenizerModes(unittest.TestCase):
    """Tests for StreamTokenizer's tokenization modes."""

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_num_tokens(self, tokens, expected):
        """Check the number of tokens returned by the tokenizer."""
        self.assertEqual(
            len(tokens),
            expected,
            msg="wrong number of tokens, expected: {0}, found: {1}".format(
                expected, len(tokens)
            ),
        )

    def _assert_token(
        self, token, index, expected_data, expected_start, expected_end
    ):
        """Check one token against its expected content and position.

        `token` is a (frames, start_frame, end_frame) tuple; start and end
        frames are both included. `index` is the 1-based token number used
        in failure messages.
        """
        data = "".join(token[0])
        self.assertEqual(
            data,
            expected_data,
            msg="wrong data for token {0}, expected: '{1}', found: '{2}'".format(
                index, expected_data, data
            ),
        )
        self.assertEqual(
            token[1],
            expected_start,
            msg="wrong start frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_start, token[1]
            ),
        )
        self.assertEqual(
            token[2],
            expected_end,
            msg="wrong end frame for token {0}, expected: {1}, found: {2}".format(
                index, expected_end, token[2]
            ),
        )

    def test_STRICT_MIN_LENGTH(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        # the first token is truncated at max_length (frames 2..9);
        # with STRICT_MIN_LENGTH the remaining 'A's are dropped because
        # they are fewer than min_length
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 1)
        self._assert_token(tokens[0], 1, "AAAAAAAA", 2, 9)

    def test_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAaaaaa")
        # trailing non-valid frames are stripped from the token:
        # expected token at frames 2..6
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 1)
        self._assert_token(tokens[0], 1, "AAAAA", 2, 6)

    def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH
            | StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        # expected token at frames 2..9 (truncated at max_length; the
        # too-short remainder and the trailing silence are both dropped)
        tokens = tokenizer.tokenize(data_source)

        self._assert_num_tokens(tokens, 1)
        self._assert_token(tokens[0], 1, "AAAAAAAA", 2, 9)
|
amine@297
|
981
|
amine@297
|
982
|
amine@2
|
class TestStreamTokenizerCallback(unittest.TestCase):
    """Tests for delivering tokens through a callback instead of a list."""

    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):
        tokens = []

        def callback(data, start, end):
            # collect delivered tokens so they can be counted below
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        tokenizer.tokenize(data_source, callback=callback)

        # max_length=8 splits the long run of 'A's into two tokens,
        # each delivered through the callback
        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )
|
amine@2
|
1017
|
amine@2
|
1018
|
amine@2
|
if __name__ == "__main__":
    # run all test cases in this module
    unittest.main()
|