Mercurial > hg > auditok
comparison tests/test_StreamTokenizer.py @ 334:f7cbf707a34e
Refactor test_StreamTokenizer
author | Amine Sehili <amine.sehili@gmail.com> |
---|---|
date | Fri, 25 Oct 2019 20:56:12 +0100 |
parents | 7259b1eb9329 |
children | 9f17aa9a4018 |
comparison
equal
deleted
inserted
replaced
333:6fc2d27bd2ef | 334:f7cbf707a34e |
---|---|
55 start = tok1[1] | 55 start = tok1[1] |
56 end = tok1[2] | 56 end = tok1[2] |
57 self.assertEqual( | 57 self.assertEqual( |
58 data, | 58 data, |
59 "AaaaAaAaaAaAaaaa", | 59 "AaaaAaAaaAaAaaaa", |
60 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format( | 60 msg=( |
61 data | 61 "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " |
62 ), | 62 "found: {0} " |
63 ).format(data), | |
63 ) | 64 ) |
64 self.assertEqual( | 65 self.assertEqual( |
65 start, | 66 start, |
66 1, | 67 1, |
67 msg="wrong start frame for token 1, expected: 1, found: {0} ".format( | 68 msg=( |
68 start | 69 "wrong start frame for token 1, expected: 1, found: {0} " |
69 ), | 70 ).format(start), |
70 ) | 71 ) |
71 self.assertEqual( | 72 self.assertEqual( |
72 end, | 73 end, |
73 16, | 74 16, |
74 msg="wrong end frame for token 1, expected: 16, found: {0} ".format( | 75 msg=( |
75 end | 76 "wrong end frame for token 1, expected: 16, found: {0} " |
76 ), | 77 ).format(end), |
77 ) | 78 ) |
78 | 79 |
79 data = "".join(tok2[0]) | 80 data = "".join(tok2[0]) |
80 start = tok2[1] | 81 start = tok2[1] |
81 end = tok2[2] | 82 end = tok2[2] |
82 self.assertEqual( | 83 self.assertEqual( |
83 data, | 84 data, |
84 "AAAAAAAA", | 85 "AAAAAAAA", |
85 msg="wrong data for token 1, expected: 'AAAAAAAA', found: {0} ".format( | 86 msg=( |
86 data | 87 "wrong data for token 1, expected: 'AAAAAAAA', found: {0} " |
87 ), | 88 ).format(data), |
88 ) | 89 ) |
89 self.assertEqual( | 90 self.assertEqual( |
90 start, | 91 start, |
91 20, | 92 20, |
92 msg="wrong start frame for token 2, expected: 20, found: {0} ".format( | 93 msg=( |
93 start | 94 "wrong start frame for token 2, expected: 20, found: {0} " |
94 ), | 95 ).format(start), |
95 ) | 96 ) |
96 self.assertEqual( | 97 self.assertEqual( |
97 end, | 98 end, |
98 27, | 99 27, |
99 msg="wrong end frame for token 2, expected: 27, found: {0} ".format( | 100 msg=( |
100 end | 101 "wrong end frame for token 2, expected: 27, found: {0} " |
101 ), | 102 ).format(end), |
102 ) | 103 ) |
103 | 104 |
104 # A token is considered valid iff the tokenizer encounters | 105 # A token is considered valid iff the tokenizer encounters |
105 # at least 3 valid frames (init_min = 3) between which there | 106 # at least 3 valid frames (init_min = 3) between which there |
106 # are at most 0 consecutive non-valid frames (init_max_silence = 0) | 107 # are at most 0 consecutive non-valid frames (init_max_silence = 0) |
119 ) | 120 ) |
120 | 121 |
121 data_source = StringDataSource( | 122 data_source = StringDataSource( |
122 "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA" | 123 "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA" |
123 ) | 124 ) |
124 # ^ ^ ^ ^ | 125 # ^ ^ ^ ^ |
125 # 18 30 33 37 | 126 # 18 30 33 37 |
126 | 127 |
127 tokens = tokenizer.tokenize(data_source) | 128 tokens = tokenizer.tokenize(data_source) |
128 | 129 |
129 self.assertEqual( | 130 self.assertEqual( |
130 len(tokens), | 131 len(tokens), |
139 start = tok1[1] | 140 start = tok1[1] |
140 end = tok1[2] | 141 end = tok1[2] |
141 self.assertEqual( | 142 self.assertEqual( |
142 data, | 143 data, |
143 "AAAAAAAAAaaaa", | 144 "AAAAAAAAAaaaa", |
144 msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format( | 145 msg=( |
145 data | 146 "wrong data for token 1, expected: 'AAAAAAAAAaaaa', " |
146 ), | 147 "found: '{0}' " |
148 ).format(data), | |
147 ) | 149 ) |
148 self.assertEqual( | 150 self.assertEqual( |
149 start, | 151 start, |
150 18, | 152 18, |
151 msg="wrong start frame for token 1, expected: 18, found: {0} ".format( | 153 msg=( |
152 start | 154 "wrong start frame for token 1, expected: 18, found: {0} " |
153 ), | 155 ).format(start), |
154 ) | 156 ) |
155 self.assertEqual( | 157 self.assertEqual( |
156 end, | 158 end, |
157 30, | 159 30, |
158 msg="wrong end frame for token 1, expected: 30, found: {0} ".format( | 160 msg=( |
159 end | 161 "wrong end frame for token 1, expected: 30, found: {0} " |
160 ), | 162 ).format(end), |
161 ) | 163 ) |
162 | 164 |
163 data = "".join(tok2[0]) | 165 data = "".join(tok2[0]) |
164 start = tok2[1] | 166 start = tok2[1] |
165 end = tok2[2] | 167 end = tok2[2] |
166 self.assertEqual( | 168 self.assertEqual( |
167 data, | 169 data, |
168 "AAAAA", | 170 "AAAAA", |
169 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( | 171 msg=( |
170 data | 172 "wrong data for token 1, expected: 'AAAAA', found: '{0}' " |
171 ), | 173 ).format(data), |
172 ) | 174 ) |
173 self.assertEqual( | 175 self.assertEqual( |
174 start, | 176 start, |
175 33, | 177 33, |
176 msg="wrong start frame for token 2, expected: 33, found: {0} ".format( | 178 msg=( |
177 start | 179 "wrong start frame for token 2, expected: 33, found: {0} " |
178 ), | 180 ).format(start), |
179 ) | 181 ) |
180 self.assertEqual( | 182 self.assertEqual( |
181 end, | 183 end, |
182 37, | 184 37, |
183 msg="wrong end frame for token 2, expected: 37, found: {0} ".format( | 185 msg=( |
184 end | 186 "wrong end frame for token 2, expected: 37, found: {0} " |
185 ), | 187 ).format(end), |
186 ) | 188 ) |
187 | 189 |
188 # A token is considered valid iff the tokenizer encounters | 190 # A token is considered valid iff the tokenizer encounters |
189 # at least 3 valid frames (init_min = 3) between which there | 191 # at least 3 valid frames (init_min = 3) between which there |
190 # are at most 2 consecutive non-valid frames (init_max_silence = 2) | 192 # are at most 2 consecutive non-valid frames (init_max_silence = 2) |
201 ) | 203 ) |
202 | 204 |
203 data_source = StringDataSource( | 205 data_source = StringDataSource( |
204 "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA" | 206 "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA" |
205 ) | 207 ) |
206 # ^ ^ ^ ^ ^ ^ | 208 # ^ ^ ^ ^ ^ ^ |
207 # 5 16 19 31 35 39 | 209 # 5 16 19 31 35 39 |
208 tokens = tokenizer.tokenize(data_source) | 210 tokens = tokenizer.tokenize(data_source) |
209 | 211 |
210 self.assertEqual( | 212 self.assertEqual( |
211 len(tokens), | 213 len(tokens), |
212 3, | 214 3, |
220 start = tok1[1] | 222 start = tok1[1] |
221 end = tok1[2] | 223 end = tok1[2] |
222 self.assertEqual( | 224 self.assertEqual( |
223 data, | 225 data, |
224 "AaAaaAaAaaaa", | 226 "AaAaaAaAaaaa", |
225 msg="wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' ".format( | 227 msg=( |
226 data | 228 "wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' " |
227 ), | 229 ).format(data), |
228 ) | 230 ) |
229 self.assertEqual( | 231 self.assertEqual( |
230 start, | 232 start, |
231 5, | 233 5, |
232 msg="wrong start frame for token 1, expected: 5, found: {0} ".format( | 234 msg=( |
233 start | 235 "wrong start frame for token 1, expected: 5, found: {0} " |
234 ), | 236 ).format(start), |
235 ) | 237 ) |
236 self.assertEqual( | 238 self.assertEqual( |
237 end, | 239 end, |
238 16, | 240 16, |
239 msg="wrong end frame for token 1, expected: 16, found: {0} ".format( | 241 msg=( |
240 end | 242 "wrong end frame for token 1, expected: 16, found: {0} " |
241 ), | 243 ).format(end), |
242 ) | 244 ) |
243 | 245 |
244 data = "".join(tok2[0]) | 246 data = "".join(tok2[0]) |
245 start = tok2[1] | 247 start = tok2[1] |
246 end = tok2[2] | 248 end = tok2[2] |
247 self.assertEqual( | 249 self.assertEqual( |
248 data, | 250 data, |
249 "AAAAAAAAAaaaa", | 251 "AAAAAAAAAaaaa", |
250 msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format( | 252 msg=( |
251 data | 253 "wrong data for token 2, expected: 'AAAAAAAAAaaaa', " |
252 ), | 254 "found: '{0}' " |
255 ).format(data), | |
253 ) | 256 ) |
254 self.assertEqual( | 257 self.assertEqual( |
255 start, | 258 start, |
256 19, | 259 19, |
257 msg="wrong start frame for token 2, expected: 19, found: {0} ".format( | 260 msg=( |
258 start | 261 "wrong start frame for token 2, expected: 19, found: {0} " |
259 ), | 262 ).format(start), |
260 ) | 263 ) |
261 self.assertEqual( | 264 self.assertEqual( |
262 end, | 265 end, |
263 31, | 266 31, |
264 msg="wrong end frame for token 2, expected: 31, found: {0} ".format( | 267 msg=( |
265 end | 268 "wrong end frame for token 2, expected: 31, found: {0} " |
266 ), | 269 ).format(end), |
267 ) | 270 ) |
268 | 271 |
269 data = "".join(tok3[0]) | 272 data = "".join(tok3[0]) |
270 start = tok3[1] | 273 start = tok3[1] |
271 end = tok3[2] | 274 end = tok3[2] |
272 self.assertEqual( | 275 self.assertEqual( |
273 data, | 276 data, |
274 "AAAAA", | 277 "AAAAA", |
275 msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format( | 278 msg=( |
276 data | 279 "wrong data for token 3, expected: 'AAAAA', found: '{0}' " |
277 ), | 280 ).format(data), |
278 ) | 281 ) |
279 self.assertEqual( | 282 self.assertEqual( |
280 start, | 283 start, |
281 35, | 284 35, |
282 msg="wrong start frame for token 2, expected: 35, found: {0} ".format( | 285 msg=( |
283 start | 286 "wrong start frame for token 2, expected: 35, found: {0} " |
284 ), | 287 ).format(start), |
285 ) | 288 ) |
286 self.assertEqual( | 289 self.assertEqual( |
287 end, | 290 end, |
288 39, | 291 39, |
289 msg="wrong end frame for token 2, expected: 39, found: {0} ".format( | 292 msg=( |
290 end | 293 "wrong end frame for token 2, expected: 39, found: {0} " |
291 ), | 294 ).format(end), |
292 ) | 295 ) |
293 | 296 |
294 | 297 |
295 class TestStreamTokenizerMinMaxLength(unittest.TestCase): | 298 class TestStreamTokenizerMinMaxLength(unittest.TestCase): |
296 def setUp(self): | 299 def setUp(self): |
327 start = tok1[1] | 330 start = tok1[1] |
328 end = tok1[2] | 331 end = tok1[2] |
329 self.assertEqual( | 332 self.assertEqual( |
330 data, | 333 data, |
331 "AaaaAaAaaAaAaa", | 334 "AaaaAaAaaAaAaa", |
332 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}' ".format( | 335 msg=( |
333 data | 336 "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', " |
334 ), | 337 "found: '{0}' " |
338 ).format(data), | |
335 ) | 339 ) |
336 self.assertEqual( | 340 self.assertEqual( |
337 start, | 341 start, |
338 1, | 342 1, |
339 msg="wrong start frame for token 1, expected: 1, found: {0} ".format( | 343 msg=( |
340 start | 344 "wrong start frame for token 1, expected: 1, found: {0} " |
341 ), | 345 ).format(start), |
342 ) | 346 ) |
343 self.assertEqual( | 347 self.assertEqual( |
344 end, | 348 end, |
345 14, | 349 14, |
346 msg="wrong end frame for token 1, expected: 14, found: {0} ".format( | 350 msg=( |
347 end | 351 "wrong end frame for token 1, expected: 14, found: {0} " |
348 ), | 352 ).format(end), |
349 ) | 353 ) |
350 | 354 |
351 data = "".join(tok2[0]) | 355 data = "".join(tok2[0]) |
352 start = tok2[1] | 356 start = tok2[1] |
353 end = tok2[2] | 357 end = tok2[2] |
354 self.assertEqual( | 358 self.assertEqual( |
355 data, | 359 data, |
356 "AAAAAAAAAaa", | 360 "AAAAAAAAAaa", |
357 msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}' ".format( | 361 msg=( |
358 data | 362 "wrong data for token 2, expected: 'AAAAAAAAAaa', " |
359 ), | 363 "found: '{0}' " |
364 ).format(data), | |
360 ) | 365 ) |
361 self.assertEqual( | 366 self.assertEqual( |
362 start, | 367 start, |
363 18, | 368 18, |
364 msg="wrong start frame for token 2, expected: 18, found: {0} ".format( | 369 msg=( |
365 start | 370 "wrong start frame for token 2, expected: 18, found: {0} " |
366 ), | 371 ).format(start), |
367 ) | 372 ) |
368 self.assertEqual( | 373 self.assertEqual( |
369 end, | 374 end, |
370 28, | 375 28, |
371 msg="wrong end frame for token 2, expected: 28, found: {0} ".format( | 376 msg=( |
372 end | 377 "wrong end frame for token 2, expected: 28, found: {0} " |
373 ), | 378 ).format(end), |
374 ) | 379 ) |
375 | 380 |
376 def test_min_length_1_init_max_length_1(self): | 381 def test_min_length_1_init_max_length_1(self): |
377 | 382 |
378 tokenizer = StreamTokenizer( | 383 tokenizer = StreamTokenizer( |
412 ) | 417 ) |
413 | 418 |
414 data_source = StringDataSource( | 419 data_source = StringDataSource( |
415 "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA" | 420 "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA" |
416 ) | 421 ) |
417 # ^ ^ ^ ^ | 422 # ^ ^ ^ ^ |
418 # 1 16 30 45 | 423 # 1 16 30 45 |
419 | 424 |
420 tokens = tokenizer.tokenize(data_source) | 425 tokens = tokenizer.tokenize(data_source) |
421 | 426 |
422 self.assertEqual( | 427 self.assertEqual( |
423 len(tokens), | 428 len(tokens), |
432 start = tok1[1] | 437 start = tok1[1] |
433 end = tok1[2] | 438 end = tok1[2] |
434 self.assertEqual( | 439 self.assertEqual( |
435 data, | 440 data, |
436 "AaaaAaAaaAaAaaaa", | 441 "AaaaAaAaaAaAaaaa", |
437 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format( | 442 msg=( |
438 data | 443 "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " |
439 ), | 444 "found: '{0}' " |
445 ).format(data), | |
440 ) | 446 ) |
441 self.assertEqual( | 447 self.assertEqual( |
442 start, | 448 start, |
443 1, | 449 1, |
444 msg="wrong start frame for token 1, expected: 1, found: {0} ".format( | 450 msg=( |
445 start | 451 "wrong start frame for token 1, expected: 1, found: {0} " |
446 ), | 452 ).format(start), |
447 ) | 453 ) |
448 self.assertEqual( | 454 self.assertEqual( |
449 end, | 455 end, |
450 16, | 456 16, |
451 msg="wrong end frame for token 1, expected: 16, found: {0} ".format( | 457 msg=( |
452 end | 458 "wrong end frame for token 1, expected: 16, found: {0} " |
453 ), | 459 ).format(end), |
454 ) | 460 ) |
455 | 461 |
456 data = "".join(tok2[0]) | 462 data = "".join(tok2[0]) |
457 start = tok2[1] | 463 start = tok2[1] |
458 end = tok2[2] | 464 end = tok2[2] |
459 self.assertEqual( | 465 self.assertEqual( |
460 data, | 466 data, |
461 "AAAAAaaAAaaAAA", | 467 "AAAAAaaAAaaAAA", |
462 msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format( | 468 msg=( |
463 data | 469 "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', " |
464 ), | 470 "found: '{0}' " |
471 ).format(data), | |
465 ) | 472 ) |
466 self.assertEqual( | 473 self.assertEqual( |
467 start, | 474 start, |
468 30, | 475 30, |
469 msg="wrong start frame for token 2, expected: 30, found: {0} ".format( | 476 msg=( |
470 start | 477 "wrong start frame for token 2, expected: 30, found: {0} " |
471 ), | 478 ).format(start), |
472 ) | 479 ) |
473 self.assertEqual( | 480 self.assertEqual( |
474 end, | 481 end, |
475 43, | 482 43, |
476 msg="wrong end frame for token 2, expected: 43, found: {0} ".format( | 483 msg=( |
477 end | 484 "wrong end frame for token 2, expected: 43, found: {0} " |
478 ), | 485 ).format(end), |
479 ) | 486 ) |
480 | 487 |
481 def test_min_length_4_init_max_length_5(self): | 488 def test_min_length_4_init_max_length_5(self): |
482 | 489 |
483 tokenizer = StreamTokenizer( | 490 tokenizer = StreamTokenizer( |
491 ) | 498 ) |
492 | 499 |
493 data_source = StringDataSource( | 500 data_source = StringDataSource( |
494 "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa" | 501 "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa" |
495 ) | 502 ) |
496 # ^ ^^ ^ ^ ^ ^ ^ | 503 # ^ ^^ ^ ^ ^ ^ ^ |
497 # 18 2223 27 32 36 42 46 | 504 # 18 2223 27 32 36 42 46 |
498 | 505 |
499 tokens = tokenizer.tokenize(data_source) | 506 tokens = tokenizer.tokenize(data_source) |
500 | 507 |
501 self.assertEqual( | 508 self.assertEqual( |
502 len(tokens), | 509 len(tokens), |
511 start = tok1[1] | 518 start = tok1[1] |
512 end = tok1[2] | 519 end = tok1[2] |
513 self.assertEqual( | 520 self.assertEqual( |
514 data, | 521 data, |
515 "AAAAA", | 522 "AAAAA", |
516 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( | 523 msg=( |
517 data | 524 "wrong data for token 1, expected: 'AAAAA', found: '{0}' " |
518 ), | 525 ).format(data), |
519 ) | 526 ) |
520 self.assertEqual( | 527 self.assertEqual( |
521 start, | 528 start, |
522 18, | 529 18, |
523 msg="wrong start frame for token 1, expected: 18, found: {0} ".format( | 530 msg=( |
524 start | 531 "wrong start frame for token 1, expected: 18, found: {0} " |
525 ), | 532 ).format(start), |
526 ) | 533 ) |
527 self.assertEqual( | 534 self.assertEqual( |
528 end, | 535 end, |
529 22, | 536 22, |
530 msg="wrong end frame for token 1, expected: 22, found: {0} ".format( | 537 msg=( |
531 end | 538 "wrong end frame for token 1, expected: 22, found: {0} " |
532 ), | 539 ).format(end), |
533 ) | 540 ) |
534 | 541 |
535 data = "".join(tok2[0]) | 542 data = "".join(tok2[0]) |
536 start = tok2[1] | 543 start = tok2[1] |
537 end = tok2[2] | 544 end = tok2[2] |
538 self.assertEqual( | 545 self.assertEqual( |
539 data, | 546 data, |
540 "AAAaa", | 547 "AAAaa", |
541 msg="wrong data for token 1, expected: 'AAAaa', found: '{0}' ".format( | 548 msg=( |
542 data | 549 "wrong data for token 1, expected: 'AAAaa', found: '{0}' " |
543 ), | 550 ).format(data), |
544 ) | 551 ) |
545 self.assertEqual( | 552 self.assertEqual( |
546 start, | 553 start, |
547 23, | 554 23, |
548 msg="wrong start frame for token 1, expected: 23, found: {0} ".format( | 555 msg=( |
549 start | 556 "wrong start frame for token 1, expected: 23, found: {0} " |
550 ), | 557 ).format(start), |
551 ) | 558 ) |
552 self.assertEqual( | 559 self.assertEqual( |
553 end, | 560 end, |
554 27, | 561 27, |
555 msg="wrong end frame for token 1, expected: 27, found: {0} ".format( | 562 msg=( |
556 end | 563 "wrong end frame for token 1, expected: 27, found: {0} " |
557 ), | 564 ).format(end), |
558 ) | 565 ) |
559 | 566 |
560 data = "".join(tok3[0]) | 567 data = "".join(tok3[0]) |
561 start = tok3[1] | 568 start = tok3[1] |
562 end = tok3[2] | 569 end = tok3[2] |
563 self.assertEqual( | 570 self.assertEqual( |
564 data, | 571 data, |
565 "AAAAA", | 572 "AAAAA", |
566 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( | 573 msg=( |
567 data | 574 "wrong data for token 1, expected: 'AAAAA', found: '{0}' " |
568 ), | 575 ).format(data), |
569 ) | 576 ) |
570 self.assertEqual( | 577 self.assertEqual( |
571 start, | 578 start, |
572 32, | 579 32, |
573 msg="wrong start frame for token 1, expected: 1, found: {0} ".format( | 580 msg=( |
574 start | 581 "wrong start frame for token 1, expected: 1, found: {0} " |
575 ), | 582 ).format(start), |
576 ) | 583 ) |
577 self.assertEqual( | 584 self.assertEqual( |
578 end, | 585 end, |
579 36, | 586 36, |
580 msg="wrong end frame for token 1, expected: 7, found: {0} ".format( | 587 msg=( |
581 end | 588 "wrong end frame for token 1, expected: 7, found: {0} " |
582 ), | 589 ).format(end), |
583 ) | 590 ) |
584 | 591 |
585 data = "".join(tok4[0]) | 592 data = "".join(tok4[0]) |
586 start = tok4[1] | 593 start = tok4[1] |
587 end = tok4[2] | 594 end = tok4[2] |
588 self.assertEqual( | 595 self.assertEqual( |
589 data, | 596 data, |
590 "AAaaA", | 597 "AAaaA", |
591 msg="wrong data for token 2, expected: 'AAaaA', found: '{0}' ".format( | 598 msg=( |
592 data | 599 "wrong data for token 2, expected: 'AAaaA', found: '{0}' " |
593 ), | 600 ).format(data), |
594 ) | 601 ) |
595 self.assertEqual( | 602 self.assertEqual( |
596 start, | 603 start, |
597 42, | 604 42, |
598 msg="wrong start frame for token 2, expected: 17, found: {0} ".format( | 605 msg=( |
599 start | 606 "wrong start frame for token 2, expected: 17, found: {0} " |
600 ), | 607 ).format(start), |
601 ) | 608 ) |
602 self.assertEqual( | 609 self.assertEqual( |
603 end, | 610 end, |
604 46, | 611 46, |
605 msg="wrong end frame for token 2, expected: 22, found: {0} ".format( | 612 msg=( |
606 end | 613 "wrong end frame for token 2, expected: 22, found: {0} " |
607 ), | 614 ).format(end), |
608 ) | 615 ) |
609 | 616 |
610 | 617 |
611 class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase): | 618 class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase): |
612 def setUp(self): | 619 def setUp(self): |
643 start = tok1[1] | 650 start = tok1[1] |
644 end = tok1[2] | 651 end = tok1[2] |
645 self.assertEqual( | 652 self.assertEqual( |
646 data, | 653 data, |
647 "AAAAA", | 654 "AAAAA", |
648 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( | 655 msg=( |
649 data | 656 "wrong data for token 1, expected: 'AAAAA', found: '{0}' " |
650 ), | 657 ).format(data), |
651 ) | 658 ) |
652 self.assertEqual( | 659 self.assertEqual( |
653 start, | 660 start, |
654 3, | 661 3, |
655 msg="wrong start frame for token 1, expected: 3, found: {0} ".format( | 662 msg=( |
656 start | 663 "wrong start frame for token 1, expected: 3, found: {0} " |
657 ), | 664 ).format(start), |
658 ) | 665 ) |
659 self.assertEqual( | 666 self.assertEqual( |
660 end, | 667 end, |
661 7, | 668 7, |
662 msg="wrong end frame for token 1, expected: 7, found: {0} ".format( | 669 msg=( |
663 end | 670 "wrong end frame for token 1, expected: 7, found: {0} " |
664 ), | 671 ).format(end), |
665 ) | 672 ) |
666 | 673 |
667 data = "".join(tok2[0]) | 674 data = "".join(tok2[0]) |
668 start = tok2[1] | 675 start = tok2[1] |
669 end = tok2[2] | 676 end = tok2[2] |
670 self.assertEqual( | 677 self.assertEqual( |
671 data, | 678 data, |
672 "AAAAAA", | 679 "AAAAAA", |
673 msg="wrong data for token 1, expected: 'AAAAAA', found: '{0}' ".format( | 680 msg=( |
674 data | 681 "wrong data for token 1, expected: 'AAAAAA', found: '{0}' " |
675 ), | 682 ).format(data), |
676 ) | 683 ) |
677 self.assertEqual( | 684 self.assertEqual( |
678 start, | 685 start, |
679 9, | 686 9, |
680 msg="wrong start frame for token 1, expected: 9, found: {0} ".format( | 687 msg=( |
681 start | 688 "wrong start frame for token 1, expected: 9, found: {0} " |
682 ), | 689 ).format(start), |
683 ) | 690 ) |
684 self.assertEqual( | 691 self.assertEqual( |
685 end, | 692 end, |
686 14, | 693 14, |
687 msg="wrong end frame for token 1, expected: 14, found: {0} ".format( | 694 msg=( |
688 end | 695 "wrong end frame for token 1, expected: 14, found: {0} " |
689 ), | 696 ).format(end), |
690 ) | 697 ) |
691 | 698 |
692 data = "".join(tok3[0]) | 699 data = "".join(tok3[0]) |
693 start = tok3[1] | 700 start = tok3[1] |
694 end = tok3[2] | 701 end = tok3[2] |
695 self.assertEqual( | 702 self.assertEqual( |
696 data, | 703 data, |
697 "AAAAAAAAA", | 704 "AAAAAAAAA", |
698 msg="wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' ".format( | 705 msg=( |
699 data | 706 "wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' " |
700 ), | 707 ).format(data), |
701 ) | 708 ) |
702 self.assertEqual( | 709 self.assertEqual( |
703 start, | 710 start, |
704 17, | 711 17, |
705 msg="wrong start frame for token 1, expected: 17, found: {0} ".format( | 712 msg=( |
706 start | 713 "wrong start frame for token 1, expected: 17, found: {0} " |
707 ), | 714 ).format(start), |
708 ) | 715 ) |
709 self.assertEqual( | 716 self.assertEqual( |
710 end, | 717 end, |
711 25, | 718 25, |
712 msg="wrong end frame for token 1, expected: 25, found: {0} ".format( | 719 msg=( |
713 end | 720 "wrong end frame for token 1, expected: 25, found: {0} " |
714 ), | 721 ).format(end), |
715 ) | 722 ) |
716 | 723 |
717 def test_min_5_max_10_max_continuous_silence_1(self): | 724 def test_min_5_max_10_max_continuous_silence_1(self): |
718 | 725 |
719 tokenizer = StreamTokenizer( | 726 tokenizer = StreamTokenizer( |
746 start = tok1[1] | 753 start = tok1[1] |
747 end = tok1[2] | 754 end = tok1[2] |
748 self.assertEqual( | 755 self.assertEqual( |
749 data, | 756 data, |
750 "AAAAAaAAAA", | 757 "AAAAAaAAAA", |
751 msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format( | 758 msg=( |
752 data | 759 "wrong data for token 1, expected: 'AAAAAaAAAA', " |
753 ), | 760 "found: '{0}' " |
761 ).format(data), | |
754 ) | 762 ) |
755 self.assertEqual( | 763 self.assertEqual( |
756 start, | 764 start, |
757 3, | 765 3, |
758 msg="wrong start frame for token 1, expected: 3, found: {0} ".format( | 766 msg=( |
759 start | 767 "wrong start frame for token 1, expected: 3, found: {0} " |
760 ), | 768 ).format(start), |
761 ) | 769 ) |
762 self.assertEqual( | 770 self.assertEqual( |
763 end, | 771 end, |
764 12, | 772 12, |
765 msg="wrong end frame for token 1, expected: 10, found: {0} ".format( | 773 msg=( |
766 end | 774 "wrong end frame for token 1, expected: 10, found: {0} " |
767 ), | 775 ).format(end), |
768 ) | 776 ) |
769 | 777 |
770 data = "".join(tok2[0]) | 778 data = "".join(tok2[0]) |
771 start = tok2[1] | 779 start = tok2[1] |
772 end = tok2[2] | 780 end = tok2[2] |
773 self.assertEqual( | 781 self.assertEqual( |
774 data, | 782 data, |
775 "AAa", | 783 "AAa", |
776 msg="wrong data for token 1, expected: 'AAa', found: '{0}' ".format( | 784 msg=( |
777 data | 785 "wrong data for token 1, expected: 'AAa', found: '{0}' " |
778 ), | 786 ).format(data), |
779 ) | 787 ) |
780 self.assertEqual( | 788 self.assertEqual( |
781 start, | 789 start, |
782 13, | 790 13, |
783 msg="wrong start frame for token 1, expected: 9, found: {0} ".format( | 791 msg=( |
784 start | 792 "wrong start frame for token 1, expected: 9, found: {0} " |
785 ), | 793 ).format(start), |
786 ) | 794 ) |
787 self.assertEqual( | 795 self.assertEqual( |
788 end, | 796 end, |
789 15, | 797 15, |
790 msg="wrong end frame for token 1, expected: 14, found: {0} ".format( | 798 msg=( |
791 end | 799 "wrong end frame for token 1, expected: 14, found: {0} " |
792 ), | 800 ).format(end), |
793 ) | 801 ) |
794 | 802 |
795 data = "".join(tok3[0]) | 803 data = "".join(tok3[0]) |
796 start = tok3[1] | 804 start = tok3[1] |
797 end = tok3[2] | 805 end = tok3[2] |
798 self.assertEqual( | 806 self.assertEqual( |
799 data, | 807 data, |
800 "AAAAAAAAAa", | 808 "AAAAAAAAAa", |
801 msg="wrong data for token 1, expected: 'AAAAAAAAAa', found: '{0}' ".format( | 809 msg=( |
802 data | 810 "wrong data for token 1, expected: 'AAAAAAAAAa', " |
803 ), | 811 "found: '{0}' " |
812 ).format(data), | |
804 ) | 813 ) |
805 self.assertEqual( | 814 self.assertEqual( |
806 start, | 815 start, |
807 17, | 816 17, |
808 msg="wrong start frame for token 1, expected: 17, found: {0} ".format( | 817 msg=( |
809 start | 818 "wrong start frame for token 1, expected: 17, found: {0} " |
810 ), | 819 ).format(start), |
811 ) | 820 ) |
812 self.assertEqual( | 821 self.assertEqual( |
813 end, | 822 end, |
814 26, | 823 26, |
815 msg="wrong end frame for token 1, expected: 26, found: {0} ".format( | 824 msg=( |
816 end | 825 "wrong end frame for token 1, expected: 26, found: {0} " |
817 ), | 826 ).format(end), |
818 ) | 827 ) |
819 | 828 |
820 | 829 |
821 class TestStreamTokenizerModes(unittest.TestCase): | 830 class TestStreamTokenizerModes(unittest.TestCase): |
822 def setUp(self): | 831 def setUp(self): |
853 start = tok1[1] | 862 start = tok1[1] |
854 end = tok1[2] | 863 end = tok1[2] |
855 self.assertEqual( | 864 self.assertEqual( |
856 data, | 865 data, |
857 "AAAAAAAA", | 866 "AAAAAAAA", |
858 msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format( | 867 msg=( |
859 data | 868 "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' " |
860 ), | 869 ).format(data), |
861 ) | 870 ) |
862 self.assertEqual( | 871 self.assertEqual( |
863 start, | 872 start, |
864 2, | 873 2, |
865 msg="wrong start frame for token 1, expected: 2, found: {0} ".format( | 874 msg=( |
866 start | 875 "wrong start frame for token 1, expected: 2, found: {0} " |
867 ), | 876 ).format(start), |
868 ) | 877 ) |
869 self.assertEqual( | 878 self.assertEqual( |
870 end, | 879 end, |
871 9, | 880 9, |
872 msg="wrong end frame for token 1, expected: 9, found: {0} ".format( | 881 msg=( |
873 end | 882 "wrong end frame for token 1, expected: 9, found: {0} " |
874 ), | 883 ).format(end), |
875 ) | 884 ) |
876 | 885 |
877 def test_DROP_TAILING_SILENCE(self): | 886 def test_DROP_TAILING_SILENCE(self): |
878 | 887 |
879 tokenizer = StreamTokenizer( | 888 tokenizer = StreamTokenizer( |
905 start = tok1[1] | 914 start = tok1[1] |
906 end = tok1[2] | 915 end = tok1[2] |
907 self.assertEqual( | 916 self.assertEqual( |
908 data, | 917 data, |
909 "AAAAA", | 918 "AAAAA", |
910 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format( | 919 msg=( |
911 data | 920 "wrong data for token 1, expected: 'AAAAA', found: '{0}' " |
912 ), | 921 ).format(data), |
913 ) | 922 ) |
914 self.assertEqual( | 923 self.assertEqual( |
915 start, | 924 start, |
916 2, | 925 2, |
917 msg="wrong start frame for token 1, expected: 2, found: {0} ".format( | 926 msg=( |
918 start | 927 "wrong start frame for token 1, expected: 2, found: {0} " |
919 ), | 928 ).format(start), |
920 ) | 929 ) |
921 self.assertEqual( | 930 self.assertEqual( |
922 end, | 931 end, |
923 6, | 932 6, |
924 msg="wrong end frame for token 1, expected: 6, found: {0} ".format( | 933 msg=( |
925 end | 934 "wrong end frame for token 1, expected: 6, found: {0} " |
926 ), | 935 ).format(end), |
927 ) | 936 ) |
928 | 937 |
929 def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): | 938 def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): |
930 | 939 |
931 tokenizer = StreamTokenizer( | 940 tokenizer = StreamTokenizer( |
958 start = tok1[1] | 967 start = tok1[1] |
959 end = tok1[2] | 968 end = tok1[2] |
960 self.assertEqual( | 969 self.assertEqual( |
961 data, | 970 data, |
962 "AAAAAAAA", | 971 "AAAAAAAA", |
963 msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format( | 972 msg=( |
964 data | 973 "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' " |
965 ), | 974 ).format(data), |
966 ) | 975 ) |
967 self.assertEqual( | 976 self.assertEqual( |
968 start, | 977 start, |
969 2, | 978 2, |
970 msg="wrong start frame for token 1, expected: 2, found: {0} ".format( | 979 msg=( |
971 start | 980 "wrong start frame for token 1, expected: 2, found: {0} " |
972 ), | 981 ).format(start), |
973 ) | 982 ) |
974 self.assertEqual( | 983 self.assertEqual( |
975 end, | 984 end, |
976 9, | 985 9, |
977 msg="wrong end frame for token 1, expected: 9, found: {0} ".format( | 986 msg=( |
978 end | 987 "wrong end frame for token 1, expected: 9, found: {0} " |
979 ), | 988 ).format(end), |
980 ) | 989 ) |
981 | 990 |
982 | 991 |
983 class TestStreamTokenizerCallback(unittest.TestCase): | 992 class TestStreamTokenizerCallback(unittest.TestCase): |
984 def setUp(self): | 993 def setUp(self): |