tests/test_StreamTokenizer.py @ 314:12a030453422

author Amine Sehili <amine.sehili@gmail.com>
date Mon, 14 Oct 2019 20:25:12 +0100
"""
@author: Amine Sehili <amine.sehili@gmail.com>
September 2015

"""

import unittest
from auditok import StreamTokenizer, StringDataSource, DataValidator


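# The tokenizer classifies every frame it reads from the data source by
# calling the validator's is_valid() method: frames for which it returns
# True are treated as valid data (here, the single character "A"), all
# other frames are treated as silence.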
class AValidator(DataValidator):
    def is_valid(self, frame):
        return frame == "A"


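# A minimal usage sketch (illustrative only, not collected by unittest):
# tokenize() returns a list of tokens, each a tuple of
# (data, start_frame, end_frame), with both frame indices inclusive.
def _example_usage():
    tokenizer = StreamTokenizer(
        AValidator(),
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )
    for data, start, end in tokenizer.tokenize(StringDataSource("aaAAAAAaa")):
        print("".join(data), start, end)

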
class TestStreamTokenizerInitParams(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    # Completely deactivate init_min and init_max_silence.
    # The tokenizer will only rely on the other parameters.
    # Note that if init_min == 0, the value of init_max_silence
    # has no effect.
    def test_init_min_0_init_max_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        # expected tokens: frames 1..16 and frames 20..27
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        # tok1[0]: data
        # tok1[1]: start frame (included)
        # tok1[2]: end frame (included)

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaaaa",
            msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            1,
            msg="wrong start frame for token 1, expected: 1, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            16,
            msg="wrong end frame for token 1, expected: 16, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg="wrong data for token 2, expected: 'AAAAAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            20,
            msg="wrong start frame for token 2, expected: 20, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            27,
            msg="wrong end frame for token 2, expected: 27, found: {0}".format(
                end
            ),
        )

    # A token is considered valid only if the tokenizer encounters at
    # least 3 valid frames (init_min = 3) with at most 0 consecutive
    # non-valid frames between them (init_max_silence = 0).
    # In other words, a valid token must start with 3 consecutive
    # valid frames.
    def test_init_min_3_init_max_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA"
        )
        # expected tokens: frames 18..30 and frames 33..37

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaaaa",
            msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            18,
            msg="wrong start frame for token 1, expected: 18, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            30,
            msg="wrong end frame for token 1, expected: 30, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 2, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            33,
            msg="wrong start frame for token 2, expected: 33, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            37,
            msg="wrong end frame for token 2, expected: 37, found: {0}".format(
                end
            ),
        )

    # A token is considered valid only if the tokenizer encounters at
    # least 3 valid frames (init_min = 3) with at most 2 consecutive
    # non-valid frames between them (init_max_silence = 2).
    def test_init_min_3_init_max_silence_2(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=2,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA"
        )
        # expected tokens: frames 5..16, frames 19..31 and frames 35..39
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaAaaAaAaaaa",
            msg="wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            5,
            msg="wrong start frame for token 1, expected: 5, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            16,
            msg="wrong end frame for token 1, expected: 16, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaaaa",
            msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            19,
            msg="wrong start frame for token 2, expected: 19, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            31,
            msg="wrong end frame for token 2, expected: 31, found: {0}".format(
                end
            ),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            35,
            msg="wrong start frame for token 3, expected: 35, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            39,
            msg="wrong end frame for token 3, expected: 39, found: {0}".format(
                end
            ),
        )


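# min_length and max_length bound token size in frames: a candidate token
# shorter than min_length is discarded, while a token that reaches
# max_length frames is delivered immediately and tokenization resumes at
# the very next frame.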
class TestStreamTokenizerMinMaxLength(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    def test_min_length_6_max_length_20(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=6,
            max_length=20,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        # expected tokens: frames 1..14 and frames 18..28

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaa",
            msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            1,
            msg="wrong start frame for token 1, expected: 1, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            14,
            msg="wrong end frame for token 1, expected: 14, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaa",
            msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            18,
            msg="wrong start frame for token 2, expected: 18, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            28,
            msg="wrong end frame for token 2, expected: 28, found: {0}".format(
                end
            ),
        )

    def test_min_length_1_max_length_1(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=1,
            max_length=1,
            max_continuous_silence=0,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA"
        )

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            21,
            msg="wrong number of tokens, expected: 21, found: {0}".format(
                len(tokens)
            ),
        )

    def test_min_length_10_max_length_20(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=10,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
        )
        # expected tokens: frames 1..16 and frames 30..43

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaaaa",
            msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            1,
            msg="wrong start frame for token 1, expected: 1, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            16,
            msg="wrong end frame for token 1, expected: 16, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAaaAAaaAAA",
            msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            30,
            msg="wrong start frame for token 2, expected: 30, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            43,
            msg="wrong end frame for token 2, expected: 43, found: {0}".format(
                end
            ),
        )

    def test_min_length_4_max_length_5(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=4,
            max_length=5,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
        )
        # expected tokens: frames 18..22, 23..27, 32..36 and 42..46

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            4,
            msg="wrong number of tokens, expected: 4, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            18,
            msg="wrong start frame for token 1, expected: 18, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            22,
            msg="wrong end frame for token 1, expected: 22, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAaa",
            msg="wrong data for token 2, expected: 'AAAaa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            23,
            msg="wrong start frame for token 2, expected: 23, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            27,
            msg="wrong end frame for token 2, expected: 27, found: {0}".format(
                end
            ),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            32,
            msg="wrong start frame for token 3, expected: 32, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            36,
            msg="wrong end frame for token 3, expected: 36, found: {0}".format(
                end
            ),
        )

        data = "".join(tok4[0])
        start = tok4[1]
        end = tok4[2]
        self.assertEqual(
            data,
            "AAaaA",
            msg="wrong data for token 4, expected: 'AAaaA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            42,
            msg="wrong start frame for token 4, expected: 42, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            46,
            msg="wrong end frame for token 4, expected: 46, found: {0}".format(
                end
            ),
        )


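# max_continuous_silence is the number of consecutive non-valid frames
# tolerated inside a token: with 0, the first silent frame closes the
# current token at the last valid frame; with n > 0, up to n trailing
# silent frames may be kept in the delivered token.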
class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    def test_min_5_max_10_max_continuous_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=0,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # expected tokens: frames 3..7, frames 9..14 and frames 17..25

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            3,
            msg="wrong start frame for token 1, expected: 3, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            7,
            msg="wrong end frame for token 1, expected: 7, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAA",
            msg="wrong data for token 2, expected: 'AAAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            9,
            msg="wrong start frame for token 2, expected: 9, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            14,
            msg="wrong end frame for token 2, expected: 14, found: {0}".format(
                end
            ),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAAAAAA",
            msg="wrong data for token 3, expected: 'AAAAAAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            17,
            msg="wrong start frame for token 3, expected: 17, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            25,
            msg="wrong end frame for token 3, expected: 25, found: {0}".format(
                end
            ),
        )

    def test_min_5_max_10_max_continuous_silence_1(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=1,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # expected tokens: frames 3..12, frames 13..15 and frames 17..26

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0}".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAaAAAA",
            msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            3,
            msg="wrong start frame for token 1, expected: 3, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            12,
            msg="wrong end frame for token 1, expected: 12, found: {0}".format(
                end
            ),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAa",
            msg="wrong data for token 2, expected: 'AAa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            13,
            msg="wrong start frame for token 2, expected: 13, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            15,
            msg="wrong end frame for token 2, expected: 15, found: {0}".format(
                end
            ),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAAAAAAa",
            msg="wrong data for token 3, expected: 'AAAAAAAAAa', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            17,
            msg="wrong start frame for token 3, expected: 17, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            26,
            msg="wrong end frame for token 3, expected: 26, found: {0}".format(
                end
            ),
        )


class TestStreamTokenizerModes(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

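    # By default, a token cut off at max_length is delivered and the frames
    # that immediately follow it may form a new token even if fewer than
    # min_length of them remain (see the 3-frame token "AAa" in
    # test_min_5_max_10_max_continuous_silence_1 above). With
    # STRICT_MIN_LENGTH, such a short contiguous follow-up token is
    # discarded: the 12 A's below yield one 8-frame token (frames 2..9)
    # and the remaining 4 A's are dropped because 4 < min_length = 5.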
    def test_STRICT_MIN_LENGTH(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        # expected token: frames 2..9

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0}".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            2,
            msg="wrong start frame for token 1, expected: 2, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            9,
            msg="wrong end frame for token 1, expected: 9, found: {0}".format(
                end
            ),
        )

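    # Without DROP_TRAILING_SILENCE, up to max_continuous_silence (here 2)
    # trailing non-valid frames would be kept in the token (giving
    # "AAAAAaa", frames 2..8). With the flag set, trailing silence is
    # dropped and the token ends at the last valid frame.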
    def test_DROP_TRAILING_SILENCE(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAaaaaa")
        # expected token: frames 2..6

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0}".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            2,
            msg="wrong start frame for token 1, expected: 2, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            6,
            msg="wrong end frame for token 1, expected: 6, found: {0}".format(
                end
            ),
        )

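    # The mode flags are bit masks and combine with bitwise OR: the first
    # token is truncated at max_length = 8 (frames 2..9) and the 4-frame
    # follow-up token is discarded by STRICT_MIN_LENGTH, so no token is
    # left for DROP_TRAILING_SILENCE to trim.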
    def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH
            | StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        # expected token: frames 2..9

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0}".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(
                data
            ),
        )
        self.assertEqual(
            start,
            2,
            msg="wrong start frame for token 1, expected: 2, found: {0}".format(
                start
            ),
        )
        self.assertEqual(
            end,
            9,
            msg="wrong end frame for token 1, expected: 9, found: {0}".format(
                end
            ),
        )


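# tokenize() also accepts a callback; when one is given, each detected
# token is passed to callback(data, start, end), and the test below
# gathers its tokens through the callback rather than from the return
# value.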
class TestStreamTokenizerCallback(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):

        tokens = []

        def callback(data, start, end):
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        # expected tokens: frames 2..9 and frames 10..14

        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0}".format(
                len(tokens)
            ),
        )


if __name__ == "__main__":
    # import sys;sys.argv = ['', 'Test.testName']
    unittest.main()