"""
@author: Amine Sehili <amine.sehili@gmail.com>
September 2015

"""

import unittest
from auditok import StreamTokenizer, StringDataSource, DataValidator


class AValidator(DataValidator):
    def is_valid(self, frame):
        return frame == "A"


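# A minimal usage sketch (illustrative only, not part of the test suite):
# tokenize() returns a list of (data, start, end) tuples, where data is
# the list of frames making up the token and start/end are its inclusive
# frame indices in the stream.
def _example_usage():
    tokenizer = StreamTokenizer(
        AValidator(),
        min_length=3,
        max_length=10,
        max_continuous_silence=1,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )
    data_source = StringDataSource("aaAAAaaAAAaa")
    for data, start, end in tokenizer.tokenize(data_source):
        print("".join(data), start, end)

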
class TestStreamTokenizerInitParams(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    # Completely deactivate init_min and init_max_silence:
    # the tokenizer will rely only on the other parameters.
    # Note that if init_min == 0, the value of init_max_silence
    # has no effect.
    def test_init_min_0_init_max_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        #                                ^              ^   ^      ^
        #                                1             16  20     27
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        # tok1[0]: data
        # tok1[1]: start frame (included)
        # tok1[2]: end frame (included)

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaaaa",
            msg=(
                "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            1,
            msg=(
                "wrong start frame for token 1, expected: 1, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            16,
            msg=(
                "wrong end frame for token 1, expected: 16, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg=(
                "wrong data for token 2, expected: 'AAAAAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            20,
            msg=(
                "wrong start frame for token 2, expected: 20, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            27,
            msg=(
                "wrong end frame for token 2, expected: 27, found: {0} "
            ).format(end),
        )

    # A token is started only after the tokenizer encounters at least
    # 3 valid frames (init_min = 3) with at most 0 consecutive
    # non-valid frames between them (init_max_silence = 0).
    # In other words, a valid token must start with 3 consecutive
    # valid frames.
    def test_init_min_3_init_max_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA"
        )
        #                      ^           ^  ^   ^
        #                     18          30 33  37

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaaaa",
            msg=(
                "wrong data for token 1, expected: 'AAAAAAAAAaaaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            18,
            msg=(
                "wrong start frame for token 1, expected: 18, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            30,
            msg=(
                "wrong end frame for token 1, expected: 30, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 2, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            33,
            msg=(
                "wrong start frame for token 2, expected: 33, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            37,
            msg=(
                "wrong end frame for token 2, expected: 37, found: {0} "
            ).format(end),
        )

    # A token is started only after the tokenizer encounters at least
    # 3 valid frames (init_min = 3) with at most 2 consecutive
    # non-valid frames between them (init_max_silence = 2).
    def test_init_min_3_init_max_silence_2(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=2,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA"
        )
        #         ^          ^  ^           ^   ^   ^
        #         5         16 19          31  35  39
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaAaaAaAaaaa",
            msg=(
                "wrong data for token 1, expected: 'AaAaaAaAaaaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            5,
            msg=(
                "wrong start frame for token 1, expected: 5, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            16,
            msg=(
                "wrong end frame for token 1, expected: 16, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaaaa",
            msg=(
                "wrong data for token 2, expected: 'AAAAAAAAAaaaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            19,
            msg=(
                "wrong start frame for token 2, expected: 19, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            31,
            msg=(
                "wrong end frame for token 2, expected: 31, found: {0} "
            ).format(end),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 3, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            35,
            msg=(
                "wrong start frame for token 3, expected: 35, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            39,
            msg=(
                "wrong end frame for token 3, expected: 39, found: {0} "
            ).format(end),
        )


class TestStreamTokenizerMinMaxLength(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    def test_min_length_6_max_length_20(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=6,
            max_length=20,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        #                                ^            ^   ^         ^
        #                                1           14  18        28

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaa",
            msg=(
                "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            1,
            msg=(
                "wrong start frame for token 1, expected: 1, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            14,
            msg=(
                "wrong end frame for token 1, expected: 14, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAAAAAaa",
            msg=(
                "wrong data for token 2, expected: 'AAAAAAAAAaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            18,
            msg=(
                "wrong start frame for token 2, expected: 18, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            28,
            msg=(
                "wrong end frame for token 2, expected: 28, found: {0} "
            ).format(end),
        )

    def test_min_length_1_max_length_1(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=1,
            max_length=1,
            max_continuous_silence=0,
            init_min=0,
            init_max_silence=0,
            mode=0,
        )

        data_source = StringDataSource(
            "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA"
        )

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            21,
            msg="wrong number of tokens, expected: 21, found: {0} ".format(
                len(tokens)
            ),
        )

    def test_min_length_10_max_length_20(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=10,
            max_length=20,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
        )
        #     ^              ^             ^            ^
        #     1             16            30           43

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2 = tokens[0], tokens[1]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AaaaAaAaaAaAaaaa",
            msg=(
                "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            1,
            msg=(
                "wrong start frame for token 1, expected: 1, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            16,
            msg=(
                "wrong end frame for token 1, expected: 16, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAaaAAaaAAA",
            msg=(
                "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            30,
            msg=(
                "wrong start frame for token 2, expected: 30, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            43,
            msg=(
                "wrong end frame for token 2, expected: 43, found: {0} "
            ).format(end),
        )

    def test_min_length_4_max_length_5(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=4,
            max_length=5,
            max_continuous_silence=4,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource(
            "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
        )
        #                      ^   ^^   ^    ^   ^     ^   ^
        #                     18  2223 27   32  36    42  46

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            4,
            msg="wrong number of tokens, expected: 4, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            18,
            msg=(
                "wrong start frame for token 1, expected: 18, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            22,
            msg=(
                "wrong end frame for token 1, expected: 22, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAaa",
            msg=(
                "wrong data for token 2, expected: 'AAAaa', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            23,
            msg=(
                "wrong start frame for token 2, expected: 23, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            27,
            msg=(
                "wrong end frame for token 2, expected: 27, found: {0} "
            ).format(end),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 3, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            32,
            msg=(
                "wrong start frame for token 3, expected: 32, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            36,
            msg=(
                "wrong end frame for token 3, expected: 36, found: {0} "
            ).format(end),
        )

        data = "".join(tok4[0])
        start = tok4[1]
        end = tok4[2]
        self.assertEqual(
            data,
            "AAaaA",
            msg=(
                "wrong data for token 4, expected: 'AAaaA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            42,
            msg=(
                "wrong start frame for token 4, expected: 42, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            46,
            msg=(
                "wrong end frame for token 4, expected: 46, found: {0} "
            ).format(end),
        )


class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

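    # Note: max_continuous_silence caps the number of consecutive
    # non-valid frames a token may contain; with a value of 0, a single
    # non-valid frame ends the current token.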
    def test_min_5_max_10_max_continuous_silence_0(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=0,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^   ^ ^    ^  ^       ^
        #                                  3   7 9   14 17      25

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            3,
            msg=(
                "wrong start frame for token 1, expected: 3, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            7,
            msg=(
                "wrong end frame for token 1, expected: 7, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAAAAA",
            msg=(
                "wrong data for token 2, expected: 'AAAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            9,
            msg=(
                "wrong start frame for token 2, expected: 9, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            14,
            msg=(
                "wrong end frame for token 2, expected: 14, found: {0} "
            ).format(end),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAAAAAA",
            msg=(
                "wrong data for token 3, expected: 'AAAAAAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            17,
            msg=(
                "wrong start frame for token 3, expected: 17, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            25,
            msg=(
                "wrong end frame for token 3, expected: 25, found: {0} "
            ).format(end),
        )

    def test_min_5_max_10_max_continuous_silence_1(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=1,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^        ^^ ^ ^        ^
        #                                  3       12131517      26
        #                                          (12 13 15 17)

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            3,
            msg="wrong number of tokens, expected: 3, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAaAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAAaAAAA', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            3,
            msg=(
                "wrong start frame for token 1, expected: 3, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            12,
            msg=(
                "wrong end frame for token 1, expected: 12, found: {0} "
            ).format(end),
        )

        data = "".join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(
            data,
            "AAa",
            msg=(
                "wrong data for token 2, expected: 'AAa', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            13,
            msg=(
                "wrong start frame for token 2, expected: 13, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            15,
            msg=(
                "wrong end frame for token 2, expected: 15, found: {0} "
            ).format(end),
        )

        data = "".join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(
            data,
            "AAAAAAAAAa",
            msg=(
                "wrong data for token 3, expected: 'AAAAAAAAAa', "
                "found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            17,
            msg=(
                "wrong start frame for token 3, expected: 17, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            26,
            msg=(
                "wrong end frame for token 3, expected: 26, found: {0} "
            ).format(end),
        )


class TestStreamTokenizerModes(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

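    # The mode flags exercised below can be combined with bitwise OR:
    # STRICT_MIN_LENGTH drops the continuation of a token truncated at
    # max_length if it is shorter than min_length, and
    # DROP_TRAILING_SILENCE strips trailing non-valid frames from a
    # delivered token.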
    def test_STRICT_MIN_LENGTH(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        #                                 ^      ^
        #                                 2      9

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            2,
            msg=(
                "wrong start frame for token 1, expected: 2, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            9,
            msg=(
                "wrong end frame for token 1, expected: 9, found: {0} "
            ).format(end),
        )

    def test_DROP_TRAILING_SILENCE(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=10,
            max_continuous_silence=2,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAaaaaa")
        #                                 ^   ^
        #                                 2   6

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            2,
            msg=(
                "wrong start frame for token 1, expected: 2, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            6,
            msg=(
                "wrong end frame for token 1, expected: 6, found: {0} "
            ).format(end),
        )

    def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(self):

        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=StreamTokenizer.STRICT_MIN_LENGTH
            | StreamTokenizer.DROP_TRAILING_SILENCE,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        #                                 ^      ^
        #                                 2      9

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(
            len(tokens),
            1,
            msg="wrong number of tokens, expected: 1, found: {0} ".format(
                len(tokens)
            ),
        )
        tok1 = tokens[0]

        data = "".join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(
            data,
            "AAAAAAAA",
            msg=(
                "wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' "
            ).format(data),
        )
        self.assertEqual(
            start,
            2,
            msg=(
                "wrong start frame for token 1, expected: 2, found: {0} "
            ).format(start),
        )
        self.assertEqual(
            end,
            9,
            msg=(
                "wrong end frame for token 1, expected: 9, found: {0} "
            ).format(end),
        )


class TestStreamTokenizerCallback(unittest.TestCase):
    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):

        tokens = []

        def callback(data, start, end):
            tokens.append((data, start, end))

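        # tokenize() will invoke the callback once per token with
        # (data, start, end); the callback collects the tokens into a
        # list so the test can count them.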
        tokenizer = StreamTokenizer(
            self.A_validator,
            min_length=5,
            max_length=8,
            max_continuous_silence=3,
            init_min=3,
            init_max_silence=3,
            mode=0,
        )

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        #                                 ^      ^^   ^
        #                                 2      910  14

        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(
            len(tokens),
            2,
            msg="wrong number of tokens, expected: 2, found: {0} ".format(
                len(tokens)
            ),
        )


if __name__ == "__main__":
    unittest.main()