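"""Tests for auditok's StreamTokenizer.

Frames come from a StringDataSource, one character per frame; AValidator
treats 'A' as a valid (non-silent) frame and everything else as silence.
Tokens are returned as (data, start_frame, end_frame) triples with 0-based,
inclusive frame indices. The caret/number comments under each input string
mark the expected token boundaries.
"""
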
import pytest

from auditok import DataValidator, StreamTokenizer, StringDataSource


class AValidator(DataValidator):
    def is_valid(self, frame):
        return frame == "A"


@pytest.fixture
def validator():
    return AValidator()


def test_init_min_0_init_max_silence_0(validator):
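    """init_min=0 and init_max_silence=0: a token may start on the very
    first valid frame, and max_continuous_silence=4 keeps up to four
    consecutive silent frames inside a token."""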
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
    #                               ^              ^   ^      ^
    #                               1              16  20     27
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 2, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 20
    ), "wrong start frame for token 2, expected: 20, found: {}".format(start)
    assert (
        end == 27
    ), "wrong end frame for token 2, expected: 27, found: {}".format(end)


def test_init_min_3_init_max_silence_0(validator):
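    """init_min=3 with init_max_silence=0 requires three consecutive
    valid frames to confirm a token start, so the sparse leading 'A's
    never open a token; the first token starts at the 'AAAAAAAAA' run."""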
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=0,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
    #                                                 ^           ^  ^   ^
    #                                                 18          30 33  37

    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), "wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: {}".format(
        data
    )
    assert (
        start == 18
    ), "wrong start frame for token 1, expected: 18, found: {}".format(start)
    assert (
        end == 30
    ), "wrong end frame for token 1, expected: 30, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 2, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 33
    ), "wrong start frame for token 2, expected: 33, found: {}".format(start)
    assert (
        end == 37
    ), "wrong end frame for token 2, expected: 37, found: {}".format(end)


def test_init_min_3_init_max_silence_2(validator):
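    """init_max_silence=2 tolerates up to two consecutive silent frames
    while the init_min=3 initial valid frames accumulate, so a token can
    already start in the sparse 'aAaaaA...' region, at frame 5."""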
    tokenizer = StreamTokenizer(
        validator,
        min_length=5,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=2,
        mode=0,
    )

    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
    #                                    ^          ^  ^           ^   ^   ^
    #                                    5          16 19          31  35  39
    tokens = tokenizer.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 5
    ), "wrong start frame for token 1, expected: 5, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaaaa"
    ), "wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: {}".format(
        data
    )
    assert (
        start == 19
    ), "wrong start frame for token 2, expected: 19, found: {}".format(start)
    assert (
        end == 31
    ), "wrong end frame for token 2, expected: 31, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 35
    ), "wrong start frame for token 3, expected: 35, found: {}".format(start)
    assert (
        end == 39
    ), "wrong end frame for token 3, expected: 39, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length(validator):
    return StreamTokenizer(
        validator,
        min_length=6,
        max_length=20,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_6_init_max_length_20(tokenizer_min_max_length):
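    """min_length=6 rejects the trailing 5-frame 'AAAAA' run, and
    max_continuous_silence=2 means each kept token carries at most two
    trailing silent frames."""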
    data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
    #                               ^            ^   ^         ^
    #                               1            14  18        28

    tokens = tokenizer_min_max_length.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 14
    ), "wrong end frame for token 1, expected: 14, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAAAAAaa"
    ), "wrong data for token 2, expected: 'AAAAAAAAAaa', found: {}".format(data)
    assert (
        start == 18
    ), "wrong start frame for token 2, expected: 18, found: {}".format(start)
    assert (
        end == 28
    ), "wrong end frame for token 2, expected: 28, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length_1_1(validator):
    return StreamTokenizer(
        validator,
        min_length=1,
        max_length=1,
        max_continuous_silence=0,
        init_min=0,
        init_max_silence=0,
        mode=0,
    )


def test_min_length_1_init_max_length_1(tokenizer_min_max_length_1_1):
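    """min_length=max_length=1 with no tolerated silence turns every
    valid frame into its own single-frame token: the input holds 21 'A's,
    hence 21 tokens."""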
    data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")

    tokens = tokenizer_min_max_length_1_1.tokenize(data_source)

    assert (
        len(tokens) == 21
    ), "wrong number of tokens, expected: 21, found: {}".format(len(tokens))


@pytest.fixture
def tokenizer_min_max_length_10_20(validator):
    return StreamTokenizer(
        validator,
        min_length=10,
        max_length=20,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_10_init_max_length_20(tokenizer_min_max_length_10_20):
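    """The middle 'AAAAA' run plus its tolerated trailing silence spans
    only 9 frames, short of min_length=10, so it is dropped; the final
    runs merge across silences shorter than max_continuous_silence=4
    into a single token."""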
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
    )
    #    ^              ^             ^            ^
    #    1              16            30           43

    tokens = tokenizer_min_max_length_10_20.tokenize(data_source)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))
    tok1, tok2 = tokens[0], tokens[1]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AaaaAaAaaAaAaaaa"
    ), "wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {}".format(
        data
    )
    assert (
        start == 1
    ), "wrong start frame for token 1, expected: 1, found: {}".format(start)
    assert (
        end == 16
    ), "wrong end frame for token 1, expected: 16, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAaaAAaaAAA"
    ), "wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: {}".format(
        data
    )
    assert (
        start == 30
    ), "wrong start frame for token 2, expected: 30, found: {}".format(start)
    assert (
        end == 43
    ), "wrong end frame for token 2, expected: 43, found: {}".format(end)


@pytest.fixture
def tokenizer_min_max_length_4_5(validator):
    return StreamTokenizer(
        validator,
        min_length=4,
        max_length=5,
        max_continuous_silence=4,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_length_4_init_max_length_5(tokenizer_min_max_length_4_5):
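    """max_length=5 caps every token at five frames: the 8-frame
    'AAAAAAAA' run is split into a 5-frame token and its continuation,
    and the later short runs merge across silences into 5-frame tokens."""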
    data_source = StringDataSource(
        "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
    )
    #                      ^   ^^   ^    ^   ^     ^   ^
    #                      18  2223 27   32  36    42  46

    tokens = tokenizer_min_max_length_4_5.tokenize(data_source)

    assert (
        len(tokens) == 4
    ), "wrong number of tokens, expected: 4, found: {}".format(len(tokens))
    tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 18
    ), "wrong start frame for token 1, expected: 18, found: {}".format(start)
    assert (
        end == 22
    ), "wrong end frame for token 1, expected: 22, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAaa"
    ), "wrong data for token 2, expected: 'AAAaa', found: {}".format(data)
    assert (
        start == 23
    ), "wrong start frame for token 2, expected: 23, found: {}".format(start)
    assert (
        end == 27
    ), "wrong end frame for token 2, expected: 27, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 3, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 32
    ), "wrong start frame for token 3, expected: 32, found: {}".format(start)
    assert (
        end == 36
    ), "wrong end frame for token 3, expected: 36, found: {}".format(end)

    data = "".join(tok4[0])
    start = tok4[1]
    end = tok4[2]
    assert (
        data == "AAaaA"
    ), "wrong data for token 4, expected: 'AAaaA', found: {}".format(data)
    assert (
        start == 42
    ), "wrong start frame for token 4, expected: 42, found: {}".format(start)
    assert (
        end == 46
    ), "wrong end frame for token 4, expected: 46, found: {}".format(end)


@pytest.fixture
def tokenizer_max_continuous_silence_0(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=0,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_0(
    tokenizer_max_continuous_silence_0,
):
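    """max_continuous_silence=0 ends a token at the first silent frame,
    so every contiguous run of 'A's becomes its own token with no
    silence included."""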
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    #                                  ^   ^ ^    ^  ^       ^
    #                                  3   7 9    14 17      25

    tokens = tokenizer_max_continuous_silence_0.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 3
    ), "wrong start frame for token 1, expected: 3, found: {}".format(start)
    assert (
        end == 7
    ), "wrong end frame for token 1, expected: 7, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAAAAA"
    ), "wrong data for token 2, expected: 'AAAAAA', found: {}".format(data)
    assert (
        start == 9
    ), "wrong start frame for token 2, expected: 9, found: {}".format(start)
    assert (
        end == 14
    ), "wrong end frame for token 2, expected: 14, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAA"
    ), "wrong data for token 3, expected: 'AAAAAAAAA', found: {}".format(data)
    assert (
        start == 17
    ), "wrong start frame for token 3, expected: 17, found: {}".format(start)
    assert (
        end == 25
    ), "wrong end frame for token 3, expected: 25, found: {}".format(end)


@pytest.fixture
def tokenizer_max_continuous_silence_1(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=1,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_min_5_max_10_max_continuous_silence_1(
    tokenizer_max_continuous_silence_1,
):
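    """max_continuous_silence=1 bridges single silent frames, so the
    first token runs up to max_length=10; the leftover 'AAa' continuation
    is kept despite min_length=5 because STRICT_MIN_LENGTH is not set."""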
    data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
    #                                  ^        ^^ ^ ^        ^
    #                                  3        12131517      26
    #                                          (12 13 15 17)

    tokens = tokenizer_max_continuous_silence_1.tokenize(data_source)

    assert (
        len(tokens) == 3
    ), "wrong number of tokens, expected: 3, found: {}".format(len(tokens))
    tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAaAAAA"
    ), "wrong data for token 1, expected: 'AAAAAaAAAA', found: {}".format(data)
    assert (
        start == 3
    ), "wrong start frame for token 1, expected: 3, found: {}".format(start)
    assert (
        end == 12
    ), "wrong end frame for token 1, expected: 12, found: {}".format(end)

    data = "".join(tok2[0])
    start = tok2[1]
    end = tok2[2]
    assert (
        data == "AAa"
    ), "wrong data for token 2, expected: 'AAa', found: {}".format(data)
    assert (
        start == 13
    ), "wrong start frame for token 2, expected: 13, found: {}".format(start)
    assert (
        end == 15
    ), "wrong end frame for token 2, expected: 15, found: {}".format(end)

    data = "".join(tok3[0])
    start = tok3[1]
    end = tok3[2]
    assert (
        data == "AAAAAAAAAa"
    ), "wrong data for token 3, expected: 'AAAAAAAAAa', found: {}".format(data)
    assert (
        start == 17
    ), "wrong start frame for token 3, expected: 17, found: {}".format(start)
    assert (
        end == 26
    ), "wrong end frame for token 3, expected: 26, found: {}".format(end)


@pytest.fixture
def tokenizer_strict_min_length(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH,
    )


def test_STRICT_MIN_LENGTH(tokenizer_strict_min_length):
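    """With STRICT_MIN_LENGTH, the continuation left over after a token
    is truncated at max_length=8 is dropped when it is shorter than
    min_length=5, so only one token is delivered."""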
    data_source = StringDataSource("aaAAAAAAAAAAAA")
    #                                 ^      ^
    #                                 2      9

    tokens = tokenizer_strict_min_length.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 9
    ), "wrong end frame for token 1, expected: 9, found: {}".format(end)


@pytest.fixture
def tokenizer_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=10,
        max_continuous_silence=2,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_DROP_TRAILING_SILENCE(tokenizer_drop_trailing_silence):
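    """DROP_TRAILING_SILENCE removes the trailing silent frames that
    max_continuous_silence=2 would otherwise keep, so the token ends at
    the last valid frame."""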
    data_source = StringDataSource("aaAAAAAaaaaa")
    #                                 ^   ^
    #                                 2   6

    tokens = tokenizer_drop_trailing_silence.tokenize(data_source)

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAA"
    ), "wrong data for token 1, expected: 'AAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 6
    ), "wrong end frame for token 1, expected: 6, found: {}".format(end)


@pytest.fixture
def tokenizer_strict_min_and_drop_trailing_silence(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=StreamTokenizer.STRICT_MIN_LENGTH
        | StreamTokenizer.DROP_TRAILING_SILENCE,
    )


def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(
    tokenizer_strict_min_and_drop_trailing_silence,
):
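    """Both flags together: the token is truncated at max_length=8, the
    short remainder is dropped (STRICT_MIN_LENGTH), and no trailing
    silence is kept (DROP_TRAILING_SILENCE)."""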
    data_source = StringDataSource("aaAAAAAAAAAAAAaa")
    #                                 ^      ^
    #                                 2      9

    tokens = tokenizer_strict_min_and_drop_trailing_silence.tokenize(
        data_source
    )

    assert (
        len(tokens) == 1
    ), "wrong number of tokens, expected: 1, found: {}".format(len(tokens))
    tok1 = tokens[0]

    data = "".join(tok1[0])
    start = tok1[1]
    end = tok1[2]
    assert (
        data == "AAAAAAAA"
    ), "wrong data for token 1, expected: 'AAAAAAAA', found: {}".format(data)
    assert (
        start == 2
    ), "wrong start frame for token 1, expected: 2, found: {}".format(start)
    assert (
        end == 9
    ), "wrong end frame for token 1, expected: 9, found: {}".format(end)


@pytest.fixture
def tokenizer_callback(validator):
    return StreamTokenizer(
        validator,
        min_length=5,
        max_length=8,
        max_continuous_silence=3,
        init_min=3,
        init_max_silence=3,
        mode=0,
    )


def test_callback(tokenizer_callback):
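    """tokenize() can deliver tokens through a callback instead of
    returning a list; the callback receives (data, start, end)."""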
    tokens = []

    def callback(data, start, end):
        tokens.append((data, start, end))

    data_source = StringDataSource("aaAAAAAAAAAAAAa")
    #                                 ^      ^^   ^
    #                                 2      910  14

    tokenizer_callback.tokenize(data_source, callback=callback)

    assert (
        len(tokens) == 2
    ), "wrong number of tokens, expected: 2, found: {}".format(len(tokens))