annotate tests/test_StreamTokenizer.py @ 402:954c1e279068

Remove deprecated code from util.py
author Amine Sehili <amine.sehili@gmail.com>
date Sun, 26 May 2024 17:19:31 +0200
parents 323d59b404a2
children 996948ada980
rev   line source
amine@400 1 import pytest
amine@2 2 from auditok import StreamTokenizer, StringDataSource, DataValidator
amine@2 3
amine@2 4
amine@2 5 class AValidator(DataValidator):
amine@2 6 def is_valid(self, frame):
amine@2 7 return frame == "A"
amine@2 8
amine@2 9
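# Frames in these tests are single characters; AValidator treats "A" as a
# valid frame and anything else as silence. Roughly, the StreamTokenizer
# parameters exercised below mean:
#   min_length             -- minimum number of frames of a returned token
#   max_length             -- maximum number of frames; longer events are split
#   max_continuous_silence -- longest run of non-valid frames kept inside a token
#   init_min               -- valid frames required before a token is started
#   init_max_silence       -- silence tolerated while gathering those init_min frames
#   mode                   -- 0, STRICT_MIN_LENGTH and/or DROP_TRAILING_SILENCE
# Tokens are (data, start_frame, end_frame) tuples; see the StreamTokenizer
# documentation for the full description.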
amine@400 10 @pytest.fixture
amine@400 11 def validator():
amine@400 12 return AValidator()
amine@297 13
amine@297 14
amine@400 15 def test_init_min_0_init_max_silence_0(validator):
amine@400 16 tokenizer = StreamTokenizer(
amine@400 17 validator,
amine@400 18 min_length=5,
amine@400 19 max_length=20,
amine@400 20 max_continuous_silence=4,
amine@400 21 init_min=0,
amine@400 22 init_max_silence=0,
amine@400 23 mode=0,
amine@400 24 )
amine@297 25
amine@400 26 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
amine@400 27 # ^ ^ ^ ^
amine@400 28 # 1 16 20 27
amine@400 29 tokens = tokenizer.tokenize(data_source)
amine@297 30
amine@400 31 assert (
amine@400 32 len(tokens) == 2
amine@400 33 ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
amine@400 34 tok1, tok2 = tokens[0], tokens[1]
amine@297 35
amine@400 36 data = "".join(tok1[0])
amine@400 37 start = tok1[1]
amine@400 38 end = tok1[2]
amine@400 39 assert (
amine@400 40 data == "AaaaAaAaaAaAaaaa"
amine@400 41 ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{data}'"
amine@400 42 assert (
amine@400 43 start == 1
amine@400 44 ), f"wrong start frame for token 1, expected: 1, found: {start}"
amine@400 45 assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"
amine@297 46
amine@400 47 data = "".join(tok2[0])
amine@400 48 start = tok2[1]
amine@400 49 end = tok2[2]
amine@400 50 assert (
amine@400 51 data == "AAAAAAAA"
amine@400 52 ), f"wrong data for token 2, expected: 'AAAAAAAA', found: '{data}'"
amine@400 53 assert (
amine@400 54 start == 20
amine@400 55 ), f"wrong start frame for token 2, expected: 20, found: {start}"
amine@400 56 assert end == 27, f"wrong end frame for token 2, expected: 27, found: {end}"
amine@297 57
amine@297 58
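# With init_min=3 and init_max_silence=0, a token is only started once three
# consecutive valid frames are seen, so the scattered "A" frames at the
# beginning of the stream below never initiate a token.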
amine@400 59 def test_init_min_3_init_max_silence_0(validator):
amine@400 60 tokenizer = StreamTokenizer(
amine@400 61 validator,
amine@400 62 min_length=5,
amine@400 63 max_length=20,
amine@400 64 max_continuous_silence=4,
amine@400 65 init_min=3,
amine@400 66 init_max_silence=0,
amine@400 67 mode=0,
amine@400 68 )
amine@297 69
amine@400 70 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
amine@400 71 # ^ ^ ^ ^
amine@400 72 # 18 30 33 37
amine@297 73
amine@400 74 tokens = tokenizer.tokenize(data_source)
amine@297 75
amine@400 76 assert (
amine@400 77 len(tokens) == 2
amine@400 78 ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
amine@400 79 tok1, tok2 = tokens[0], tokens[1]
amine@297 80
amine@400 81 data = "".join(tok1[0])
amine@400 82 start = tok1[1]
amine@400 83 end = tok1[2]
amine@400 84 assert (
amine@400 85 data == "AAAAAAAAAaaaa"
amine@400 86 ), f"wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{data}'"
amine@400 87 assert (
amine@400 88 start == 18
amine@400 89 ), f"wrong start frame for token 1, expected: 18, found: {start}"
amine@400 90 assert end == 30, f"wrong end frame for token 1, expected: 30, found: {end}"
amine@297 91
amine@400 92 data = "".join(tok2[0])
amine@400 93 start = tok2[1]
amine@400 94 end = tok2[2]
amine@400 95 assert (
amine@400 96 data == "AAAAA"
amine@400 97 ), f"wrong data for token 2, expected: 'AAAAA', found: '{data}'"
amine@400 98 assert (
amine@400 99 start == 33
amine@400 100 ), f"wrong start frame for token 2, expected: 33, found: {start}"
amine@400 101 assert end == 37, f"wrong end frame for token 2, expected: 37, found: {end}"
amine@297 102
amine@297 103
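# With init_max_silence=2, up to two non-valid frames are tolerated while
# gathering the three initiating valid frames, so the first token can start
# at frame 5 and include the interleaved "a" frames.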
amine@400 104 def test_init_min_3_init_max_silence_2(validator):
amine@400 105 tokenizer = StreamTokenizer(
amine@400 106 validator,
amine@400 107 min_length=5,
amine@400 108 max_length=20,
amine@400 109 max_continuous_silence=4,
amine@400 110 init_min=3,
amine@400 111 init_max_silence=2,
amine@400 112 mode=0,
amine@400 113 )
amine@297 114
amine@400 115 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
amine@400 116 # ^ ^ ^ ^ ^ ^
amine@400 117 # 5 16 19 31 35 39
amine@400 118 tokens = tokenizer.tokenize(data_source)
amine@297 119
amine@400 120 assert (
amine@400 121 len(tokens) == 3
amine@400 122 ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
amine@400 123 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
amine@297 124
amine@400 125 data = "".join(tok1[0])
amine@400 126 start = tok1[1]
amine@400 127 end = tok1[2]
amine@400 128 assert (
amine@400 129 data == "AaAaaAaAaaaa"
amine@400 130 ), f"wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{data}'"
amine@400 131 assert (
amine@400 132 start == 5
amine@400 133 ), f"wrong start frame for token 1, expected: 5, found: {start}"
amine@400 134 assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"
amine@297 135
amine@400 136 data = "".join(tok2[0])
amine@400 137 start = tok2[1]
amine@400 138 end = tok2[2]
amine@400 139 assert (
amine@400 140 data == "AAAAAAAAAaaaa"
amine@400 141 ), f"wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{data}'"
amine@400 142 assert (
amine@400 143 start == 19
amine@400 144 ), f"wrong start frame for token 2, expected: 19, found: {start}"
amine@400 145 assert end == 31, f"wrong end frame for token 2, expected: 31, found: {end}"
amine@297 146
amine@400 147 data = "".join(tok3[0])
amine@400 148 start = tok3[1]
amine@400 149 end = tok3[2]
amine@400 150 assert (
amine@400 151 data == "AAAAA"
amine@400 152 ), f"wrong data for token 3, expected: 'AAAAA', found: '{data}'"
amine@400 153 assert (
amine@400 154 start == 35
amine@400 155 ), f"wrong start frame for token 3, expected: 35, found: {start}"
amine@400 156 assert end == 39, f"wrong end frame for token 3, expected: 39, found: {end}"
amine@297 157
amine@297 158
amine@400 159 @pytest.fixture
amine@400 160 def tokenizer_min_max_length(validator):
amine@400 161 return StreamTokenizer(
amine@400 162 validator,
amine@400 163 min_length=6,
amine@400 164 max_length=20,
amine@400 165 max_continuous_silence=2,
amine@400 166 init_min=3,
amine@400 167 init_max_silence=3,
amine@400 168 mode=0,
amine@400 169 )
amine@297 170
amine@297 171
amine@400 172 def test_min_length_6_max_length_20(tokenizer_min_max_length):
amine@400 173 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
amine@400 174 # ^ ^ ^ ^
amine@400 175 # 1 14 18 28
amine@297 176
amine@400 177 tokens = tokenizer_min_max_length.tokenize(data_source)
amine@297 178
amine@400 179 assert (
amine@400 180 len(tokens) == 2
amine@400 181 ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
amine@400 182 tok1, tok2 = tokens[0], tokens[1]
amine@297 183
amine@400 184 data = "".join(tok1[0])
amine@400 185 start = tok1[1]
amine@400 186 end = tok1[2]
amine@400 187 assert (
amine@400 188 data == "AaaaAaAaaAaAaa"
amine@400 189 ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{data}'"
amine@400 190 assert (
amine@400 191 start == 1
amine@400 192 ), f"wrong start frame for token 1, expected: 1, found: {start}"
amine@400 193 assert end == 14, f"wrong end frame for token 1, expected: 14, found: {end}"
amine@297 194
amine@400 195 data = "".join(tok2[0])
amine@400 196 start = tok2[1]
amine@400 197 end = tok2[2]
amine@400 198 assert (
amine@400 199 data == "AAAAAAAAAaa"
amine@400 200 ), f"wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{data}'"
amine@400 201 assert (
amine@400 202 start == 18
amine@400 203 ), f"wrong start frame for token 2, expected: 18, found: {start}"
amine@400 204 assert end == 28, f"wrong end frame for token 2, expected: 28, found: {end}"
amine@297 205
amine@297 206
amine@400 207 @pytest.fixture
amine@400 208 def tokenizer_min_max_length_1_1(validator):
amine@400 209 return StreamTokenizer(
amine@400 210 validator,
amine@400 211 min_length=1,
amine@400 212 max_length=1,
amine@400 213 max_continuous_silence=0,
amine@400 214 init_min=0,
amine@400 215 init_max_silence=0,
amine@400 216 mode=0,
amine@400 217 )
amine@297 218
amine@297 219
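# With min_length == max_length == 1 and max_continuous_silence == 0, every
# valid frame becomes a token of its own: one per "A" in the input, 21 in all.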
amine@400 220 def test_min_length_1_max_length_1(tokenizer_min_max_length_1_1):
amine@400 221 data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
amine@297 222
amine@400 223 tokens = tokenizer_min_max_length_1_1.tokenize(data_source)
amine@297 224
amine@400 225 assert (
amine@400 226 len(tokens) == 21
amine@400 227 ), f"wrong number of tokens, expected: 21, found: {len(tokens)}"
amine@297 228
amine@297 229
amine@400 230 @pytest.fixture
amine@400 231 def tokenizer_min_max_length_10_20(validator):
amine@400 232 return StreamTokenizer(
amine@400 233 validator,
amine@400 234 min_length=10,
amine@400 235 max_length=20,
amine@400 236 max_continuous_silence=4,
amine@400 237 init_min=3,
amine@400 238 init_max_silence=3,
amine@400 239 mode=0,
amine@400 240 )
amine@297 241
amine@297 242
amine@400 243 def test_min_length_10_max_length_20(tokenizer_min_max_length_10_20):
amine@400 244 data_source = StringDataSource(
amine@400 245 "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA"
amine@400 246 )
amine@400 247 # ^ ^ ^ ^
amine@400 248 # 1 16 30 43
amine@297 249
amine@400 250 tokens = tokenizer_min_max_length_10_20.tokenize(data_source)
amine@297 251
amine@400 252 assert (
amine@400 253 len(tokens) == 2
amine@400 254 ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"
amine@400 255 tok1, tok2 = tokens[0], tokens[1]
amine@297 256
amine@400 257 data = "".join(tok1[0])
amine@400 258 start = tok1[1]
amine@400 259 end = tok1[2]
amine@400 260 assert (
amine@400 261 data == "AaaaAaAaaAaAaaaa"
amine@400 262 ), f"wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{data}'"
amine@400 263 assert (
amine@400 264 start == 1
amine@400 265 ), f"wrong start frame for token 1, expected: 1, found: {start}"
amine@400 266 assert end == 16, f"wrong end frame for token 1, expected: 16, found: {end}"
amine@297 267
amine@400 268 data = "".join(tok2[0])
amine@400 269 start = tok2[1]
amine@400 270 end = tok2[2]
amine@400 271 assert (
amine@400 272 data == "AAAAAaaAAaaAAA"
amine@400 273 ), f"wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{data}'"
amine@400 274 assert (
amine@400 275 start == 30
amine@400 276 ), f"wrong start frame for token 2, expected: 30, found: {start}"
amine@400 277 assert end == 43, f"wrong end frame for token 2, expected: 43, found: {end}"
amine@297 278
amine@297 279
amine@400 280 @pytest.fixture
amine@400 281 def tokenizer_min_max_length_4_5(validator):
amine@400 282 return StreamTokenizer(
amine@400 283 validator,
amine@400 284 min_length=4,
amine@400 285 max_length=5,
amine@400 286 max_continuous_silence=4,
amine@400 287 init_min=3,
amine@400 288 init_max_silence=3,
amine@400 289 mode=0,
amine@400 290 )
amine@297 291
amine@297 292
amine@400 293 def test_min_length_4_max_length_5(tokenizer_min_max_length_4_5):
amine@400 294 data_source = StringDataSource(
amine@400 295 "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa"
amine@400 296 )
amine@400 297 # ^ ^^ ^ ^ ^ ^ ^
amine@400 298 # 18 2223 27 32 36 42 46
amine@297 299
amine@400 300 tokens = tokenizer_min_max_length_4_5.tokenize(data_source)
amine@297 301
amine@400 302 assert (
amine@400 303 len(tokens) == 4
amine@400 304 ), f"wrong number of tokens, expected: 4, found: {len(tokens)}"
amine@400 305 tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]
amine@297 306
amine@400 307 data = "".join(tok1[0])
amine@400 308 start = tok1[1]
amine@400 309 end = tok1[2]
amine@400 310 assert (
amine@400 311 data == "AAAAA"
amine@400 312 ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
amine@400 313 assert (
amine@400 314 start == 18
amine@400 315 ), f"wrong start frame for token 1, expected: 18, found: {start}"
amine@400 316 assert end == 22, f"wrong end frame for token 1, expected: 22, found: {end}"
amine@297 317
amine@400 318 data = "".join(tok2[0])
amine@400 319 start = tok2[1]
amine@400 320 end = tok2[2]
amine@400 321 assert (
amine@400 322 data == "AAAaa"
amine@400 323 ), f"wrong data for token 2, expected: 'AAAaa', found: '{data}'"
amine@400 324 assert (
amine@400 325 start == 23
amine@400 326 ), f"wrong start frame for token 2, expected: 23, found: {start}"
amine@400 327 assert end == 27, f"wrong end frame for token 2, expected: 27, found: {end}"
amine@297 328
amine@400 329 data = "".join(tok3[0])
amine@400 330 start = tok3[1]
amine@400 331 end = tok3[2]
amine@400 332 assert (
amine@400 333 data == "AAAAA"
amine@400 334 ), f"wrong data for token 3, expected: 'AAAAA', found: '{data}'"
amine@400 335 assert (
amine@400 336 start == 32
amine@400 337 ), f"wrong start frame for token 3, expected: 32, found: {start}"
amine@400 338 assert end == 36, f"wrong end frame for token 3, expected: 36, found: {end}"
amine@297 339
amine@400 340 data = "".join(tok4[0])
amine@400 341 start = tok4[1]
amine@400 342 end = tok4[2]
amine@400 343 assert (
amine@400 344 data == "AAaaA"
amine@400 345 ), f"wrong data for token 4, expected: 'AAaaA', found: '{data}'"
amine@400 346 assert (
amine@400 347 start == 42
amine@400 348 ), f"wrong start frame for token 4, expected: 42, found: {start}"
amine@400 349 assert end == 46, f"wrong end frame for token 4, expected: 46, found: {end}"
amine@297 350
amine@297 351
amine@400 352 @pytest.fixture
amine@400 353 def tokenizer_max_continuous_silence_0(validator):
amine@400 354 return StreamTokenizer(
amine@400 355 validator,
amine@400 356 min_length=5,
amine@400 357 max_length=10,
amine@400 358 max_continuous_silence=0,
amine@400 359 init_min=3,
amine@400 360 init_max_silence=3,
amine@400 361 mode=0,
amine@400 362 )
amine@2 363
amine@297 364
amine@400 365 def test_min_5_max_10_max_continuous_silence_0(
amine@400 366 tokenizer_max_continuous_silence_0,
amine@400 367 ):
amine@400 368 data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
amine@400 369 # ^ ^ ^ ^ ^ ^
amine@400 370 # 3 7 9 14 17 25
amine@297 371
amine@400 372 tokens = tokenizer_max_continuous_silence_0.tokenize(data_source)
amine@297 373
amine@400 374 assert (
amine@400 375 len(tokens) == 3
amine@400 376 ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
amine@400 377 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
amine@297 378
amine@400 379 data = "".join(tok1[0])
amine@400 380 start = tok1[1]
amine@400 381 end = tok1[2]
amine@400 382 assert (
amine@400 383 data == "AAAAA"
amine@400 384 ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
amine@400 385 assert (
amine@400 386 start == 3
amine@400 387 ), f"wrong start frame for token 1, expected: 3, found: {start}"
amine@400 388 assert end == 7, f"wrong end frame for token 1, expected: 7, found: {end}"
amine@297 389
amine@400 390 data = "".join(tok2[0])
amine@400 391 start = tok2[1]
amine@400 392 end = tok2[2]
amine@400 393 assert (
amine@400 394 data == "AAAAAA"
amine@400 395 ), f"wrong data for token 2, expected: 'AAAAAA', found: '{data}'"
amine@400 396 assert (
amine@400 397 start == 9
amine@400 398 ), f"wrong start frame for token 2, expected: 9, found: {start}"
amine@400 399 assert end == 14, f"wrong end frame for token 2, expected: 14, found: {end}"
amine@297 400
amine@400 401 data = "".join(tok3[0])
amine@400 402 start = tok3[1]
amine@400 403 end = tok3[2]
amine@400 404 assert (
amine@400 405 data == "AAAAAAAAA"
amine@400 406 ), f"wrong data for token 3, expected: 'AAAAAAAAA', found: '{data}'"
amine@400 407 assert (
amine@400 408 start == 17
amine@400 409 ), f"wrong start frame for token 3, expected: 17, found: {start}"
amine@400 410 assert end == 25, f"wrong end frame for token 3, expected: 25, found: {end}"
amine@297 411
amine@2 412
amine@400 413 @pytest.fixture
amine@400 414 def tokenizer_max_continuous_silence_1(validator):
amine@400 415 return StreamTokenizer(
amine@400 416 validator,
amine@400 417 min_length=5,
amine@400 418 max_length=10,
amine@400 419 max_continuous_silence=1,
amine@400 420 init_min=3,
amine@400 421 init_max_silence=3,
amine@400 422 mode=0,
amine@400 423 )
amine@297 424
amine@297 425
amine@400 426 def test_min_5_max_10_max_continuous_silence_1(
amine@400 427 tokenizer_max_continuous_silence_1,
amine@400 428 ):
amine@400 429 data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
amine@400 430 # ^ ^^ ^ ^ ^
amine@400 431 # 3 12131517 26
amine@400 432 # (12 13 15 17)
amine@297 433
amine@400 434 tokens = tokenizer_max_continuous_silence_1.tokenize(data_source)
amine@297 435
amine@400 436 assert (
amine@400 437 len(tokens) == 3
amine@400 438 ), f"wrong number of tokens, expected: 3, found: {len(tokens)}"
amine@400 439 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
amine@297 440
amine@400 441 data = "".join(tok1[0])
amine@400 442 start = tok1[1]
amine@400 443 end = tok1[2]
amine@400 444 assert (
amine@400 445 data == "AAAAAaAAAA"
amine@400 446 ), f"wrong data for token 1, expected: 'AAAAAaAAAA', found: '{data}'"
amine@400 447 assert (
amine@400 448 start == 3
amine@400 449 ), f"wrong start frame for token 1, expected: 3, found: {start}"
amine@400 450 assert end == 12, f"wrong end frame for token 1, expected: 12, found: {end}"
amine@297 451
amine@400 452 data = "".join(tok2[0])
amine@400 453 start = tok2[1]
amine@400 454 end = tok2[2]
amine@400 455 assert (
amine@400 456 data == "AAa"
amine@400 457 ), f"wrong data for token 2, expected: 'AAa', found: '{data}'"
amine@400 458 assert (
amine@400 459 start == 13
amine@400 460 ), f"wrong start frame for token 2, expected: 13, found: {start}"
amine@400 461 assert end == 15, f"wrong end frame for token 2, expected: 15, found: {end}"
amine@297 462
amine@400 463 data = "".join(tok3[0])
amine@400 464 start = tok3[1]
amine@400 465 end = tok3[2]
amine@400 466 assert (
amine@400 467 data == "AAAAAAAAAa"
amine@400 468 ), f"wrong data for token 3, expected: 'AAAAAAAAAa', found: '{data}'"
amine@400 469 assert (
amine@400 470 start == 17
amine@400 471 ), f"wrong start frame for token 3, expected: 17, found: {start}"
amine@400 472 assert end == 26, f"wrong end frame for token 3, expected: 26, found: {end}"
amine@297 473
amine@297 474
amine@400 475 @pytest.fixture
amine@400 476 def tokenizer_strict_min_length(validator):
amine@400 477 return StreamTokenizer(
amine@400 478 validator,
amine@400 479 min_length=5,
amine@400 480 max_length=8,
amine@400 481 max_continuous_silence=3,
amine@400 482 init_min=3,
amine@400 483 init_max_silence=3,
amine@400 484 mode=StreamTokenizer.STRICT_MIN_LENGTH,
amine@400 485 )
amine@297 486
amine@297 487
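# STRICT_MIN_LENGTH: a token that immediately follows a token truncated at
# max_length is kept only if it also satisfies min_length. Below, the run of
# twelve "A"s yields one 8-frame token; the remaining 4 frames are shorter
# than min_length=5 and are therefore discarded.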
amine@400 488 def test_STRICT_MIN_LENGTH(tokenizer_strict_min_length):
amine@400 489 data_source = StringDataSource("aaAAAAAAAAAAAA")
amine@400 490 # ^ ^
amine@400 491 # 2 9
amine@297 492
amine@400 493 tokens = tokenizer_strict_min_length.tokenize(data_source)
amine@297 494
amine@400 495 assert (
amine@400 496 len(tokens) == 1
amine@400 497 ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
amine@400 498 tok1 = tokens[0]
amine@297 499
amine@400 500 data = "".join(tok1[0])
amine@400 501 start = tok1[1]
amine@400 502 end = tok1[2]
amine@400 503 assert (
amine@400 504 data == "AAAAAAAA"
amine@400 505 ), f"wrong data for token 1, expected: 'AAAAAAAA', found: '{data}'"
amine@400 506 assert (
amine@400 507 start == 2
amine@400 508 ), f"wrong start frame for token 1, expected: 2, found: {start}"
amine@400 509 assert end == 9, f"wrong end frame for token 1, expected: 9, found: {end}"
amine@297 510
amine@297 511
amine@400 512 @pytest.fixture
amine@400 513 def tokenizer_drop_trailing_silence(validator):
amine@400 514 return StreamTokenizer(
amine@400 515 validator,
amine@400 516 min_length=5,
amine@400 517 max_length=10,
amine@400 518 max_continuous_silence=2,
amine@400 519 init_min=3,
amine@400 520 init_max_silence=3,
amine@400 521 mode=StreamTokenizer.DROP_TRAILING_SILENCE,
amine@400 522 )
amine@297 523
amine@297 524
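# DROP_TRAILING_SILENCE: trailing non-valid frames are stripped from a token.
# Without this flag the token below would keep up to max_continuous_silence=2
# trailing "a" frames and end at frame 8 instead of frame 6.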
amine@400 525 def test_DROP_TRAILING_SILENCE(tokenizer_drop_trailing_silence):
amine@400 526 data_source = StringDataSource("aaAAAAAaaaaa")
amine@400 527 # ^ ^
amine@400 528 # 2 6
amine@297 529
amine@400 530 tokens = tokenizer_drop_trailing_silence.tokenize(data_source)
amine@297 531
amine@400 532 assert (
amine@400 533 len(tokens) == 1
amine@400 534 ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
amine@400 535 tok1 = tokens[0]
amine@297 536
amine@400 537 data = "".join(tok1[0])
amine@400 538 start = tok1[1]
amine@400 539 end = tok1[2]
amine@400 540 assert (
amine@400 541 data == "AAAAA"
amine@400 542 ), f"wrong data for token 1, expected: 'AAAAA', found: '{data}'"
amine@400 543 assert (
amine@400 544 start == 2
amine@400 545 ), f"wrong start frame for token 1, expected: 2, found: {start}"
amine@400 546 assert end == 6, f"wrong end frame for token 1, expected: 6, found: {end}"
amine@297 547
amine@297 548
amine@400 549 @pytest.fixture
amine@400 550 def tokenizer_strict_min_and_drop_trailing_silence(validator):
amine@400 551 return StreamTokenizer(
amine@400 552 validator,
amine@400 553 min_length=5,
amine@400 554 max_length=8,
amine@400 555 max_continuous_silence=3,
amine@400 556 init_min=3,
amine@400 557 init_max_silence=3,
amine@400 558 mode=StreamTokenizer.STRICT_MIN_LENGTH
amine@400 559 | StreamTokenizer.DROP_TRAILING_SILENCE,
amine@400 560 )
amine@297 561
amine@297 562
amine@400 563 def test_STRICT_MIN_LENGTH_and_DROP_TRAILING_SILENCE(
amine@400 564 tokenizer_strict_min_and_drop_trailing_silence,
amine@400 565 ):
amine@400 566 data_source = StringDataSource("aaAAAAAAAAAAAAaa")
amine@400 567 # ^ ^
amine@400 568 # 2 9
amine@297 569
amine@400 570 tokens = tokenizer_strict_min_and_drop_trailing_silence.tokenize(
amine@400 571 data_source
amine@400 572 )
amine@297 573
amine@400 574 assert (
amine@400 575 len(tokens) == 1
amine@400 576 ), f"wrong number of tokens, expected: 1, found: {len(tokens)}"
amine@400 577 tok1 = tokens[0]
amine@297 578
amine@400 579 data = "".join(tok1[0])
amine@400 580 start = tok1[1]
amine@400 581 end = tok1[2]
amine@400 582 assert (
amine@400 583 data == "AAAAAAAA"
amine@400 584 ), f"wrong data for token 1, expected: 'AAAAAAAA', found: '{data}'"
amine@400 585 assert (
amine@400 586 start == 2
amine@400 587 ), f"wrong start frame for token 1, expected: 2, found: {start}"
amine@400 588 assert end == 9, f"wrong end frame for token 1, expected: 9, found: {end}"
amine@297 589
amine@297 590
amine@400 591 @pytest.fixture
amine@400 592 def tokenizer_callback(validator):
amine@400 593 return StreamTokenizer(
amine@400 594 validator,
amine@400 595 min_length=5,
amine@400 596 max_length=8,
amine@400 597 max_continuous_silence=3,
amine@400 598 init_min=3,
amine@400 599 init_max_silence=3,
amine@400 600 mode=0,
amine@400 601 )
amine@297 602
amine@297 603
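# tokenize() also accepts a callback; each detected token is then passed to it
# as (data, start_frame, end_frame), which is how the tokens are collected here
# rather than being read from the return value.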
amine@400 604 def test_callback(tokenizer_callback):
amine@400 605 tokens = []
amine@297 606
amine@400 607 def callback(data, start, end):
amine@400 608 tokens.append((data, start, end))
amine@297 609
amine@400 610 data_source = StringDataSource("aaAAAAAAAAAAAAa")
amine@400 611 # ^ ^^ ^
amine@400 612 # 2 910 14
amine@297 613
amine@400 614 tokenizer_callback.tokenize(data_source, callback=callback)
amine@2 615
amine@400 616 assert (
amine@400 617 len(tokens) == 2
amine@400 618 ), f"wrong number of tokens, expected: 2, found: {len(tokens)}"