'''
@author: Amine Sehili <amine.sehili@gmail.com>
September 2015
'''

import unittest
from auditok import StreamTokenizer, StringDataSource, DataValidator


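# A minimal validator: a frame equal to "A" is valid ("signal"), any
# other frame is non-valid ("silence"). For example:
#
#     AValidator().is_valid("A")  # -> True
#     AValidator().is_valid("a")  # -> False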
class AValidator(DataValidator):

    def is_valid(self, frame):
        return frame == "A"


class TestStreamTokenizerInitParams(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    # Completely deactivate init_min and init_max_silence:
    # the tokenizer will rely only on the other parameters.
    # Note that if init_min == 0, the value of init_max_silence
    # has no effect.
    def test_init_min_0_init_max_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=0,
                                    init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        #                                ^              ^   ^      ^
        #                                1              16  20     27
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        # tok1[0]: data
        # tok1[1]: start frame (included)
        # tok1[2]: end frame (included)

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0}".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0}".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0}".format(end))

    # A token is considered valid only if the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there
    # are at most 0 consecutive non-valid frames (init_max_silence = 0).
    # In other words, a valid token must start with 3 valid frames.
    def test_init_min_3_init_max_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
        #                                                 ^           ^  ^   ^
        #                                                 18          30 33  37
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 2, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0}".format(start))
        self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0}".format(end))

    # A token is considered valid only if the tokenizer encounters
    # at least 3 valid frames (init_min = 3) between which there
    # are at most 2 consecutive non-valid frames (init_max_silence = 2).
    def test_init_min_3_init_max_silence_2(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=2, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
        #                                    ^          ^  ^           ^   ^   ^
        #                                    5          16 19          31  35  39
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0}".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0}".format(start))
        self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 35, msg="wrong start frame for token 3, expected: 35, found: {0}".format(start))
        self.assertEqual(end, 39, msg="wrong end frame for token 3, expected: 39, found: {0}".format(end))


class TestStreamTokenizerMinMaxLength(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

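    # min_length and max_length bound the number of frames a token may
    # contain: a token is delivered as soon as it reaches max_length,
    # and (in the default mode) a standalone run shorter than
    # min_length yields no token.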
    def test_min_length_6_max_length_20(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=6, max_length=20,
                                    max_continuous_silence=2, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        #                                ^            ^   ^         ^
        #                                1            14  18        28
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}'".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0}".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 2, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 28, msg="wrong end frame for token 2, expected: 28, found: {0}".format(end))

amine@2 192
amine@2 193 def test_min_length_1_init_max_length_1(self):
amine@2 194
amine@2 195 tokenizer = StreamTokenizer(self.A_validator, min_length = 1, max_length=1,
amine@2 196 max_continuous_silence=0, init_min = 0,
amine@2 197 init_max_silence = 0, mode=0)
amine@2 198
amine@2 199
amine@2 200 data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
amine@2 201
amine@2 202 tokens = tokenizer.tokenize(data_source)
amine@2 203
amine@2 204 self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens)))
amine@2 205
amine@2 206
amine@2 207 def test_min_length_10_init_max_length_20(self):
amine@2 208
amine@2 209 tokenizer = StreamTokenizer(self.A_validator, min_length = 10, max_length=20,
amine@2 210 max_continuous_silence=4, init_min = 3,
amine@2 211 init_max_silence = 3, mode=0)
amine@2 212
amine@2 213
amine@2 214 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
amine@5 215 # ^ ^ ^ ^
amine@5 216 # 1 16 30 45
amine@2 217
amine@2 218 tokens = tokenizer.tokenize(data_source)
amine@2 219
amine@2 220 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
amine@2 221 tok1, tok2 = tokens[0], tokens[1]
amine@2 222
amine@2 223
amine@2 224 data = ''.join(tok1[0])
amine@2 225 start = tok1[1]
amine@2 226 end = tok1[2]
amine@2 227 self.assertEqual(data, "AaaaAaAaaAaAaaaa",
amine@2 228 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format(data))
amine@2 229 self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
amine@2 230 self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))
amine@2 231
amine@2 232
amine@2 233 data = ''.join(tok2[0])
amine@2 234 start = tok2[1]
amine@2 235 end = tok2[2]
amine@2 236 self.assertEqual(data, "AAAAAaaAAaaAAA",
amine@2 237 msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format(data))
amine@2 238 self.assertEqual(start, 30, msg="wrong start frame for token 2, expected: 30, found: {0} ".format(start))
amine@2 239 self.assertEqual(end, 43, msg="wrong end frame for token 2, expected: 43, found: {0} ".format(end))
amine@2 240
amine@2 241
amine@2 242
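    # A long run of valid frames is split into consecutive tokens of at
    # most max_length frames each: below, the eight 'A's at frames
    # 18..25 are delivered as 'AAAAA' (18..22) followed by 'AAAaa'
    # (23..27).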
    def test_min_length_4_max_length_5(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=4, max_length=5,
                                    max_continuous_silence=4, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
        #                                                 ^   ^^   ^    ^   ^     ^   ^
        #                                                 18  2223 27   32  36    42  46
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0}".format(len(tokens)))
        tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAaa",
                         msg="wrong data for token 2, expected: 'AAAaa', found: '{0}'".format(data))
        self.assertEqual(start, 23, msg="wrong start frame for token 2, expected: 23, found: {0}".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 32, msg="wrong start frame for token 3, expected: 32, found: {0}".format(start))
        self.assertEqual(end, 36, msg="wrong end frame for token 3, expected: 36, found: {0}".format(end))

        data = ''.join(tok4[0])
        start = tok4[1]
        end = tok4[2]
        self.assertEqual(data, "AAaaA",
                         msg="wrong data for token 4, expected: 'AAaaA', found: '{0}'".format(data))
        self.assertEqual(start, 42, msg="wrong start frame for token 4, expected: 42, found: {0}".format(start))
        self.assertEqual(end, 46, msg="wrong end frame for token 4, expected: 46, found: {0}".format(end))


class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

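    # max_continuous_silence is the maximum number of consecutive
    # non-valid frames a token may contain. With a value of 0, the
    # first non-valid frame closes the current token, which is why the
    # data below is split into three tokens.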
    def test_min_5_max_10_max_continuous_silence_0(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=0, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^   ^ ^    ^  ^       ^
        #                                  3   7 9    14 17      25
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0}".format(start))
        self.assertEqual(end, 7, msg="wrong end frame for token 1, expected: 7, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 9, msg="wrong start frame for token 2, expected: 9, found: {0}".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 2, expected: 14, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAA",
                         msg="wrong data for token 3, expected: 'AAAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0}".format(start))
        self.assertEqual(end, 25, msg="wrong end frame for token 3, expected: 25, found: {0}".format(end))

    def test_min_5_max_10_max_continuous_silence_1(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=1, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^        ^^ ^ ^        ^
        #                                  3        12131517      26
        #                                           (12 13 15 17)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAaAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0}".format(start))
        self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 12, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAa",
                         msg="wrong data for token 2, expected: 'AAa', found: '{0}'".format(data))
        self.assertEqual(start, 13, msg="wrong start frame for token 2, expected: 13, found: {0}".format(start))
        self.assertEqual(end, 15, msg="wrong end frame for token 2, expected: 15, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAAa",
                         msg="wrong data for token 3, expected: 'AAAAAAAAAa', found: '{0}'".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0}".format(start))
        self.assertEqual(end, 26, msg="wrong end frame for token 3, expected: 26, found: {0}".format(end))


class TestStreamTokenizerModes(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

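    # In STRICT_MIN_LENGTH mode, when a token is truncated because it
    # reached max_length, the frames that immediately follow it are only
    # accepted as a new token if they satisfy min_length on their own.
    # Below, the four 'A's left over after the first 8-frame token are
    # therefore dropped.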
    def test_STRICT_MIN_LENGTH(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        #                                 ^      ^
        #                                 2      9
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0}".format(end))

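    # In DROP_TAILING_SILENCE mode, trailing non-valid frames are
    # trimmed from a token: the token below ends at frame 6 (the last
    # 'A') instead of keeping up to max_continuous_silence trailing
    # 'a' frames.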
    def test_DROP_TAILING_SILENCE(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10,
                                    max_continuous_silence=2, init_min=3,
                                    init_max_silence=3,
                                    mode=StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAaaaaa")
        #                                 ^   ^
        #                                 2   6
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 6, msg="wrong end frame for token 1, expected: 6, found: {0}".format(end))

    def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        #                                 ^      ^
        #                                 2      9
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0}".format(end))


class TestStreamTokenizerCallback(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

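    # When a callback is passed to tokenize(), tokens are not returned
    # in a list: each one is delivered to the callback as
    # (data, start, end) as soon as it is complete.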
    def test_callback(self):

        tokens = []

        def callback(data, start, end):
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=8,
                                    max_continuous_silence=3, init_min=3,
                                    init_max_silence=3, mode=0)

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        #                                 ^      ^^   ^
        #                                 2      910  14
        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))


if __name__ == "__main__":
    unittest.main()