annotate tests/test_StreamTokenizer.py @ 3:364eeb8e8bd2

README.md, typos fixes
author Amine Sehili <amine.sehili@gmail.com>
date Tue, 22 Sep 2015 10:49:57 +0200
parents edee860b9f61
children 252d698ae642
rev   line source
amine@2 1 '''
amine@2 2 @author: Amine Sehili <amine.sehili@gmail.com>
amine@2 3 September 2015
amine@2 4
amine@2 5 '''
amine@2 6
amine@2 7 import unittest
amine@2 8 from auditok import StreamTokenizer, StringDataSource, DataValidator
amine@2 9
amine@2 10
class AValidator(DataValidator):
    """Validator that accepts a frame if and only if it equals "A"."""

    def is_valid(self, frame):
        # A frame counts as valid (non-silence) exactly when it is "A".
        return frame == "A"
amine@2 15
amine@2 16
class TestStreamTokenizerInitParams(unittest.TestCase):
    """Tests for the effect of `init_min` and `init_max_silence` on
    StreamTokenizer.

    Data sources are strings in which "A" is a valid frame and any other
    character is not. A token is a tuple (data, start, end) where both
    frame indices are inclusive.
    """

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_token(self, token, index, data, start, end):
        # Check one (data, start, end) token against the expected values,
        # producing a consistent, informative message on failure.
        found_data = ''.join(token[0])
        self.assertEqual(found_data, data,
                         msg="wrong data for token {0}, expected: '{1}', found: '{2}' ".format(index, data, found_data))
        self.assertEqual(token[1], start,
                         msg="wrong start frame for token {0}, expected: {1}, found: {2} ".format(index, start, token[1]))
        self.assertEqual(token[2], end,
                         msg="wrong end frame for token {0}, expected: {1}, found: {2} ".format(index, end, token[2]))

    def test_init_min_0_init_max_silence_0(self):
        # Completely deactivate init_min and init_max_silence:
        # the tokenizer relies only on the other parameters.
        # Note that with init_min = 0 the value of init_max_silence
        # has no effect.
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=0, init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        # Expected tokens: "AaaaAaAaaAaAaaaa" (frames 1..16)
        #                  "AAAAAAAA"         (frames 20..27)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2,
                         msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaaaa", 1, 16)
        self._assert_token(tokens[1], 2, "AAAAAAAA", 20, 27)

    def test_init_min_3_init_max_silence_0(self):
        # A token is only triggered if the tokenizer encounters at least
        # 3 valid frames (init_min = 3) between which there are at most
        # 0 consecutive non-valid frames (init_max_silence = 0).
        # In other words, a valid token must start with 3 valid frames.
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
        # Expected tokens: "AAAAAAAAAaaaa" (frames 18..30)
        #                  "AAAAA"         (frames 33..37)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2,
                         msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAAAAAAaaaa", 18, 30)
        self._assert_token(tokens[1], 2, "AAAAA", 33, 37)

    def test_init_min_3_init_max_silence_2(self):
        # A token is only triggered if the tokenizer encounters at least
        # 3 valid frames (init_min = 3) between which there are at most
        # 2 consecutive non-valid frames (init_max_silence = 2).
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=2, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
        # Expected tokens: "AaAaaAaAaaaa"  (frames 5..16)
        #                  "AAAAAAAAAaaaa" (frames 19..31)
        #                  "AAAAA"         (frames 35..39)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3,
                         msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AaAaaAaAaaaa", 5, 16)
        self._assert_token(tokens[1], 2, "AAAAAAAAAaaaa", 19, 31)
        self._assert_token(tokens[2], 3, "AAAAA", 35, 39)
amine@2 152
amine@2 153
amine@2 154
class TestStreamTokenizerMinMaxLength(unittest.TestCase):
    """Tests for the `min_length` / `max_length` parameters of
    StreamTokenizer.

    Data sources are strings in which "A" is a valid frame and any other
    character is not. A token is a tuple (data, start, end) where both
    frame indices are inclusive.
    """

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_token(self, token, index, data, start, end):
        # Check one (data, start, end) token against the expected values,
        # producing a consistent, informative message on failure.
        found_data = ''.join(token[0])
        self.assertEqual(found_data, data,
                         msg="wrong data for token {0}, expected: '{1}', found: '{2}' ".format(index, data, found_data))
        self.assertEqual(token[1], start,
                         msg="wrong start frame for token {0}, expected: {1}, found: {2} ".format(index, start, token[1]))
        self.assertEqual(token[2], end,
                         msg="wrong end frame for token {0}, expected: {1}, found: {2} ".format(index, end, token[2]))

    def test_min_length_6_init_max_length_20(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=6,
                                    max_length=20, max_continuous_silence=2,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        # Expected tokens: "AaaaAaAaaAaAaa" (frames 1..14)
        #                  "AAAAAAAAAaa"    (frames 18..28)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2,
                         msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaa", 1, 14)
        self._assert_token(tokens[1], 2, "AAAAAAAAAaa", 18, 28)

    def test_min_length_1_init_max_length_1(self):
        # With min_length = max_length = 1 and no tolerated silence,
        # every single valid frame becomes its own token.
        tokenizer = StreamTokenizer(self.A_validator, min_length=1,
                                    max_length=1, max_continuous_silence=0,
                                    init_min=0, init_max_silence=0, mode=0)

        data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")

        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 21,
                         msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens)))

    def test_min_length_10_init_max_length_20(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=10,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
        # Expected tokens: "AaaaAaAaaAaAaaaa" (frames 1..16)
        #                  "AAAAAaaAAaaAAA"   (frames 30..43)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2,
                         msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AaaaAaAaaAaAaaaa", 1, 16)
        self._assert_token(tokens[1], 2, "AAAAAaaAAaaAAA", 30, 43)

    def test_min_length_4_init_max_length_5(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=4,
                                    max_length=5, max_continuous_silence=4,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
        # Expected tokens: "AAAAA" (frames 18..22), "AAAaa" (frames 23..27),
        #                  "AAAAA" (frames 32..36), "AAaaA" (frames 42..46)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 4,
                         msg="wrong number of tokens, expected: 4, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAA", 18, 22)
        self._assert_token(tokens[1], 2, "AAAaa", 23, 27)
        self._assert_token(tokens[2], 3, "AAAAA", 32, 36)
        self._assert_token(tokens[3], 4, "AAaaA", 42, 46)
amine@2 297
amine@2 298
class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):
    """Tests for the `max_continuous_silence` parameter of StreamTokenizer.

    Data sources are strings in which "A" is a valid frame and any other
    character is not. A token is a tuple (data, start, end) where both
    frame indices are inclusive.
    """

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_token(self, token, index, data, start, end):
        # Check one (data, start, end) token against the expected values,
        # producing a consistent, informative message on failure.
        found_data = ''.join(token[0])
        self.assertEqual(found_data, data,
                         msg="wrong data for token {0}, expected: '{1}', found: '{2}' ".format(index, data, found_data))
        self.assertEqual(token[1], start,
                         msg="wrong start frame for token {0}, expected: {1}, found: {2} ".format(index, start, token[1]))
        self.assertEqual(token[2], end,
                         msg="wrong end frame for token {0}, expected: {1}, found: {2} ".format(index, end, token[2]))

    def test_min_5_max_10_max_continuous_silence_0(self):
        # No non-valid frame is tolerated inside a token.
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=0,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # Expected tokens: "AAAAA"     (frames 3..7)
        #                  "AAAAAA"    (frames 9..14)
        #                  "AAAAAAAAA" (frames 17..25)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3,
                         msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAA", 3, 7)
        self._assert_token(tokens[1], 2, "AAAAAA", 9, 14)
        self._assert_token(tokens[2], 3, "AAAAAAAAA", 17, 25)

    def test_min_5_max_10_max_continuous_silence_1(self):
        # Up to one consecutive non-valid frame is kept inside a token.
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=1,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        # Expected tokens: "AAAAAaAAAA" (frames 3..12)
        #                  "AAa"        (frames 13..15)
        #                  "AAAAAAAAAa" (frames 17..26)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3,
                         msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAAaAAAA", 3, 12)
        self._assert_token(tokens[1], 2, "AAa", 13, 15)
        self._assert_token(tokens[2], 3, "AAAAAAAAAa", 17, 26)
amine@2 392
amine@2 393
class TestStreamTokenizerModes(unittest.TestCase):
    """Tests for StreamTokenizer modes: STRICT_MIN_LENGTH and
    DROP_TAILING_SILENCE, used alone and combined.

    Data sources are strings in which "A" is a valid frame and any other
    character is not. A token is a tuple (data, start, end) where both
    frame indices are inclusive.
    """

    def setUp(self):
        self.A_validator = AValidator()

    def _assert_token(self, token, index, data, start, end):
        # Check one (data, start, end) token against the expected values,
        # producing a consistent, informative message on failure.
        found_data = ''.join(token[0])
        self.assertEqual(found_data, data,
                         msg="wrong data for token {0}, expected: '{1}', found: '{2}' ".format(index, data, found_data))
        self.assertEqual(token[1], start,
                         msg="wrong start frame for token {0}, expected: {1}, found: {2} ".format(index, start, token[1]))
        self.assertEqual(token[2], end,
                         msg="wrong end frame for token {0}, expected: {1}, found: {2} ".format(index, end, token[2]))

    def test_STRICT_MIN_LENGTH(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        # Expected single token: "AAAAAAAA" (frames 2..9)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1,
                         msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAAAAA", 2, 9)

    def test_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=2,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAaaaaa")
        # Expected single token: "AAAAA" (frames 2..6) -- the trailing
        # non-valid frames are not part of the delivered token.
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1,
                         msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAA", 2, 6)

    def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        # Expected single token: "AAAAAAAA" (frames 2..9)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1,
                         msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
        self._assert_token(tokens[0], 1, "AAAAAAAA", 2, 9)
amine@2 472
amine@2 473
class TestStreamTokenizerCallback(unittest.TestCase):
    """Test delivering tokens through a callback instead of a returned
    list."""

    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):
        tokens = []

        def callback(data, start, end):
            # Collect each delivered token as a (data, start, end) tuple.
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        # Expected tokens: frames 2..9 and frames 10..14
        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(len(tokens), 2,
                         msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
amine@2 498
amine@2 499
amine@2 500
if __name__ == "__main__":
    # Run the whole test module when executed as a script.
    unittest.main()