'''
@author: Amine Sehili <amine.sehili@gmail.com>
September 2015
'''

import unittest
from auditok import StreamTokenizer, StringDataSource, DataValidator


class AValidator(DataValidator):

    def is_valid(self, frame):
        return frame == "A"
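

# A minimal usage sketch (not one of the tests): it shows how the tests
# below drive the tokenizer. The parameter values here are illustrative
# assumptions, not canonical settings.
def _example_usage():
    tokenizer = StreamTokenizer(AValidator(), min_length=1, max_length=99,
                                max_continuous_silence=0, init_min=0,
                                init_max_silence=0, mode=0)
    # "A" frames are valid, "a" frames are silence; with
    # max_continuous_silence=0 the single run of "A"s yields one token
    tokens = tokenizer.tokenize(StringDataSource("aaAAAAAaa"))
    for data, start, end in tokens:
        # data is a list of frames; start and end are inclusive frame indices
        print("{0} {1} {2}".format(''.join(data), start, end))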


class TestStreamTokenizerInitParams(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    # Completely deactivate init_min and init_max_silence:
    # the tokenizer will then only rely on the other parameters.
    # Note that if init_min == 0, the value of init_max_silence
    # has no effect.
    def test_init_min_0_init_max_silence_0(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=0, init_max_silence=0, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
        #                                ^              ^   ^      ^
        #                                1              16  20     27
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        # Each token is a tuple:
        # tok[0]: data (list of frames)
        # tok[1]: start frame (inclusive)
        # tok[2]: end frame (inclusive)
        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0}".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0}".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0}".format(end))

    # A valid token is detected only if the tokenizer encounters
    # at least 3 valid frames (init_min == 3) with at most 0 consecutive
    # non-valid frames between them (init_max_silence == 0).
    # In other words, a valid token must start with 3 consecutive valid frames.
    def test_init_min_3_init_max_silence_0(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=0, mode=0)

        #data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        #                                                  ^       ^     ^   ^
        #                                                  18      26    32  36

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
        #                                                 ^           ^  ^   ^
        #                                                 18          30 33  37
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 2, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0}".format(start))
        self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0}".format(end))

    # A valid token is detected only if the tokenizer encounters
    # at least 3 valid frames (init_min == 3) with at most 2 consecutive
    # non-valid frames between them (init_max_silence == 2).
    def test_init_min_3_init_max_silence_2(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=2, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
        #                                    ^          ^  ^           ^   ^   ^
        #                                    5          16 19          31  35  39
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
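        # The first token starts at frame 5: the "A" at frame 1 is followed
        # by three consecutive "a"s, which exceeds init_max_silence=2 and
        # resets the search for an initial sequence of valid frames.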
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaAaaAaAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0}".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaaaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0}".format(start))
        self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 35, msg="wrong start frame for token 3, expected: 35, found: {0}".format(start))
        self.assertEqual(end, 39, msg="wrong end frame for token 3, expected: 39, found: {0}".format(end))


class TestStreamTokenizerMinMaxLength(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_min_length_6_init_max_length_20(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=6,
                                    max_length=20, max_continuous_silence=2,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        #                                ^            ^   ^        ^
        #                                1            14  18       28
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}'".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0}".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAAAAAaa",
                         msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 2, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 28, msg="wrong end frame for token 2, expected: 28, found: {0}".format(end))

    def test_min_length_1_init_max_length_1(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=1,
                                    max_length=1, max_continuous_silence=0,
                                    init_min=0, init_max_silence=0, mode=0)

        data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0}".format(len(tokens)))
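        # With min_length == max_length == 1 and max_continuous_silence == 0,
        # every valid frame becomes a token of its own, and the stream
        # contains 21 "A" frames.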

    def test_min_length_10_init_max_length_20(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=10,
                                    max_length=20, max_continuous_silence=4,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
        #                                ^              ^             ^            ^
        #                                1              16            30           43
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
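        # The run starting at frame 19 only spans frames 19..27 (five "A"s
        # plus four trailing "a"s) before max_continuous_silence is exceeded;
        # that is 9 frames, below min_length=10, so it yields no token.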
        tok1, tok2 = tokens[0], tokens[1]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AaaaAaAaaAaAaaaa",
                         msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}'".format(data))
        self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0}".format(start))
        self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAaaAAaaAAA",
                         msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}'".format(data))
        self.assertEqual(start, 30, msg="wrong start frame for token 2, expected: 30, found: {0}".format(start))
        self.assertEqual(end, 43, msg="wrong end frame for token 2, expected: 43, found: {0}".format(end))

    def test_min_length_4_init_max_length_5(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=4,
                                    max_length=5, max_continuous_silence=4,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
        #                                                 ^   ^^   ^    ^   ^     ^   ^
        #                                                 18  2223 27   32  36    42  46
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0}".format(len(tokens)))
        tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0}".format(start))
        self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAaa",
                         msg="wrong data for token 2, expected: 'AAAaa', found: '{0}'".format(data))
        self.assertEqual(start, 23, msg="wrong start frame for token 2, expected: 23, found: {0}".format(start))
        self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 3, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 32, msg="wrong start frame for token 3, expected: 32, found: {0}".format(start))
        self.assertEqual(end, 36, msg="wrong end frame for token 3, expected: 36, found: {0}".format(end))

        data = ''.join(tok4[0])
        start = tok4[1]
        end = tok4[2]
        self.assertEqual(data, "AAaaA",
                         msg="wrong data for token 4, expected: 'AAaaA', found: '{0}'".format(data))
        self.assertEqual(start, 42, msg="wrong start frame for token 4, expected: 42, found: {0}".format(start))
        self.assertEqual(end, 46, msg="wrong end frame for token 4, expected: 46, found: {0}".format(end))


class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_min_5_max_10_max_continuous_silence_0(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=0,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^   ^ ^    ^  ^       ^
        #                                  3   7 9    14 17      25
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
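        # With max_continuous_silence=0 a token is a maximal run of valid
        # frames (capped at max_length=10), so each single "a" splits the
        # stream into a new token.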
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0}".format(start))
        self.assertEqual(end, 7, msg="wrong end frame for token 1, expected: 7, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAAAAA",
                         msg="wrong data for token 2, expected: 'AAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 9, msg="wrong start frame for token 2, expected: 9, found: {0}".format(start))
        self.assertEqual(end, 14, msg="wrong end frame for token 2, expected: 14, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAA",
                         msg="wrong data for token 3, expected: 'AAAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0}".format(start))
        self.assertEqual(end, 25, msg="wrong end frame for token 3, expected: 25, found: {0}".format(end))

    def test_min_5_max_10_max_continuous_silence_1(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=1,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
        #                                  ^        ^^ ^ ^        ^
        #                                  3        12131517      26
        #                                          (12 13 15 17)
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0}".format(len(tokens)))
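        # tok2 ("AAa", 3 frames) is below min_length=5 but is still
        # delivered: it directly continues a token that was truncated at
        # max_length, and STRICT_MIN_LENGTH is not set.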
        tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAaAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0}".format(start))
        self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 12, found: {0}".format(end))

        data = ''.join(tok2[0])
        start = tok2[1]
        end = tok2[2]
        self.assertEqual(data, "AAa",
                         msg="wrong data for token 2, expected: 'AAa', found: '{0}'".format(data))
        self.assertEqual(start, 13, msg="wrong start frame for token 2, expected: 13, found: {0}".format(start))
        self.assertEqual(end, 15, msg="wrong end frame for token 2, expected: 15, found: {0}".format(end))

        data = ''.join(tok3[0])
        start = tok3[1]
        end = tok3[2]
        self.assertEqual(data, "AAAAAAAAAa",
                         msg="wrong data for token 3, expected: 'AAAAAAAAAa', found: '{0}'".format(data))
        self.assertEqual(start, 17, msg="wrong start frame for token 3, expected: 17, found: {0}".format(start))
        self.assertEqual(end, 26, msg="wrong end frame for token 3, expected: 26, found: {0}".format(end))


class TestStreamTokenizerModes(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_STRICT_MIN_LENGTH(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)

        data_source = StringDataSource("aaAAAAAAAAAAAA")
        #                                 ^      ^
        #                                 2      9
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
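        # Only one token: the four remaining "A"s (frames 10..13) are
        # shorter than min_length=5, and STRICT_MIN_LENGTH discards a
        # too-short token that follows a token truncated at max_length.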
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0}".format(end))

    def test_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=10, max_continuous_silence=2,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAaaaaa")
        #                                 ^   ^
        #                                 2   6
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
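        # Without DROP_TAILING_SILENCE the token would keep up to
        # max_continuous_silence=2 trailing "a" frames; the flag trims them,
        # so the token ends at the last valid frame.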
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAA",
                         msg="wrong data for token 1, expected: 'AAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 6, msg="wrong end frame for token 1, expected: 6, found: {0}".format(end))

    def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):
        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3,
                                    mode=StreamTokenizer.STRICT_MIN_LENGTH |
                                         StreamTokenizer.DROP_TAILING_SILENCE)

        data_source = StringDataSource("aaAAAAAAAAAAAAaa")
        #                                 ^      ^
        #                                 2      9
        tokens = tokenizer.tokenize(data_source)

        self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0}".format(len(tokens)))
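        # DROP_TAILING_SILENCE trims the would-be second token down to its
        # four valid frames (10..13), and STRICT_MIN_LENGTH then discards it
        # for being shorter than min_length=5.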
        tok1 = tokens[0]

        data = ''.join(tok1[0])
        start = tok1[1]
        end = tok1[2]
        self.assertEqual(data, "AAAAAAAA",
                         msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}'".format(data))
        self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0}".format(start))
        self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0}".format(end))


class TestStreamTokenizerCallback(unittest.TestCase):

    def setUp(self):
        self.A_validator = AValidator()

    def test_callback(self):
        tokens = []

        def callback(data, start, end):
            tokens.append((data, start, end))

        tokenizer = StreamTokenizer(self.A_validator, min_length=5,
                                    max_length=8, max_continuous_silence=3,
                                    init_min=3, init_max_silence=3, mode=0)

        data_source = StringDataSource("aaAAAAAAAAAAAAa")
        #                                 ^      ^^   ^
        #                                 2      910  14
        tokenizer.tokenize(data_source, callback=callback)

        self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0}".format(len(tokens)))
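        # Two tokens are delivered through the callback: frames 2..9 (cut
        # at max_length=8) and frames 10..14, the latter keeping one
        # trailing silence frame since mode=0.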


if __name__ == "__main__":
    # import sys;sys.argv = ['', 'Test.testName']
    unittest.main()