amine@2
|
1 '''
|
amine@2
|
2 @author: Amine Sehili <amine.sehili@gmail.com>
|
amine@2
|
3 September 2015
|
amine@2
|
4
|
amine@2
|
5 '''
|
amine@2
|
6
|
amine@2
|
7 import unittest
|
amine@2
|
8 from auditok import StreamTokenizer, StringDataSource, DataValidator
|
amine@2
|
9
|
amine@2
|
10
|
amine@2
|
11 class AValidator(DataValidator):
|
amine@2
|
12
|
amine@2
|
13 def is_valid(self, frame):
|
amine@2
|
14 return frame == "A"
|
amine@2
|
15
|
amine@2
|
16
|
amine@2
|
17 class TestStreamTokenizerInitParams(unittest.TestCase):
|
amine@2
|
18
|
amine@2
|
19
|
amine@2
|
20 def setUp(self):
|
amine@2
|
21 self.A_validator = AValidator()
|
amine@2
|
22
|
amine@2
|
23 # Completely deactivate init_min and init_max_silence
|
amine@2
|
24 # The tokenizer will only rely on the other parameters
|
amine@2
|
25 # Note that if init_min = 0, the value of init_max_silence
|
amine@2
|
26 # will have no effect
|
amine@2
|
27 def test_init_min_0_init_max_silence_0(self):
|
amine@2
|
28
|
amine@2
|
29 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20,
|
amine@2
|
30 max_continuous_silence=4, init_min = 0,
|
amine@2
|
31 init_max_silence = 0, mode=0)
|
amine@2
|
32
|
amine@2
|
33
|
amine@2
|
34 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA")
|
amine@19
|
35 # ^ ^ ^ ^
|
amine@19
|
36 # 2 16 20 27
|
amine@2
|
37 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
38
|
amine@2
|
39 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
|
amine@2
|
40 tok1, tok2 = tokens[0], tokens[1]
|
amine@2
|
41
|
amine@2
|
42 # tok1[0]: data
|
amine@2
|
43 # tok1[1]: start frame (included)
|
amine@2
|
44 # tok1[2]: end frame (included)
|
amine@2
|
45
|
amine@2
|
46 data = ''.join(tok1[0])
|
amine@2
|
47 start = tok1[1]
|
amine@2
|
48 end = tok1[2]
|
amine@2
|
49 self.assertEqual(data, "AaaaAaAaaAaAaaaa",
|
amine@2
|
50 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format(data))
|
amine@2
|
51 self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
|
amine@2
|
52 self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))
|
amine@2
|
53
|
amine@2
|
54
|
amine@2
|
55 data = ''.join(tok2[0])
|
amine@2
|
56 start = tok2[1]
|
amine@2
|
57 end = tok2[2]
|
amine@2
|
58 self.assertEqual(data, "AAAAAAAA",
|
amine@2
|
59 msg="wrong data for token 1, expected: 'AAAAAAAA', found: {0} ".format(data))
|
amine@2
|
60 self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0} ".format(start))
|
amine@2
|
61 self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0} ".format(end))
|
amine@2
|
62
|
amine@2
|
63
|
amine@2
|
64
|
amine@5
|
65 # A valid token is considered as so iff the tokenizer encounters
|
amine@2
|
66 # at least valid frames (init_min = 3) between witch there
|
amine@2
|
67 # are at most 0 consecutive non valid frames (init_max_silence = 0)
|
amine@2
|
68 # The tokenizer will only rely on the other parameters
|
amine@2
|
69 # In other words, a valid token must start with 3 valid frames
|
amine@2
|
70 def test_init_min_3_init_max_silence_0(self):
|
amine@2
|
71
|
amine@2
|
72 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20,
|
amine@2
|
73 max_continuous_silence=4, init_min = 3,
|
amine@2
|
74 init_max_silence = 0, mode=0)
|
amine@2
|
75
|
amine@2
|
76
|
amine@2
|
77
|
amine@2
|
78 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA")
|
amine@5
|
79 # ^ ^ ^ ^
|
amine@5
|
80 # 18 30 33 37
|
amine@2
|
81
|
amine@2
|
82 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
83
|
amine@2
|
84 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
|
amine@2
|
85 tok1, tok2 = tokens[0], tokens[1]
|
amine@2
|
86
|
amine@2
|
87 data = ''.join(tok1[0])
|
amine@2
|
88 start = tok1[1]
|
amine@2
|
89 end = tok1[2]
|
amine@2
|
90 self.assertEqual(data, "AAAAAAAAAaaaa",
|
amine@2
|
91 msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data))
|
amine@2
|
92 self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start))
|
amine@2
|
93 self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0} ".format(end))
|
amine@2
|
94
|
amine@2
|
95
|
amine@2
|
96 data = ''.join(tok2[0])
|
amine@2
|
97 start = tok2[1]
|
amine@2
|
98 end = tok2[2]
|
amine@2
|
99 self.assertEqual(data, "AAAAA",
|
amine@2
|
100 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
101 self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0} ".format(start))
|
amine@2
|
102 self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0} ".format(end))
|
amine@2
|
103
|
amine@2
|
104
|
amine@2
|
105 # A valid token is considered iff the tokenizer encounters
|
amine@2
|
106 # at least valid frames (init_min = 3) between witch there
|
amine@2
|
107 # are at most 2 consecutive non valid frames (init_max_silence = 2)
|
amine@2
|
108 def test_init_min_3_init_max_silence_2(self):
|
amine@2
|
109
|
amine@2
|
110 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20,
|
amine@2
|
111 max_continuous_silence=4, init_min = 3,
|
amine@2
|
112 init_max_silence = 2, mode=0)
|
amine@2
|
113
|
amine@2
|
114
|
amine@2
|
115 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA")
|
amine@5
|
116 # ^ ^ ^ ^ ^ ^
|
amine@5
|
117 # 5 16 19 31 35 39
|
amine@2
|
118 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
119
|
amine@2
|
120 self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
|
amine@2
|
121 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
|
amine@2
|
122
|
amine@2
|
123
|
amine@2
|
124 data = ''.join(tok1[0])
|
amine@2
|
125 start = tok1[1]
|
amine@2
|
126 end = tok1[2]
|
amine@2
|
127 self.assertEqual(data, "AaAaaAaAaaaa",
|
amine@2
|
128 msg="wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' ".format(data))
|
amine@2
|
129 self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0} ".format(start))
|
amine@2
|
130 self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))
|
amine@2
|
131
|
amine@2
|
132
|
amine@2
|
133 data = ''.join(tok2[0])
|
amine@2
|
134 start = tok2[1]
|
amine@2
|
135 end = tok2[2]
|
amine@2
|
136 self.assertEqual(data, "AAAAAAAAAaaaa",
|
amine@2
|
137 msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data))
|
amine@2
|
138 self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0} ".format(start))
|
amine@2
|
139 self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0} ".format(end))
|
amine@2
|
140
|
amine@2
|
141
|
amine@2
|
142 data = ''.join(tok3[0])
|
amine@2
|
143 start = tok3[1]
|
amine@2
|
144 end = tok3[2]
|
amine@2
|
145 self.assertEqual(data, "AAAAA",
|
amine@2
|
146 msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
147 self.assertEqual(start, 35, msg="wrong start frame for token 2, expected: 35, found: {0} ".format(start))
|
amine@2
|
148 self.assertEqual(end, 39, msg="wrong end frame for token 2, expected: 39, found: {0} ".format(end))
|
amine@2
|
149
|
amine@2
|
150
|
amine@2
|
151
|
amine@2
|
152 class TestStreamTokenizerMinMaxLength(unittest.TestCase):
|
amine@2
|
153
|
amine@2
|
154 def setUp(self):
|
amine@2
|
155 self.A_validator = AValidator()
|
amine@2
|
156
|
amine@2
|
157
|
amine@2
|
158 def test_min_length_6_init_max_length_20(self):
|
amine@2
|
159
|
amine@2
|
160 tokenizer = StreamTokenizer(self.A_validator, min_length = 6, max_length=20,
|
amine@2
|
161 max_continuous_silence=2, init_min = 3,
|
amine@2
|
162 init_max_silence = 3, mode=0)
|
amine@2
|
163
|
amine@2
|
164
|
amine@2
|
165 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
|
amine@5
|
166 # ^ ^ ^ ^
|
amine@5
|
167 # 1 14 18 28
|
amine@2
|
168
|
amine@2
|
169 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
170
|
amine@2
|
171 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
|
amine@2
|
172 tok1, tok2 = tokens[0], tokens[1]
|
amine@2
|
173
|
amine@2
|
174
|
amine@2
|
175 data = ''.join(tok1[0])
|
amine@2
|
176 start = tok1[1]
|
amine@2
|
177 end = tok1[2]
|
amine@2
|
178 self.assertEqual(data, "AaaaAaAaaAaAaa",
|
amine@2
|
179 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaa', found: '{0}' ".format(data))
|
amine@2
|
180 self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
|
amine@2
|
181 self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end))
|
amine@2
|
182
|
amine@2
|
183
|
amine@2
|
184 data = ''.join(tok2[0])
|
amine@2
|
185 start = tok2[1]
|
amine@2
|
186 end = tok2[2]
|
amine@2
|
187 self.assertEqual(data, "AAAAAAAAAaa",
|
amine@2
|
188 msg="wrong data for token 2, expected: 'AAAAAAAAAaa', found: '{0}' ".format(data))
|
amine@2
|
189 self.assertEqual(start, 18, msg="wrong start frame for token 2, expected: 18, found: {0} ".format(start))
|
amine@2
|
190 self.assertEqual(end, 28, msg="wrong end frame for token 2, expected: 28, found: {0} ".format(end))
|
amine@2
|
191
|
amine@2
|
192
|
amine@2
|
193 def test_min_length_1_init_max_length_1(self):
|
amine@2
|
194
|
amine@2
|
195 tokenizer = StreamTokenizer(self.A_validator, min_length = 1, max_length=1,
|
amine@2
|
196 max_continuous_silence=0, init_min = 0,
|
amine@2
|
197 init_max_silence = 0, mode=0)
|
amine@2
|
198
|
amine@2
|
199
|
amine@2
|
200 data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA")
|
amine@2
|
201
|
amine@2
|
202 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
203
|
amine@2
|
204 self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens)))
|
amine@2
|
205
|
amine@2
|
206
|
amine@2
|
207 def test_min_length_10_init_max_length_20(self):
|
amine@2
|
208
|
amine@2
|
209 tokenizer = StreamTokenizer(self.A_validator, min_length = 10, max_length=20,
|
amine@2
|
210 max_continuous_silence=4, init_min = 3,
|
amine@2
|
211 init_max_silence = 3, mode=0)
|
amine@2
|
212
|
amine@2
|
213
|
amine@2
|
214 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA")
|
amine@5
|
215 # ^ ^ ^ ^
|
amine@5
|
216 # 1 16 30 45
|
amine@2
|
217
|
amine@2
|
218 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
219
|
amine@2
|
220 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))
|
amine@2
|
221 tok1, tok2 = tokens[0], tokens[1]
|
amine@2
|
222
|
amine@2
|
223
|
amine@2
|
224 data = ''.join(tok1[0])
|
amine@2
|
225 start = tok1[1]
|
amine@2
|
226 end = tok1[2]
|
amine@2
|
227 self.assertEqual(data, "AaaaAaAaaAaAaaaa",
|
amine@2
|
228 msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: '{0}' ".format(data))
|
amine@2
|
229 self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
|
amine@2
|
230 self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end))
|
amine@2
|
231
|
amine@2
|
232
|
amine@2
|
233 data = ''.join(tok2[0])
|
amine@2
|
234 start = tok2[1]
|
amine@2
|
235 end = tok2[2]
|
amine@2
|
236 self.assertEqual(data, "AAAAAaaAAaaAAA",
|
amine@2
|
237 msg="wrong data for token 2, expected: 'AAAAAaaAAaaAAA', found: '{0}' ".format(data))
|
amine@2
|
238 self.assertEqual(start, 30, msg="wrong start frame for token 2, expected: 30, found: {0} ".format(start))
|
amine@2
|
239 self.assertEqual(end, 43, msg="wrong end frame for token 2, expected: 43, found: {0} ".format(end))
|
amine@2
|
240
|
amine@2
|
241
|
amine@2
|
242
|
amine@2
|
243 def test_min_length_4_init_max_length_5(self):
|
amine@2
|
244
|
amine@2
|
245 tokenizer = StreamTokenizer(self.A_validator, min_length = 4, max_length=5,
|
amine@2
|
246 max_continuous_silence=4, init_min = 3,
|
amine@2
|
247 init_max_silence = 3, mode=0)
|
amine@2
|
248
|
amine@2
|
249
|
amine@2
|
250 data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa")
|
amine@5
|
251 # ^ ^^ ^ ^ ^ ^ ^
|
amine@5
|
252 # 18 2223 27 32 36 42 46
|
amine@2
|
253
|
amine@2
|
254 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
255
|
amine@2
|
256 self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0} ".format(len(tokens)))
|
amine@2
|
257 tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3]
|
amine@2
|
258
|
amine@2
|
259
|
amine@2
|
260 data = ''.join(tok1[0])
|
amine@2
|
261 start = tok1[1]
|
amine@2
|
262 end = tok1[2]
|
amine@2
|
263 self.assertEqual(data, "AAAAA",
|
amine@2
|
264 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
265 self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start))
|
amine@2
|
266 self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0} ".format(end))
|
amine@2
|
267
|
amine@2
|
268
|
amine@2
|
269 data = ''.join(tok2[0])
|
amine@2
|
270 start = tok2[1]
|
amine@2
|
271 end = tok2[2]
|
amine@2
|
272 self.assertEqual(data, "AAAaa",
|
amine@2
|
273 msg="wrong data for token 1, expected: 'AAAaa', found: '{0}' ".format(data))
|
amine@2
|
274 self.assertEqual(start, 23, msg="wrong start frame for token 1, expected: 23, found: {0} ".format(start))
|
amine@2
|
275 self.assertEqual(end, 27, msg="wrong end frame for token 1, expected: 27, found: {0} ".format(end))
|
amine@2
|
276
|
amine@2
|
277
|
amine@2
|
278 data = ''.join(tok3[0])
|
amine@2
|
279 start = tok3[1]
|
amine@2
|
280 end = tok3[2]
|
amine@2
|
281 self.assertEqual(data, "AAAAA",
|
amine@2
|
282 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
283 self.assertEqual(start, 32, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start))
|
amine@2
|
284 self.assertEqual(end, 36, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end))
|
amine@2
|
285
|
amine@2
|
286
|
amine@2
|
287 data = ''.join(tok4[0])
|
amine@2
|
288 start = tok4[1]
|
amine@2
|
289 end = tok4[2]
|
amine@2
|
290 self.assertEqual(data, "AAaaA",
|
amine@2
|
291 msg="wrong data for token 2, expected: 'AAaaA', found: '{0}' ".format(data))
|
amine@2
|
292 self.assertEqual(start, 42, msg="wrong start frame for token 2, expected: 17, found: {0} ".format(start))
|
amine@2
|
293 self.assertEqual(end, 46, msg="wrong end frame for token 2, expected: 22, found: {0} ".format(end))
|
amine@2
|
294
|
amine@2
|
295
|
amine@2
|
296 class TestStreamTokenizerMaxContinuousSilence(unittest.TestCase):
|
amine@2
|
297
|
amine@2
|
298 def setUp(self):
|
amine@2
|
299 self.A_validator = AValidator()
|
amine@2
|
300
|
amine@2
|
301
|
amine@2
|
302 def test_min_5_max_10_max_continuous_silence_0(self):
|
amine@2
|
303
|
amine@2
|
304 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10,
|
amine@2
|
305 max_continuous_silence=0, init_min = 3,
|
amine@2
|
306 init_max_silence = 3, mode=0)
|
amine@2
|
307
|
amine@2
|
308 data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
|
amine@5
|
309 # ^ ^ ^ ^ ^ ^
|
amine@5
|
310 # 3 7 9 14 17 25
|
amine@2
|
311
|
amine@2
|
312 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
313
|
amine@2
|
314 self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
|
amine@2
|
315 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
|
amine@2
|
316
|
amine@2
|
317
|
amine@2
|
318 data = ''.join(tok1[0])
|
amine@2
|
319 start = tok1[1]
|
amine@2
|
320 end = tok1[2]
|
amine@2
|
321 self.assertEqual(data, "AAAAA",
|
amine@2
|
322 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
323 self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start))
|
amine@2
|
324 self.assertEqual(end, 7, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end))
|
amine@2
|
325
|
amine@2
|
326
|
amine@2
|
327 data = ''.join(tok2[0])
|
amine@2
|
328 start = tok2[1]
|
amine@2
|
329 end = tok2[2]
|
amine@2
|
330 self.assertEqual(data, "AAAAAA",
|
amine@2
|
331 msg="wrong data for token 1, expected: 'AAAAAA', found: '{0}' ".format(data))
|
amine@2
|
332 self.assertEqual(start, 9, msg="wrong start frame for token 1, expected: 9, found: {0} ".format(start))
|
amine@2
|
333 self.assertEqual(end, 14, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end))
|
amine@2
|
334
|
amine@2
|
335
|
amine@2
|
336 data = ''.join(tok3[0])
|
amine@2
|
337 start = tok3[1]
|
amine@2
|
338 end = tok3[2]
|
amine@2
|
339 self.assertEqual(data, "AAAAAAAAA",
|
amine@2
|
340 msg="wrong data for token 1, expected: 'AAAAAAAAA', found: '{0}' ".format(data))
|
amine@2
|
341 self.assertEqual(start, 17, msg="wrong start frame for token 1, expected: 17, found: {0} ".format(start))
|
amine@2
|
342 self.assertEqual(end, 25, msg="wrong end frame for token 1, expected: 25, found: {0} ".format(end))
|
amine@2
|
343
|
amine@2
|
344
|
amine@2
|
345
|
amine@2
|
346
|
amine@2
|
347 def test_min_5_max_10_max_continuous_silence_1(self):
|
amine@2
|
348
|
amine@2
|
349 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10,
|
amine@2
|
350 max_continuous_silence=1, init_min = 3,
|
amine@2
|
351 init_max_silence = 3, mode=0)
|
amine@2
|
352
|
amine@2
|
353 data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa")
|
amine@5
|
354 # ^ ^^ ^ ^ ^
|
amine@5
|
355 # 3 12131517 26
|
amine@5
|
356 # (12 13 15 17)
|
amine@2
|
357
|
amine@2
|
358 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
359
|
amine@2
|
360 self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens)))
|
amine@2
|
361 tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2]
|
amine@2
|
362
|
amine@2
|
363
|
amine@2
|
364 data = ''.join(tok1[0])
|
amine@2
|
365 start = tok1[1]
|
amine@2
|
366 end = tok1[2]
|
amine@2
|
367 self.assertEqual(data, "AAAAAaAAAA",
|
amine@2
|
368 msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format(data))
|
amine@2
|
369 self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start))
|
amine@2
|
370 self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 10, found: {0} ".format(end))
|
amine@2
|
371
|
amine@2
|
372
|
amine@2
|
373 data = ''.join(tok2[0])
|
amine@2
|
374 start = tok2[1]
|
amine@2
|
375 end = tok2[2]
|
amine@2
|
376 self.assertEqual(data, "AAa",
|
amine@2
|
377 msg="wrong data for token 1, expected: 'AAa', found: '{0}' ".format(data))
|
amine@2
|
378 self.assertEqual(start, 13, msg="wrong start frame for token 1, expected: 9, found: {0} ".format(start))
|
amine@2
|
379 self.assertEqual(end, 15, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end))
|
amine@2
|
380
|
amine@2
|
381
|
amine@2
|
382 data = ''.join(tok3[0])
|
amine@2
|
383 start = tok3[1]
|
amine@2
|
384 end = tok3[2]
|
amine@2
|
385 self.assertEqual(data, "AAAAAAAAAa",
|
amine@2
|
386 msg="wrong data for token 1, expected: 'AAAAAAAAAa', found: '{0}' ".format(data))
|
amine@2
|
387 self.assertEqual(start, 17, msg="wrong start frame for token 1, expected: 17, found: {0} ".format(start))
|
amine@2
|
388 self.assertEqual(end, 26, msg="wrong end frame for token 1, expected: 26, found: {0} ".format(end))
|
amine@2
|
389
|
amine@2
|
390
|
amine@2
|
391 class TestStreamTokenizerModes(unittest.TestCase):
|
amine@2
|
392
|
amine@2
|
393 def setUp(self):
|
amine@2
|
394 self.A_validator = AValidator()
|
amine@2
|
395
|
amine@2
|
396 def test_STRICT_MIN_LENGTH(self):
|
amine@2
|
397
|
amine@2
|
398 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8,
|
amine@2
|
399 max_continuous_silence=3, init_min = 3,
|
amine@2
|
400 init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH)
|
amine@2
|
401
|
amine@2
|
402 data_source = StringDataSource("aaAAAAAAAAAAAA")
|
amine@5
|
403 # ^ ^
|
amine@5
|
404 # 2 9
|
amine@2
|
405
|
amine@2
|
406 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
407
|
amine@2
|
408 self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
|
amine@2
|
409 tok1 = tokens[0]
|
amine@2
|
410
|
amine@2
|
411
|
amine@2
|
412 data = ''.join(tok1[0])
|
amine@2
|
413 start = tok1[1]
|
amine@2
|
414 end = tok1[2]
|
amine@2
|
415 self.assertEqual(data, "AAAAAAAA",
|
amine@2
|
416 msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data))
|
amine@2
|
417 self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
|
amine@2
|
418 self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end))
|
amine@2
|
419
|
amine@2
|
420
|
amine@3
|
421 def test_DROP_TAILING_SILENCE(self):
|
amine@2
|
422
|
amine@2
|
423 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10,
|
amine@2
|
424 max_continuous_silence=2, init_min = 3,
|
amine@3
|
425 init_max_silence = 3, mode=StreamTokenizer.DROP_TAILING_SILENCE)
|
amine@2
|
426
|
amine@2
|
427 data_source = StringDataSource("aaAAAAAaaaaa")
|
amine@5
|
428 # ^ ^
|
amine@5
|
429 # 2 6
|
amine@2
|
430
|
amine@2
|
431 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
432
|
amine@2
|
433 self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
|
amine@2
|
434 tok1 = tokens[0]
|
amine@2
|
435
|
amine@2
|
436
|
amine@2
|
437 data = ''.join(tok1[0])
|
amine@2
|
438 start = tok1[1]
|
amine@2
|
439 end = tok1[2]
|
amine@2
|
440 self.assertEqual(data, "AAAAA",
|
amine@2
|
441 msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data))
|
amine@2
|
442 self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
|
amine@2
|
443 self.assertEqual(end, 6, msg="wrong end frame for token 1, expected: 6, found: {0} ".format(end))
|
amine@2
|
444
|
amine@2
|
445
|
amine@3
|
446 def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self):
|
amine@2
|
447
|
amine@2
|
448 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8,
|
amine@2
|
449 max_continuous_silence=3, init_min = 3,
|
amine@3
|
450 init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE)
|
amine@2
|
451
|
amine@2
|
452 data_source = StringDataSource("aaAAAAAAAAAAAAaa")
|
amine@5
|
453 # ^ ^
|
amine@5
|
454 # 2 8
|
amine@2
|
455
|
amine@2
|
456 tokens = tokenizer.tokenize(data_source)
|
amine@2
|
457
|
amine@2
|
458 self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
|
amine@2
|
459 tok1 = tokens[0]
|
amine@2
|
460
|
amine@2
|
461
|
amine@2
|
462 data = ''.join(tok1[0])
|
amine@2
|
463 start = tok1[1]
|
amine@2
|
464 end = tok1[2]
|
amine@2
|
465 self.assertEqual(data, "AAAAAAAA",
|
amine@2
|
466 msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data))
|
amine@2
|
467 self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start))
|
amine@2
|
468 self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end))
|
amine@2
|
469
|
amine@2
|
470
|
amine@2
|
471 class TestStreamTokenizerCallback(unittest.TestCase):
|
amine@2
|
472
|
amine@2
|
473 def setUp(self):
|
amine@2
|
474 self.A_validator = AValidator()
|
amine@2
|
475
|
amine@2
|
476 def test_callback(self):
|
amine@2
|
477
|
amine@2
|
478 tokens = []
|
amine@2
|
479
|
amine@2
|
480 def callback(data, start, end):
|
amine@2
|
481 tokens.append((data, start, end))
|
amine@2
|
482
|
amine@2
|
483
|
amine@2
|
484 tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8,
|
amine@2
|
485 max_continuous_silence=3, init_min = 3,
|
amine@2
|
486 init_max_silence = 3, mode=0)
|
amine@2
|
487
|
amine@2
|
488 data_source = StringDataSource("aaAAAAAAAAAAAAa")
|
amine@5
|
489 # ^ ^^ ^
|
amine@5
|
490 # 2 910 14
|
amine@2
|
491
|
amine@2
|
492 tokenizer.tokenize(data_source, callback=callback)
|
amine@2
|
493
|
amine@2
|
494 self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
|
amine@2
|
495
|
amine@2
|
496
|
amine@2
|
497
|
amine@2
|
498 if __name__ == "__main__":
|
amine@2
|
499 #import sys;sys.argv = ['', 'Test.testName']
|
amine@2
|
500 unittest.main()
|