amine@33
|
1 """
|
amine@368
|
2 .. autosummary::
|
amine@368
|
3 :toctree: generated/
|
amine@33
|
4
|
amine@371
|
5 load
|
amine@368
|
6 split
|
amine@368
|
7 AudioRegion
|
amine@368
|
8 StreamTokenizer
|
amine@33
|
9 """
|
amine@404
|
10
|
amine@404
|
11 import math
|
amine@187
|
12 import os
|
amine@411
|
13 import warnings
|
amine@411
|
14 from dataclasses import dataclass, field
|
amine@411
|
15 from pathlib import Path
|
amine@404
|
16
|
amine@404
|
17 from .exceptions import TooSmallBlockDuration
|
amine@404
|
18 from .io import check_audio_data, get_audio_source, player_for, to_file
|
amine@404
|
19 from .plotting import plot
|
amine@404
|
20 from .util import AudioEnergyValidator, AudioReader, DataValidator
|
amine@263
|
21
|
amine@263
|
22 try:
|
amine@246
|
23 from . import signal_numpy as signal
|
amine@246
|
24 except ImportError:
|
amine@246
|
25 from . import signal
|
amine@246
|
26
|
amine@371
|
27 __all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
|
amine@179
|
28
|
amine@179
|
29
|
amine@179
|
# Default duration, in seconds, of one analysis window (0.05 s == 50 ms).
# `split` falls back to it when its input is not already an `AudioReader`
# and neither `analysis_window` nor `aw` is passed.
DEFAULT_ANALYSIS_WINDOW = 0.05
# Default energy threshold `split` hands to `AudioEnergyValidator` when no
# validator and no `energy_threshold` / `eth` is passed.
DEFAULT_ENERGY_THRESHOLD = 50
# Tiny offset `split` adds to `duration / analysis_window` before flooring,
# so that float artifacts such as `0.3 / 0.1 == 2.9999999999999996` do not
# lose a whole window.
_EPSILON = 1e-10
|
amine@179
|
33
|
amine@179
|
34
|
amine@371
|
def load(input, skip=0, max_read=None, **kwargs):
    """Read audio data from `input` and wrap it in an :class:`AudioRegion`.

    This is a module-level shortcut for :meth:`AudioRegion.load`.

    Parameters
    ----------
    input : None, str, bytes, AudioSource
        source to read audio data from. If `str`, it should be a path to a
        valid audio file. If `bytes`, it is used as raw audio data. If it is
        "-", raw data will be read from stdin. If None, read audio data from
        the microphone using PyAudio. If of type `bytes` or is a path to a
        raw audio file then `sampling_rate`, `sample_width` and `channels`
        parameters (or their alias) are required. If it's an
        :class:`AudioSource` object it's used directly to read data.
    skip : float, default: 0
        amount, in seconds, of audio data to skip from source. If read from
        a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
    max_read : float, default: None
        amount, in seconds, of audio data to read from source. If read from
        microphone, `max_read` should not be None, otherwise a `ValueError`
        is raised.
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
        be used if `input` is a string path to an audio file. If not given,
        audio type will be guessed from file name extension or from file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio file,
        a `bytes` object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or 4.
        Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    large_file : bool, default: False
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
        (and **only** these two formats) then the audio file is not fully
        loaded to memory in order to create the region (but the portion of
        data needed to create the region is of course loaded to memory). Set
        to True if `max_read` is significantly smaller than the size of a
        large audio file that shouldn't be entirely loaded to memory.

    Returns
    -------
    region: AudioRegion

    Raises
    ------
    ValueError
        raised when reading from the microphone (`input` is None) with a
        non-zero `skip`, or without an explicit `max_read`: no data can be
        skipped from a microphone and the amount of data to read must be
        given explicitly.
    """
    region = AudioRegion.load(input, skip=skip, max_read=max_read, **kwargs)
    return region
|
amine@371
|
90
|
amine@371
|
91
|
amine@179
|
def split(
    input,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """
    Detect audio events in `input` and return a generator of AudioRegions.

    Parameters
    ----------
    input : str, bytes, AudioSource, AudioReader, AudioRegion or None
        input audio data. If str, it should be a path to an existing audio
        file. "-" is interpreted as standard input. If bytes, input is
        considered as raw audio data. If None, read audio from microphone.
        Every object that is not an `AudioReader` will be transformed into an
        `AudioReader` before processing. If it is an `str` that refers to a
        raw audio file, `bytes` or None, audio parameters should be provided
        using kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or
        their alias).
        If `input` is str then audio format will be guessed from file
        extension. `audio_format` (alias `fmt`) kwarg can also be given to
        specify audio format explicitly. If none of these options is
        available, rely on backend (currently only pydub is supported) to
        load data.
    min_dur : float, default: 0.2
        minimum duration in seconds of a detected audio event. With large
        values of `min_dur`, very short audio events (e.g., very short 1-word
        utterances like 'yes' or 'no') can be missed. Using a very small
        value may result in a high number of too short audio events.
    max_dur : float, default: 5
        maximum duration in seconds of a detected audio event. If an audio
        event lasts more than `max_dur` it will be truncated. If the
        continuation of a truncated audio event is shorter than `min_dur`
        then this continuation is accepted as a valid audio event if
        `strict_min_dur` is False. Otherwise it is rejected.
    max_silence : float, default: 0.3
        maximum duration of continuous silence within an audio event. There
        might be many silent gaps of this duration within one audio event. If
        the continuous silence happens at the end of the event then it's kept
        as part of the event if `drop_trailing_silence` is False (default).
    drop_trailing_silence : bool, default: False
        Whether to remove trailing silence from detected events. To avoid
        abrupt cuts in speech, trailing silence should be kept, therefore
        this parameter should be False.
    strict_min_dur : bool, default: False
        strict minimum duration. Do not accept an audio event if it is
        shorter than `min_dur` even if it is contiguous to the latest valid
        event. This happens if the latest detected event had reached
        `max_dur`.

    Other Parameters
    ----------------
    analysis_window, aw : float, default: 0.05 (50 ms)
        duration of analysis window in seconds. A value between 0.01 (10 ms)
        and 0.1 (100 ms) should be good for most use-cases. Ignored when
        `input` is already an `AudioReader` (its `block_dur` is used).
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
        be used if `input` is a string path to an audio file. If not given,
        audio type will be guessed from file name extension or from file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio file,
        is a bytes object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or 4.
        Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    use_channel, uc : {None, "mix"} or int
        which channel to use for split if `input` has multiple audio
        channels. Regardless of which channel is used for splitting, returned
        audio events contain data from *all* channels, just as `input`.
        The following values are accepted:

        - None (alias "any"): accept audio activity from any channel, even if
          other channels are silent. This is the default behavior.

        - "mix" ("avg" or "average"): mix down all channels (i.e. compute
          average channel) and split the resulting channel.

        - int (>= 0 and < `channels`): use one channel, specified by integer
          id, for split.

    large_file : bool, default: False
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
        (and only these two formats) then audio data is lazily loaded to
        memory (i.e., one analysis window at a time). Otherwise the whole
        file is loaded to memory before split. Set to True if the size of the
        file is larger than available memory.
    max_read, mr : float, default: None, read until end of stream
        maximum data to read from source in seconds.
    validator, val : callable, DataValidator
        custom data validator. If `None` (default), an `AudioEnergyValidator`
        is used with the given energy threshold. Can be a callable or an
        instance of `DataValidator` that implements `is_valid`. In either
        case, it'll be called with a window of audio data as the first
        parameter.
    energy_threshold, eth : float, default: 50
        energy threshold for audio activity detection. Audio regions that
        have enough windows with a signal energy equal to or above this
        threshold are considered valid audio events. Here we are referring to
        this amount as the energy of the signal but to be more accurate, it
        is the log energy computed as:
        `20 * log10(sqrt(dot(x, x) / len(x)))` (see
        :class:`AudioEnergyValidator` and
        :func:`calculate_energy_single_channel`). If `validator` is given,
        this argument is ignored.

    Yields
    ------
    AudioRegion
        a generator of detected :class:`AudioRegion` s.

    Raises
    ------
    ValueError
        if a duration argument is out of range, if the analysis window is too
        small for the sampling rate, or if `min_dur` / `max_silence` do not
        fit within `max_dur` once converted to analysis windows.
    """
    if min_dur <= 0:
        raise ValueError(f"'min_dur' ({min_dur}) must be > 0")
    if max_dur <= 0:
        raise ValueError(f"'max_dur' ({max_dur}) must be > 0")
    if max_silence < 0:
        raise ValueError(f"'max_silence' ({max_silence}) must be >= 0")

    if isinstance(input, AudioReader):
        # A ready-made reader dictates the analysis window.
        source = input
        analysis_window = source.block_dur
    else:
        if "analysis_window" in kwargs:
            analysis_window = kwargs["analysis_window"]
        else:
            analysis_window = kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
        if analysis_window <= 0:
            raise ValueError(
                f"'analysis_window' ({analysis_window}) must be > 0"
            )

        # Resolve aliases on a copy so the caller's kwargs stay untouched.
        reader_kwargs = dict(kwargs)
        reader_kwargs["max_read"] = reader_kwargs.get(
            "max_read", reader_kwargs.get("mr")
        )
        reader_kwargs["audio_format"] = reader_kwargs.get(
            "audio_format", reader_kwargs.get("fmt")
        )
        if isinstance(input, AudioRegion):
            # A region carries its own audio parameters; feed it as raw bytes.
            reader_kwargs.update(
                sampling_rate=input.sr,
                sample_width=input.sw,
                channels=input.ch,
            )
            input = bytes(input)
        try:
            source = AudioReader(
                input, block_dur=analysis_window, **reader_kwargs
            )
        except TooSmallBlockDuration as exc:
            err_msg = (
                f"Too small 'analysis_window' ({exc.block_dur}) for "
                f"sampling rate ({exc.sampling_rate}). Analysis window "
                f"should at least be 1/{exc.sampling_rate} to cover "
                "one data sample"
            )
            raise ValueError(err_msg) from exc

    validator = kwargs.get("validator", kwargs.get("val"))
    if validator is None:
        energy_threshold = kwargs.get(
            "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
        )
        use_channel = kwargs.get("use_channel", kwargs.get("uc"))
        validator = AudioEnergyValidator(
            energy_threshold, source.sw, source.ch, use_channel=use_channel
        )

    mode = 0
    if drop_trailing_silence:
        mode = StreamTokenizer.DROP_TRAILING_SILENCE
    if strict_min_dur:
        mode |= StreamTokenizer.STRICT_MIN_LENGTH

    # Durations are converted to whole numbers of analysis windows: the
    # minimum is rounded up, the two maxima are rounded down.
    min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
    max_length = _duration_to_nb_windows(
        max_dur, analysis_window, math.floor, _EPSILON
    )
    max_continuous_silence = _duration_to_nb_windows(
        max_silence, analysis_window, math.floor, _EPSILON
    )

    template = (
        "({0} sec.) results in {1} analysis window(s) "
        "({1} == {6}({0} / {2})) which is {5} the number "
        "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
    )
    if min_length > max_length:
        raise ValueError(
            ("'min_dur' " + template).format(
                min_dur,
                min_length,
                analysis_window,
                max_length,
                max_dur,
                "higher than",
                "ceil",
            )
        )

    if max_continuous_silence >= max_length:
        raise ValueError(
            ("'max_silence' " + template).format(
                max_silence,
                max_continuous_silence,
                analysis_window,
                max_length,
                max_dur,
                "higher or equal to",
                "floor",
            )
        )

    tokenizer = StreamTokenizer(
        validator, min_length, max_length, max_continuous_silence, mode=mode
    )
    source.open()
    tokens = tokenizer.tokenize(source, generator=True)
    # Each token starts with (list of data frames, index of first window).
    return (
        _make_audio_region(
            tok[0],
            tok[1],
            source.block_dur,
            source.sr,
            source.sw,
            source.ch,
        )
        for tok in tokens
    )
|
amine@179
|
309
|
amine@179
|
310
|
amine@236
|
def _duration_to_nb_windows(
    duration, analysis_window, round_fn=round, epsilon=0
):
    """
    Convert a duration into a whole number of analysis windows.

    The result is `round_fn(duration / analysis_window + epsilon)` cast to
    `int`, so the rounding direction is entirely decided by `round_fn`
    (e.g. `math.ceil` to cover the whole duration, `math.floor` to stay
    within it). A `duration` of exactly 0 always gives 0.
    `duration` and `analysis_window` can be in seconds or milliseconds but
    must be in the same unit.

    Parameters
    ----------
    duration : float
        a given duration in seconds or ms. Must be >= 0.
    analysis_window : float
        size of analysis window, in the same unit as `duration`. Must be > 0.
    round_fn : callable
        function called to round the result. Default: `round`.
    epsilon : float
        small value to add to the division result before rounding.
        E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
        `round_fn=math.floor` returns `2` instead of `3`. Adding a small
        value to `0.3 / 0.1` avoids this error.

    Returns
    -------
    nb_windows : int
        number of analysis windows corresponding to `duration`.

    Raises
    ------
    ValueError
        if `duration` is negative or `analysis_window` is not positive.
    """
    if duration < 0 or analysis_window <= 0:
        raise ValueError(
            "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0".format(
                duration, analysis_window
            )
        )
    if duration == 0:
        return 0
    ratio = duration / analysis_window + epsilon
    return int(round_fn(ratio))
|
amine@179
|
348
|
amine@179
|
349
|
amine@179
|
def _make_audio_region(
    data_frames,
    start_frame,
    frame_duration,
    sampling_rate,
    sample_width,
    channels,
):
    """
    Build an `AudioRegion` from the pieces produced by the tokenizer.

    The frames are concatenated into one bytes object and the region's
    `start` is derived from the index of its first analysis window.

    Parameters
    ----------
    data_frames : iterable of bytes
        audio data of the region, one item per analysis window.
    start_frame : int
        index of the first analysis window
    frame_duration : float
        duration of analysis window in seconds
    sampling_rate : int
        sampling rate of audio data
    sample_width : int
        number of bytes of one audio sample
    channels : int
        number of channels of audio data

    Returns
    -------
    audio_region : AudioRegion
        AudioRegion whose start time is calculated as:
        `start_frame * frame_duration`
    """
    region_start = start_frame * frame_duration
    raw = b"".join(data_frames)
    return AudioRegion(
        raw, sampling_rate, sample_width, channels, start=region_start
    )
|
amine@81
|
385
|
amine@81
|
386
|
amine@308
|
def _read_chunks_online(max_read, **kwargs):
    """
    Helper function to read audio data from an online blocking source
    (i.e., microphone). Used to build an `AudioRegion` and can intercept
    KeyboardInterrupt so that reading stops as soon as this exception is
    raised. Makes building `AudioRegion`s on [i]python sessions and jupyter
    notebooks more user friendly.

    Parameters
    ----------
    max_read : float
        maximum amount of data to read in seconds.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)` where `data` is the
        concatenation of all chunks read and the audio parameters are those
        reported by the reader.

    See also
    --------
    `AudioRegion.load`
    """
    reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
    reader.open()
    chunks = []
    try:
        while True:
            frame = reader.read()
            if frame is None:
                # the reader signals the end of data with None
                break
            chunks.append(frame)
    except KeyboardInterrupt:
        # Stop data acquisition from microphone when pressing
        # Ctrl+C on a [i]python session or a notebook; whatever was read
        # so far is kept.
        pass
    finally:
        # Always release the reader, including when `read` fails with an
        # error other than KeyboardInterrupt.
        reader.close()
    return (
        b"".join(chunks),
        reader.sampling_rate,
        reader.sample_width,
        reader.channels,
    )
|
amine@308
|
426
|
amine@308
|
427
|
amine@308
|
def _read_offline(input, skip=0, max_read=None, **kwargs):
    """
    Helper function to read audio data from an offline source (i.e., a file
    or a bytes object). Used to build `AudioRegion`s.

    Parameters
    ----------
    input : str, bytes
        path to audio file (if str), or a bytes object representing raw audio
        data.
    skip : float, default 0
        amount of data, in seconds, to skip from the beginning of audio
        source. None or a non-positive value skips nothing.
    max_read : float, default: None
        maximum amount of audio data to read, in seconds. None (default) or
        a negative value means read until end of stream.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)` with the audio
        parameters reported by the audio source.

    See also
    --------
    `AudioRegion.load`

    """
    audio_source = get_audio_source(input, **kwargs)
    audio_source.open()
    try:
        if skip is not None and skip > 0:
            # durations are converted to a number of samples before reading
            skip_samples = round(skip * audio_source.sampling_rate)
            audio_source.read(skip_samples)
        if max_read is not None:
            if max_read < 0:
                max_read = None
            else:
                max_read = round(max_read * audio_source.sampling_rate)
        data = audio_source.read(max_read)
    finally:
        # Close the source even if skipping or reading fails.
        audio_source.close()
    return (
        data,
        audio_source.sampling_rate,
        audio_source.sample_width,
        audio_source.channels,
    )
|
amine@308
|
469
|
amine@308
|
470
|
amine@228
|
def _check_convert_index(index, types, err_msg):
    """
    Validate a slice used to index a region and return its bounds.

    Parameters
    ----------
    index : slice
        slice to check; it must not have a step.
    types : type or tuple of types
        accepted type(s) for the slice's start and stop.
    err_msg : str
        message of the `TypeError` raised on any violation.

    Returns
    -------
    start, stop : tuple
        `start` defaults to 0 when the slice has none; `stop` is returned
        as is (possibly None).

    Raises
    ------
    TypeError
        if `index` is not a slice, has a step, or one of its bounds is not
        an instance of `types`.
    """
    is_plain_slice = isinstance(index, slice) and index.step is None
    if not is_plain_slice:
        raise TypeError(err_msg)
    start = 0 if index.start is None else index.start
    stop = index.stop
    for bound in (start, stop):
        if bound is None:
            continue
        if not isinstance(bound, types):
            raise TypeError(err_msg)
    return start, stop
|
amine@228
|
480
|
amine@228
|
481
|
amine@228
|
class _SecondsView:
    """View over an `AudioRegion` that is sliced with bounds expressed in
    seconds (e.g. `view[7.5:10]`).
    """

    def __init__(self, region):
        # The region being viewed; slicing is delegated to it.
        self._region = region

    def __getitem__(self, index):
        """Return the sub-region between two instants given in seconds.

        Bounds are converted to sample indices using the region's sampling
        rate: the start is truncated with `int`, the stop is rounded.
        """
        err_msg = (
            "Slicing AudioRegion by seconds requires indices of type "
            "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
        )
        start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
        rate = self._region.sampling_rate
        first_sample = int(start_s * rate)
        if stop_s is None:
            last_sample = None
        else:
            last_sample = round(stop_s * rate)
        return self._region[first_sample:last_sample]

    @property
    def len(self):
        """
        Return region duration in seconds.
        """
        return self._region.duration
|
amine@245
|
505
|
amine@228
|
506
|
amine@228
|
class _MillisView(_SecondsView):
    """A class to create a view of `AudioRegion` that can be sliced using
    indices in milliseconds.
    """

    def __getitem__(self, index):
        """Return the sub-region between two instants given in milliseconds.

        Bounds must be integers; they are converted to seconds and the
        slicing itself is delegated to `_SecondsView.__getitem__`.
        """
        # The example in the message uses the `ms` alias of this view (the
        # previous text pointed at `region.sec`, the seconds view).
        err_msg = (
            "Slicing AudioRegion by milliseconds requires indices of type "
            "'int' without a step (e.g. region.ms[500:1500])"
        )
        start_ms, stop_ms = _check_convert_index(index, (int,), err_msg)
        start_sec = start_ms / 1000
        stop_sec = None if stop_ms is None else stop_ms / 1000
        return super().__getitem__(slice(start_sec, stop_sec))

    def __len__(self):
        """
        Return region duration in milliseconds.
        """
        return round(self._region.duration * 1000)

    @property
    def len(self):
        """
        Return region duration in milliseconds.
        """
        return len(self)
|
amine@245
|
535
|
amine@228
|
536
|
amine@244
|
class _AudioRegionMetadata(dict):
    """Dictionary holding an `AudioRegion`'s metadata, whose entries can also
    be read and written as attributes (attribute reads are deprecated)."""

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; the deprecation
        # warning is emitted whether or not the entry exists.
        warnings.warn(
            "`AudioRegion.meta` is deprecated and will be removed in future "
            "versions. For the 'start' and 'end' fields, please use "
            "`AudioRegion.start` and `AudioRegion.end`.",
            DeprecationWarning,
            stacklevel=2,
        )
        if name not in self:
            raise AttributeError(
                "AudioRegion metadata has no entry '{}'".format(name)
            )
        return self[name]

    def __setattr__(self, name, value):
        # Attribute assignment stores a dictionary entry.
        self[name] = value

    def __str__(self):
        # One "key: value" line per entry, in insertion order.
        lines = map("{}: {}".format, self.keys(), self.values())
        return "\n".join(lines)

    def __repr__(self):
        return self.__str__()
|
amine@244
|
562
|
amine@244
|
563
|
amine@411
|
564 @dataclass(frozen=True)
|
amine@81
|
565 class AudioRegion(object):
|
amine@368
|
566 """
|
amine@368
|
567 AudioRegion encapsulates raw audio data and provides an interface to
|
amine@368
|
568 perform simple operations on it. Use `AudioRegion.load` to build an
|
amine@368
|
569 `AudioRegion` from different types of objects.
|
amine@368
|
570
|
amine@368
|
571 Parameters
|
amine@368
|
572 ----------
|
amine@368
|
573 data : bytes
|
amine@368
|
574 raw audio data as a bytes object
|
amine@368
|
575 sampling_rate : int
|
amine@368
|
576 sampling rate of audio data
|
amine@368
|
577 sample_width : int
|
amine@368
|
578 number of bytes of one audio sample
|
amine@368
|
579 channels : int
|
amine@368
|
580 number of channels of audio data
|
amine@412
|
581 start : float, default: None
|
amine@412
|
582 optional start time of the region. This is typically provided by the
|
amine@412
|
583 `split` function.
|
amine@368
|
584
|
amine@368
|
585 See also
|
amine@368
|
586 --------
|
amine@368
|
587 AudioRegion.load
|
amine@368
|
588 """
|
amine@368
|
589
|
amine@411
|
590 data: bytes
|
amine@411
|
591 sampling_rate: int
|
amine@411
|
592 sample_width: int
|
amine@411
|
593 channels: int
|
amine@411
|
594 start: float = field(default=None, repr=None)
|
amine@411
|
595
|
amine@411
|
596 def __post_init__(self):
|
amine@244
|
597
|
amine@411
|
598 check_audio_data(self.data, self.sample_width, self.channels)
|
amine@411
|
599
|
amine@411
|
600 object.__setattr__(self, "splitp", self.split_and_plot)
|
amine@411
|
601 object.__setattr__(self, "_samples", None)
|
amine@411
|
602
|
amine@411
|
603 duration = len(self.data) / (
|
amine@411
|
604 self.sampling_rate * self.sample_width * self.channels
|
amine@411
|
605 )
|
amine@411
|
606 object.__setattr__(self, "duration", duration)
|
amine@411
|
607
|
amine@411
|
608 if self.start is not None:
|
amine@411
|
609 object.__setattr__(self, "end", self.start + self.duration)
|
amine@411
|
610 object.__setattr__(
|
amine@411
|
611 self,
|
amine@411
|
612 "meta",
|
amine@411
|
613 _AudioRegionMetadata({"start": self.start, "end": self.end}),
|
amine@411
|
614 )
|
amine@411
|
615 else:
|
amine@411
|
616 object.__setattr__(self, "end", None)
|
amine@411
|
617 object.__setattr__(self, "meta", None)
|
amine@411
|
618
|
amine@411
|
619 # `seconds` and `millis` are defined below as @property with docstring
|
amine@411
|
620 object.__setattr__(self, "_seconds_view", _SecondsView(self))
|
amine@411
|
621 object.__setattr__(self, "_millis_view", _MillisView(self))
|
amine@411
|
622
|
amine@411
|
623 object.__setattr__(self, "sec", self.seconds)
|
amine@411
|
624 object.__setattr__(self, "s", self.seconds)
|
amine@411
|
625 object.__setattr__(self, "ms", self.millis)
|
amine@244
|
626
|
amine@239
|
627 @classmethod
|
amine@307
|
628 def load(cls, input, skip=0, max_read=None, **kwargs):
|
amine@351
|
629 """
|
amine@373
|
630 Create an `AudioRegion` by loading data from `input`. See :func:`load`
|
amine@373
|
631 for parameters descripion.
|
amine@351
|
632
|
amine@351
|
633 Returns
|
amine@351
|
634 -------
|
amine@351
|
635 region: AudioRegion
|
amine@351
|
636
|
amine@351
|
637 Raises
|
amine@351
|
638 ------
|
amine@368
|
639 ValueError
|
amine@368
|
640 raised if `input` is None and `skip` != 0 or `max_read` is None.
|
amine@351
|
641 """
|
amine@308
|
642 if input is None:
|
amine@351
|
643 if skip > 0:
|
amine@351
|
644 raise ValueError(
|
amine@351
|
645 "'skip' should be 0 when reading from microphone"
|
amine@351
|
646 )
|
amine@308
|
647 if max_read is None or max_read < 0:
|
amine@308
|
648 raise ValueError(
|
amine@333
|
649 "'max_read' should not be None when reading from "
|
amine@333
|
650 "microphone"
|
amine@308
|
651 )
|
amine@308
|
652 data, sampling_rate, sample_width, channels = _read_chunks_online(
|
amine@308
|
653 max_read, **kwargs
|
amine@307
|
654 )
|
amine@239
|
655 else:
|
amine@308
|
656 data, sampling_rate, sample_width, channels = _read_offline(
|
amine@308
|
657 input, skip=skip, max_read=max_read, **kwargs
|
amine@308
|
658 )
|
amine@308
|
659
|
amine@308
|
660 return cls(data, sampling_rate, sample_width, channels)
|
amine@239
|
661
|
amine@228
|
@property
def seconds(self):
    """
    View used to slice the audio region with bounds expressed in seconds,
    as in ``region.seconds[start:end]``.
    """
    return self._seconds_view
|
amine@228
|
669
|
amine@228
|
@property
def millis(self):
    """
    View used to slice the audio region with bounds expressed in
    milliseconds, as in ``region.millis[start:end]``.
    """
    return self._millis_view
|
amine@228
|
675
|
amine@81
|
@property
def sr(self):
    """Short alias for `sampling_rate` (sampling rate of audio data)."""
    return self.sampling_rate
|
amine@81
|
680
|
amine@81
|
@property
def sw(self):
    """Short alias for `sample_width` (number of bytes per sample)."""
    return self.sample_width
|
amine@81
|
685
|
amine@81
|
@property
def ch(self):
    """Short alias for `channels` (number of channels of audio data)."""
    return self.channels
|
amine@2
|
690
|
amine@270
|
def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
    """
    Play audio region.

    Parameters
    ----------
    progress_bar : bool, default: False
        whether to show a progress bar while audio is playing.
        `progress_bar` requires `tqdm`; if it is not installed, no progress
        bar will be shown.
    player : AudioPlayer, default: None
        audio player to use. If None (default), a new player is obtained
        with `player_for()`.
    progress_bar_kwargs : kwargs
        keyword arguments forwarded to the player and meant for the `tqdm`
        progress bar builder (e.g., use `leave=False` to clean up the
        screen when play finishes).
    """
    # Only build a player when the caller did not supply one.
    audio_player = player_for(self) if player is None else player
    audio_player.play(
        self.data, progress_bar=progress_bar, **progress_bar_kwargs
    )
|
amine@199
|
711
|
amine@411
|
def save(
    self, filename, audio_format=None, exists_ok=True, **audio_parameters
):
    """
    Save audio region to file.

    Parameters
    ----------
    filename : str, Path
        path to output audio file. If of type `str`, it may contain
        `{start}`, `{end}`, `{duration}` and `{meta}` placeholders that are
        filled with the region's attributes of the same name. Regions
        returned by `split` contain `start` and `end` attributes that can
        be used to build the output file name as in the example.
        A `Path` is used as is (no placeholder substitution).
    audio_format : str, default: None
        format used to save audio data. If None (default), format is guessed
        from file name's extension. If file name has no extension, audio
        data is saved as a raw (headerless) audio file.
    exists_ok : bool, default: True
        If True, overwrite `filename` if a file with the same name exists.
        If False, raise a `FileExistsError` if `filename` exists.
    audio_parameters: dict
        any keyword arguments to be passed to audio saving backend.

    Returns
    -------
    file: str, Path
        name of output file with filled placeholders (the `Path` itself
        when a `Path` was given).

    Raises
    ------
    FileExistsError
        if `filename` exists and `exists_ok` is False.

    Examples
    --------
    Create an AudioRegion, explicitly passing a value for `start`. `end`
    will be computed based on `start` and the region's duration.

    >>> region = AudioRegion(b'\\0' * 2 * 24000,
    >>>                      sampling_rate=16000,
    >>>                      sample_width=2,
    >>>                      channels=1,
    >>>                      start=2.25)
    >>> region
    <AudioRegion(duration=1.500, sampling_rate=16000, sample_width=2, channels=1)>

    >>> assert region.end == 3.75
    >>> assert region.save('audio_{start}-{end}.wav') == "audio_2.25-3.75.wav"
    >>> filename = region.save('audio_{start:.3f}-{end:.3f}_{duration:.3f}.wav')
    >>> assert filename == "audio_2.250-3.750_1.500.wav"
    """
    must_not_exist = not exists_ok
    exists_msg = "file '{filename}' exists"

    if isinstance(filename, str):
        # Placeholders are only meaningful for plain strings.
        filename = filename.format(
            duration=self.duration,
            meta=self.meta,
            start=self.start,
            end=self.end,
        )
        if must_not_exist and os.path.exists(filename):
            raise FileExistsError(exists_msg.format(filename=filename))
    elif isinstance(filename, Path):
        if must_not_exist and filename.exists():
            raise FileExistsError(exists_msg.format(filename=str(filename)))

    to_file(
        self.data,
        filename,
        audio_format,
        sr=self.sr,
        sw=self.sw,
        ch=self.ch,
        audio_parameters=audio_parameters,
    )
    return filename
|
amine@187
|
788
|
amine@248
|
def split(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """Split audio region. See :func:`auditok.split()` for a comprehensive
    description of split parameters.
    See Also :meth:`AudioRegion.split_and_plot`.

    Raises
    ------
    RuntimeWarning
        raised (not merely emitted as a warning) if `max_read` or its alias
        `mr` is passed with a value other than None.
    """
    # 'max_read' takes precedence over its alias 'mr' when both are given.
    max_read = kwargs.get("max_read", kwargs.get("mr"))
    if max_read is not None:
        raise RuntimeWarning(
            "'max_read' (or 'mr') should not be used with "
            "AudioRegion.split_and_plot(). You should rather "
            "slice audio region before calling this method"
        )
    return split(
        self,
        min_dur=min_dur,
        max_dur=max_dur,
        max_silence=max_silence,
        drop_trailing_silence=drop_trailing_silence,
        strict_min_dur=strict_min_dur,
        **kwargs,
    )
|
amine@248
|
816
|
amine@301
|
def plot(
    self,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
):
    """Plot audio region using one sub-plot per channel.

    Parameters
    ----------
    scale_signal : bool, default: True
        if true, scale signal by subtracting its mean and dividing by its
        standard deviation before plotting.
    show : bool
        whether to show plotted signal right after the call.
    figsize : tuple, default: None
        width and height of the figure to pass to `matplotlib`.
    save_as : str, default None.
        if provided, also save plot to file.
    dpi : int, default: 120
        plot dpi to pass to `matplotlib`.
    theme : str or dict, default: "auditok"
        plot theme to use. Currently only "auditok" theme is implemented. To
        provide your own theme see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
    """
    # All options are handed over unchanged to the module-level `plot`.
    plot_options = {
        "scale_signal": scale_signal,
        "show": show,
        "figsize": figsize,
        "save_as": save_as,
        "dpi": dpi,
        "theme": theme,
    }
    plot(self, **plot_options)
|
amine@250
|
854
|
amine@250
|
def split_and_plot(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
    **kwargs,
):
    """Split region and plot signal and detections. Alias: :meth:`splitp`.
    See :func:`auditok.split()` for a comprehensive description of split
    parameters. Also see :meth:`plot` for plot parameters.

    Returns
    -------
    regions : list
        the regions returned by :meth:`split`, gathered in a list.
    """
    regions = list(
        self.split(
            min_dur=min_dur,
            max_dur=max_dur,
            max_silence=max_silence,
            drop_trailing_silence=drop_trailing_silence,
            strict_min_dur=strict_min_dur,
            **kwargs,
        )
    )
    # Lazy (start, end) pairs taken from each detected region's metadata.
    detections = ((region.meta.start, region.meta.end) for region in regions)
    # 'energy_threshold' wins over its alias 'eth'; fall back to the default.
    threshold = kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
    threshold = kwargs.get("energy_threshold", threshold)
    plot(
        self,
        scale_signal=scale_signal,
        detections=detections,
        energy_threshold=threshold,
        show=show,
        figsize=figsize,
        save_as=save_as,
        dpi=dpi,
        theme=theme,
    )
    return regions
|
amine@250
|
899
|
amine@246
|
@property
def samples(self):
    """Audio region as arrays of samples, one array per channel.

    If a `_samples` attribute is present and not None it is returned as
    is; otherwise samples are decoded from `data` with `signal.to_array`
    using the region's `sample_width` and `channels`.
    """
    # The previous code only bound its local result when `_samples` was
    # None, so a non-None `_samples` ended in UnboundLocalError, and a
    # missing attribute in AttributeError. Both cases are handled here.
    samples = getattr(self, "_samples", None)
    if samples is None:
        samples = signal.to_array(
            self.data, self.sample_width, self.channels
        )
    return samples
|
amine@246
|
908
|
amine@408
|
def __array__(self, dtype=None, copy=None):
    """Return the region's samples for NumPy's array protocol.

    Parameters
    ----------
    dtype : data-type, default: None
        if given, samples are converted to this type.
    copy : bool, default: None
        if True, always return a copy. If False, raise a `ValueError` when
        a copy cannot be avoided (i.e., a `dtype` conversion is needed).

    Returns
    -------
    samples
        `self.samples` unchanged when neither `dtype` nor `copy` is
        requested; otherwise a NumPy array.
    """
    samples = self.samples
    if dtype is None and not copy:
        # Default path: identical to the historical behavior.
        return samples

    # Imported lazily: this branch is only reached on NumPy's request.
    import numpy as np

    samples = np.asarray(samples)
    if dtype is not None and samples.dtype != np.dtype(dtype):
        if copy is False:
            raise ValueError(
                "Unable to avoid copy while converting samples to the "
                "requested dtype"
            )
        return samples.astype(dtype)
    if copy:
        return samples.copy()
    return samples
|
amine@408
|
911
|
amine@408
|
def numpy(self):
    """Return the region's samples; same value as :attr:`samples`."""
    return self.samples
|
amine@408
|
914
|
amine@82
|
def __len__(self):
    """
    Return region length in number of samples.
    """
    # One multichannel sample spans `sample_width` bytes per channel.
    bytes_per_sample = self.sample_width * self.channels
    return len(self.data) // bytes_per_sample
|
amine@245
|
920
|
amine@245
|
@property
def len(self):
    """
    Region length in number of samples, same value as ``len(region)``.
    """
    return len(self)
|
amine@82
|
927
|
amine@83
|
def __bytes__(self):
    """Return the region's raw audio data (the `data` attribute)."""
    return self.data
|
amine@83
|
930
|
amine@244
|
def __str__(self):
    """Describe the region by its duration (3 decimals) and audio
    parameters."""
    template = (
        "AudioRegion(duration={:.3f}, "
        "sampling_rate={}, sample_width={}, channels={})"
    )
    return template.format(self.duration, self.sr, self.sw, self.ch)
|
amine@83
|
938
|
amine@244
|
def __repr__(self):
    """Return the ``str()`` form of the region wrapped in angle brackets."""
    description = str(self)
    return "<{}>".format(description)
|
amine@83
|
941
|
amine@87
|
def __add__(self, other):
    """
    Concatenate this region and `other` and return the result as a new
    region (built without a `start`).

    Raises
    ------
    TypeError
        if `other` is not an `AudioRegion`.
    ValueError
        if the two regions differ in sampling rate, sample width or number
        of channels.
    """
    if not isinstance(other, AudioRegion):
        raise TypeError(
            "Can only concatenate AudioRegion, "
            'not "{}"'.format(type(other))
        )

    # Audio parameters are compared in a fixed order so that the first
    # mismatch decides which error is reported.
    if self.sr != other.sr:
        raise ValueError(
            "Can only concatenate AudioRegions of the same "
            "sampling rate ({} != {})".format(self.sr, other.sr)
        )
    if self.sw != other.sw:
        raise ValueError(
            "Can only concatenate AudioRegions of the same "
            "sample width ({} != {})".format(self.sw, other.sw)
        )
    if self.ch != other.ch:
        raise ValueError(
            "Can only concatenate AudioRegions of the same "
            "number of channels ({} != {})".format(self.ch, other.ch)
        )

    return AudioRegion(self.data + other.data, self.sr, self.sw, self.ch)
|
amine@87
|
970
|
amine@87
|
def __radd__(self, other):
    """
    Concatenate `other` and this region (reflected addition).

    `other` should be an `AudioRegion` with the same audio parameters as
    this region but can exceptionally be `0` to make it possible to
    concatenate many regions with `sum`.

    Returns
    -------
    This region itself if `other == 0`; the concatenation `other + self`
    if `other` is an `AudioRegion`; `NotImplemented` for any other operand
    so that Python reports the usual unsupported-operand `TypeError`.
    """
    if other == 0:
        # `sum(regions)` starts from the int 0.
        return self
    if isinstance(other, AudioRegion):
        # The previous code called `other.add(self)`, a method that does
        # not exist; delegate to the real concatenation instead.
        return other.__add__(self)
    return NotImplemented
|
amine@87
|
981
|
amine@195
|
def __mul__(self, n):
    """
    Return a new region whose data is this region's data repeated `n`
    times.

    Raises
    ------
    TypeError
        if `n` is not an `int`.
    """
    if isinstance(n, int):
        return AudioRegion(self.data * n, self.sr, self.sw, self.ch)
    err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
    raise TypeError(err_msg.format(type(n)))
|
amine@195
|
988
|
amine@195
|
def __rmul__(self, n):
    """Reflected multiplication: ``n * region`` equals ``region * n``."""
    return self * n
|
amine@195
|
991
|
amine@247
|
def __truediv__(self, n):
    """
    Divide the region into consecutive sub-regions and return them as a
    list.

    The region's samples are shared out over `n` slices; when the number of
    samples is not a multiple of `n`, the first slices get one extra sample
    each. Slicing stops as soon as all samples are consumed, so fewer than
    `n` sub-regions are returned when the region has fewer than `n`
    samples.

    Raises
    ------
    TypeError
        if `n` is not an `int` or is not strictly positive.
    """
    if not isinstance(n, int) or n <= 0:
        raise TypeError("AudioRegion can only be divided by a positive int")

    base_size, extra = divmod(len(self), n)
    pieces = []
    begin = 0
    while begin < len(self):
        finish = begin + base_size
        if extra > 0:
            # Hand out the remainder one sample at a time.
            finish += 1
            extra -= 1
        pieces.append(self[begin:finish])
        begin = finish
    return pieces
|
amine@247
|
1007
|
amine@198
|
def __eq__(self, other):
    """
    Two regions are equal when they hold the same data and have the same
    sampling rate, sample width and number of channels. Anything that is
    not an `AudioRegion` compares unequal.
    """
    if self is other:
        return True
    if isinstance(other, AudioRegion):
        return (
            (self.data == other.data)
            and (self.sr == other.sr)
            and (self.sw == other.sw)
            and (self.ch == other.ch)
        )
    return False
|
amine@198
|
1019
|
amine@188
|
def __getitem__(self, index):
    """
    Slice the region by sample indices and return a new `AudioRegion`.

    `index` is validated and split into a start and a stop sample by
    `_check_convert_index`. Negative bounds count from the end of the
    region and are clamped at 0. A stop of None means "until the end".
    """
    err_msg = "Slicing AudioRegion by samples requires indices of type "
    err_msg += "'int' without a step (e.g. region.sec[1600:3200])"
    start_sample, stop_sample = _check_convert_index(index, (int), err_msg)

    bytes_per_sample = self.sample_width * self.channels
    len_samples = len(self.data) // bytes_per_sample

    if start_sample < 0:
        start_sample = max(start_sample + len_samples, 0)
    onset = start_sample * bytes_per_sample

    if stop_sample is None:
        offset = None
    else:
        if stop_sample < 0:
            stop_sample = max(stop_sample + len_samples, 0)
        # Use the normalised stop; the previous code computed it and then
        # fell back to the raw `index.stop`, discarding the value.
        offset = stop_sample * bytes_per_sample

    data = self.data[onset:offset]
    return AudioRegion(data, self.sr, self.sw, self.ch)
|
amine@188
|
1041
|
amine@2
|
1042
|
amine@178
|
1043 class StreamTokenizer:
|
amine@32
|
1044 """
|
amine@32
|
1045 Class for stream tokenizers. It implements a 4-state automaton scheme
|
amine@32
|
1046 to extract sub-sequences of interest on the fly.
|
amine@67
|
1047
|
amine@351
|
1048 Parameters
|
amine@351
|
1049 ----------
|
amine@351
|
1050 validator : callable, DataValidator (must implement `is_valid`)
|
amine@351
|
1051 called with each data frame read from source. Should take one positional
|
amine@351
|
1052 argument and return True or False for valid and invalid frames
|
amine@351
|
1053 respectively.
|
amine@67
|
1054
|
amine@351
|
1055 min_length : int
|
amine@351
|
1056 Minimum number of frames of a valid token. This includes all
|
amine@351
|
1057 tolerated non valid frames within the token.
|
amine@67
|
1058
|
amine@351
|
1059 max_length : int
|
amine@351
|
1060 Maximum number of frames of a valid token. This includes all
|
amine@351
|
1061 tolerated non valid frames within the token.
|
amine@67
|
1062
|
amine@368
|
1063 max_continuous_silence : int
|
amine@351
|
1064 Maximum number of consecutive non-valid frames within a token.
|
amine@351
|
1065 Note that, within a valid token, there may be many tolerated
|
amine@351
|
1066 *silent* regions that contain each a number of non valid frames up
|
amine@351
|
1067 to `max_continuous_silence`
|
amine@67
|
1068
|
amine@368
|
1069 init_min : int
|
amine@351
|
1070 Minimum number of consecutive valid frames that must be
|
amine@351
|
1071 **initially** gathered before any sequence of non valid frames can
|
amine@351
|
1072 be tolerated. This option is not always needed, it can be used to
|
amine@351
|
1073 drop non-valid tokens as early as possible. **Default = 0** means
|
amine@351
|
1074 that the option is by default ineffective.
|
amine@67
|
1075
|
amine@368
|
1076 init_max_silence : int
|
amine@351
|
1077 Maximum number of tolerated consecutive non-valid frames if the
|
amine@351
|
1078 number already gathered valid frames has not yet reached
|
amine@351
|
1079 'init_min'. This argument is normally used if `init_min` is used.
|
amine@351
|
1080 **Default = 0**, by default this argument is not taken into
|
amine@351
|
1081 consideration.
|
amine@67
|
1082
|
amine@368
|
1083 mode : int
|
amine@368
|
1084 mode can be one of the following:
|
amine@67
|
1085
|
amine@368
|
1086 -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and
|
amine@368
|
1087 accept a token shorter than `min_length` if it is the continuation
|
amine@368
|
1088 of the latest delivered token.
|
amine@67
|
1089
|
amine@368
|
1090 -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
|
amine@368
|
1091 because `max_length` is reached, and token `i+1` is immediately
|
amine@368
|
1092 adjacent to token `i` (i.e. token `i` ends at frame `k` and token
|
amine@368
|
1093 `i+1` starts at frame `k+1`) then accept token `i+1` only if it has
|
amine@368
|
1094 a size of at least `min_length`. The default behavior is to accept
|
amine@368
|
1095 token `i+1` even if it is shorter than `min_length` (provided that
|
amine@368
|
1096 the above conditions are fulfilled of course).
|
amine@327
|
1097
|
amine@368
|
1098 -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing
|
amine@368
|
1099 non-valid frames from a token to be delivered if and only if it
|
amine@368
|
1100 is not **truncated**. This can be a bit tricky. A token is actually
|
amine@368
|
1101 delivered if:
|
amine@368
|
1102
|
amine@368
|
1103 - `max_continuous_silence` is reached.
|
amine@368
|
1104
|
amine@368
|
1105 - Its length reaches `max_length`. This is referred to as a
|
amine@368
|
1106 **truncated** token.
|
amine@368
|
1107
|
amine@368
|
1108 In the current implementation, a `StreamTokenizer`'s decision is only
|
amine@368
|
1109 based on already seen data and on incoming data. Thus, if a token is
|
amine@368
|
1110 truncated at a non-valid but tolerated frame (`max_length` is reached
|
amine@368
|
1111 but `max_continuous_silence` not yet) any tailing silence will be kept
|
amine@368
|
1112 because it can potentially be part of valid token (if `max_length` was
|
amine@368
|
1113 bigger). But if `max_continuous_silence` is reached before
|
amine@368
|
1114 `max_length`, the delivered token will not be considered as truncated
|
amine@368
|
1115 but a result of *normal* end of detection (i.e. no more valid data).
|
amine@368
|
1116 In that case the trailing silence can be removed if you use the
|
amine@368
|
1117 `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
|
amine@368
|
1118
|
amine@404
|
1119 -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`: # noqa: B950
|
amine@368
|
1120 use both options. That means: first remove tailing silence, then
|
amine@368
|
1121 check if the token still has a length of at least `min_length`.
|
amine@368
|
1122
|
amine@368
|
1123
|
amine@368
|
1124 Examples
|
amine@368
|
1125 --------
|
amine@67
|
1126
|
amine@351
|
1127 In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
|
amine@351
|
1128 accepted although it is shorter than `min_length` (3), because it
|
amine@351
|
1129 immediately follows the latest delivered token:
|
amine@67
|
1130
|
amine@368
|
1131 >>> from auditok.core import StreamTokenizer
|
leminhnguyen@395
|
1132 >>> from auditok.util import StringDataSource, DataValidator
|
amine@351
|
1133
|
amine@368
|
1134 >>> class UpperCaseChecker(DataValidator):
|
amine@368
|
1135 >>> def is_valid(self, frame):
|
amine@351
|
1136 return frame.isupper()
|
amine@368
|
1137 >>> dsource = StringDataSource("aaaAAAABBbbb")
|
amine@368
|
1138 >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
|
amine@351
|
1139 min_length=3,
|
amine@351
|
1140 max_length=4,
|
amine@351
|
1141 max_continuous_silence=0)
|
amine@368
|
1142 >>> tokenizer.tokenize(dsource)
|
amine@368
|
1143 [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
|
amine@67
|
1144
|
amine@67
|
1145
|
amine@351
|
1146 The following tokenizer will however reject the 'BB' token:
|
amine@67
|
1147
|
amine@368
|
1148 >>> dsource = StringDataSource("aaaAAAABBbbb")
|
amine@368
|
1149 >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
|
amine@351
|
1150 min_length=3, max_length=4,
|
amine@351
|
1151 max_continuous_silence=0,
|
amine@351
|
1152 mode=StreamTokenizer.STRICT_MIN_LENGTH)
|
amine@368
|
1153 >>> tokenizer.tokenize(dsource)
|
amine@368
|
1154 [(['A', 'A', 'A', 'A'], 3, 6)]
|
amine@351
|
1155
|
amine@351
|
1156
|
amine@351
|
1157
|
amine@368
|
1158 >>> tokenizer = StreamTokenizer(
|
amine@368
|
1159 >>> validator=UpperCaseChecker(),
|
amine@368
|
1160 >>> min_length=3,
|
amine@368
|
1161 >>> max_length=6,
|
amine@368
|
1162 >>> max_continuous_silence=3,
|
amine@368
|
1163 >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE
|
amine@368
|
1164 >>> )
|
amine@368
|
1165 >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
|
amine@368
|
1166 >>> tokenizer.tokenize(dsource)
|
amine@368
|
1167 [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
|
amine@32
|
1168
|
amine@351
|
1169 The first token is delivered with its tailing silence because it is
|
amine@351
|
1170 truncated while the second one has its tailing frames removed.
|
amine@32
|
1171
|
amine@351
|
1172 Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
|
amine@67
|
1173
|
amine@351
|
1174 .. code:: python
|
amine@67
|
1175
|
amine@351
|
1176 [
|
amine@351
|
1177 (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
|
amine@351
|
1178 (['B', 'B', 'b', 'b', 'b'], 9, 13)
|
amine@351
|
1179 ]
|
amine@67
|
1180
|
amine@32
|
1181 """
|
amine@67
|
1182
|
amine@32
|
1183 SILENCE = 0
|
amine@32
|
1184 POSSIBLE_SILENCE = 1
|
amine@67
|
1185 POSSIBLE_NOISE = 2
|
amine@32
|
1186 NOISE = 3
|
amine@297
|
1187 NORMAL = 0
|
amine@32
|
1188 STRICT_MIN_LENGTH = 2
|
amine@32
|
1189 DROP_TRAILING_SILENCE = 4
|
amine@67
|
1190
|
amine@178
|
def __init__(
    self,
    validator,
    min_length,
    max_length,
    max_continuous_silence,
    init_min=0,
    init_max_silence=0,
    mode=0,
):
    """
    Check arguments and set up the tokenizer.

    Raises
    ------
    TypeError
        if `validator` is neither a callable nor a `DataValidator`.
    ValueError
        if `max_length` <= 0, if `min_length` is not in
        ``(0, max_length]``, if `max_continuous_silence` or `init_min` is
        not smaller than `max_length`, or if `mode` is rejected by
        `_set_mode`.
    """
    # A plain callable is used directly; a DataValidator contributes its
    # `is_valid` method.
    if callable(validator):
        self._is_valid = validator
    elif isinstance(validator, DataValidator):
        self._is_valid = validator.is_valid
    else:
        raise TypeError(
            "'validator' must be a callable or an instance of "
            "DataValidator"
        )

    if max_length <= 0:
        raise ValueError(
            "'max_length' must be > 0 (value={0})".format(max_length)
        )

    if min_length <= 0 or min_length > max_length:
        err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
        raise ValueError(err_msg.format(min_length))

    if max_continuous_silence >= max_length:
        err_msg = "'max_continuous_silence' must be < 'max_length' "
        err_msg += "(value={0})"
        raise ValueError(err_msg.format(max_continuous_silence))

    if init_min >= max_length:
        # Report the offending `init_min` value (the message used to show
        # `max_continuous_silence` by mistake).
        raise ValueError(
            "'init_min' must be < 'max_length' (value={0})".format(init_min)
        )

    self.validator = validator
    self.min_length = min_length
    self.max_length = max_length
    self.max_continuous_silence = max_continuous_silence
    self.init_min = init_min
    # NOTE: stored as `init_max_silent` (not `..._silence`); `_process`
    # reads the attribute under that name.
    self.init_max_silent = init_max_silence
    self._set_mode(mode)
    self._deliver = None
    self._tokens = None
    self._state = None
    self._data = None
    self._contiguous_token = False
    self._init_count = 0
    self._silence_length = 0
    self._start_frame = 0
    self._current_frame = 0
|
amine@67
|
1248
|
amine@297
|
def _set_mode(self, mode):
    """
    Validate `mode` and derive the two behavior flags from it.

    `mode` must be `NORMAL`, `STRICT_MIN_LENGTH`, `DROP_TRAILING_SILENCE`
    or the bitwise OR of the last two; any other value raises a
    `ValueError`.
    """
    both_options = (
        StreamTokenizer.STRICT_MIN_LENGTH
        | StreamTokenizer.DROP_TRAILING_SILENCE
    )
    accepted_modes = [
        StreamTokenizer.NORMAL,
        StreamTokenizer.STRICT_MIN_LENGTH,
        StreamTokenizer.DROP_TRAILING_SILENCE,
        both_options,
    ]
    if mode not in accepted_modes:
        raise ValueError("Wrong value for mode")

    self._mode = mode
    # Each flag is on when the matching bit is set in `mode`.
    self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
    self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0
|
amine@67
|
1262
|
amine@2
|
def _reinitialize(self):
    """Reset the per-run state before a new tokenization pass."""
    self._state = self.SILENCE
    self._contiguous_token = False
    # Fresh containers so that nothing leaks from a previous run.
    self._data = []
    self._tokens = []
    # Starts at -1: `_iter_tokens` increments it after each read, which
    # makes the first frame number 0.
    self._current_frame = -1
    self._deliver = self._append_token
|
amine@67
|
1270
|
amine@177
|
def tokenize(self, data_source, callback=None, generator=False):
    """
    Read data from `data_source`, one frame a time, and process the read
    frames in order to detect sequences of frames that make up valid
    tokens.

    Parameters
    ----------
    data_source : DataSource
        object that implements a `read` method. `read` should return a
        slice of signal, i.e. frame (of whatever type as long as it can be
        processed by validator) and None if there is no more signal.
    callback : callable, default: None
        an optional 3-argument function. If given, it is called as
        ``callback(data, start, end)`` each time a valid token is found.
    generator : bool, default: False
        if True (and no `callback` is given), return a generator of tokens
        instead of a list.

    Returns
    -------
    None if `callback` is given. Otherwise a list of tokens, or a generator
    of tokens if `generator` is True. Each token is a tuple with the
    following elements:

    .. code:: python

        (data, start, end)

    where `data` is a list of read frames, `start`: index of the first
    frame in the original data and `end` : index of the last frame.
    """
    token_gen = self._iter_tokens(data_source)
    if not callback:
        return token_gen if generator else list(token_gen)

    # With a callback, tokens are consumed here and nothing is returned.
    for token in token_gen:
        callback(*token)
    return None
|
amine@67
|
1307
|
amine@177
|
def _iter_tokens(self, data_source):
    """
    Generator that reads frames from `data_source` until `read()` returns
    None and yields every token produced along the way.

    Each frame is passed to `_process`; once the source is exhausted,
    `_post_process` is called once. None results are not yielded.
    """
    self._reinitialize()

    frame = data_source.read()
    self._current_frame += 1
    while frame is not None:
        token = self._process(frame)
        if token is not None:
            yield token
        frame = data_source.read()
        # The counter is also advanced for the final None read.
        self._current_frame += 1

    token = self._post_process()
    if token is not None:
        yield token
|
amine@67
|
1321
|
amine@327
|
def _process(self, frame):  # noqa: C901
    """
    Feed one frame to the automaton and update its state.

    The frame is first checked with `self._is_valid`, then handled
    according to the current state (`SILENCE`, `POSSIBLE_NOISE`, `NOISE`
    or `POSSIBLE_SILENCE`).

    Returns
    -------
    The value returned by `_process_end_of_detection` when this frame ends
    a detection; None (implicitly) otherwise.
    NOTE(review): the `True` argument passed to
    `_process_end_of_detection` presumably flags a token truncated
    because `max_length` was reached -- confirm against that method.
    """

    frame_is_valid = self._is_valid(frame)

    if self._state == self.SILENCE:

        # Invalid frames are simply ignored while in SILENCE.
        if frame_is_valid:
            # seems we got a valid frame after a silence
            self._init_count = 1
            self._silence_length = 0
            self._start_frame = self._current_frame
            self._data.append(frame)

            if self._init_count >= self.init_min:
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)
            else:
                # not enough initial valid frames yet
                self._state = self.POSSIBLE_NOISE

    elif self._state == self.POSSIBLE_NOISE:

        if frame_is_valid:
            self._silence_length = 0
            self._init_count += 1
            self._data.append(frame)
            if self._init_count >= self.init_min:
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)

        else:
            self._silence_length += 1
            if (
                self._silence_length > self.init_max_silent
                or len(self._data) + 1 >= self.max_length
            ):
                # either init_max_silent or max_length is reached
                # before _init_count, back to silence
                self._data = []
                self._state = self.SILENCE
            else:
                self._data.append(frame)

    elif self._state == self.NOISE:

        if frame_is_valid:
            self._data.append(frame)
            if len(self._data) >= self.max_length:
                return self._process_end_of_detection(True)

        elif self.max_continuous_silence <= 0:
            # max token reached at this frame will _deliver if
            # _contiguous_token and not _strict_min_length
            self._state = self.SILENCE
            return self._process_end_of_detection()
        else:
            # this is the first silent frame following a valid one
            # and it is tolerated
            self._silence_length = 1
            self._data.append(frame)
            self._state = self.POSSIBLE_SILENCE
            if len(self._data) == self.max_length:
                return self._process_end_of_detection(True)
                # don't reset _silence_length because we still
                # need to know the total number of silent frames

    elif self._state == self.POSSIBLE_SILENCE:

        if frame_is_valid:
            self._data.append(frame)
            self._silence_length = 0
            self._state = self.NOISE
            if len(self._data) >= self.max_length:
                return self._process_end_of_detection(True)

        else:
            if self._silence_length >= self.max_continuous_silence:
                # tolerated silence is used up: the detection ends here
                self._state = self.SILENCE
                if self._silence_length < len(self._data):
                    # _deliver only gathered frames aren't all silent
                    return self._process_end_of_detection()
                # only silent frames were gathered: drop them
                self._data = []
                self._silence_length = 0
            else:
                self._data.append(frame)
                self._silence_length += 1
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames
|
amine@67
|
1413
|
amine@2
|
def _post_process(self):
    """Deliver the frames still buffered, if they form a pending token.

    NOTE(review): presumably invoked once no more frames are available;
    confirm against the caller.

    Returns
    -------
    token : tuple or None
        whatever `_process_end_of_detection()` returns when the tokenizer
        is in the NOISE or POSSIBLE_SILENCE state and the buffer holds
        more frames than the current count of silent frames; None
        otherwise.
    """
    state = self._state
    if state != self.NOISE and state != self.POSSIBLE_SILENCE:
        # no token is being built in the other states
        return None

    buffered = len(self._data)
    # a buffer made only of silent frames is not worth delivering
    if buffered > 0 and buffered > self._silence_length:
        return self._process_end_of_detection()
    return None
|
amine@67
|
1418
|
amine@2
|
def _process_end_of_detection(self, truncated=False):
    """Close the token under construction and decide whether to deliver it.

    The frame buffer (`self._data`) is always left empty when this method
    returns.

    Parameters
    ----------
    truncated : bool, default: False
        if true, trailing silent frames are kept, and when a token is
        delivered the next token is marked as contiguous with it and
        starts at `self._current_frame + 1`.

    Returns
    -------
    token : tuple or None
        `(data, start_frame, end_frame)` when the buffered frames are
        accepted, None when they are discarded.
    """
    # Trailing silence is only stripped from tokens that were not truncated.
    if not truncated and self._drop_trailing_silence:
        trailing = self._silence_length
        if trailing > 0:
            self._data = self._data[:-trailing]

    frames = self._data
    frame_count = len(frames)

    # Accepted when long enough, or when shorter but directly following a
    # truncated token and `_strict_min_length` is off.
    deliverable = frame_count >= self.min_length or (
        frame_count > 0
        and not self._strict_min_length
        and self._contiguous_token
    )

    # The buffer is emptied whatever the outcome.
    self._data = []

    if not deliverable:
        self._contiguous_token = False
        return None

    first_frame = self._start_frame
    token = (frames, first_frame, first_frame + frame_count - 1)

    if truncated:
        # next token (if any) will start right after the current frame
        self._start_frame = self._current_frame + 1
    # remember whether the next token is contiguous with this one
    self._contiguous_token = bool(truncated)
    return token
|
amine@67
|
1454
|
amine@2
|
def _append_token(self, data, start, end):
    """Store a token as a `(data, start, end)` tuple in `self._tokens`."""
    token = (data, start, end)
    self._tokens.append(token)
|