auditok: auditok/core.py annotate

annotate auditok/core.py @ 413:0a6bc66562d3

Use modern syntax for super everywhere

author	Amine Sehili <amine.sehili@gmail.com>
date	Fri, 21 Jun 2024 20:12:53 +0200
parents	5a6685f1e42d
children	9f83c1ecb03b

rev	line source
amine@33	1 """
amine@368	2 .. autosummary::
amine@368	3 :toctree: generated/
amine@33	4
amine@371	5 load
amine@368	6 split
amine@368	7 AudioRegion
amine@368	8 StreamTokenizer
amine@33	9 """
amine@404	10
amine@404	11 import math
amine@187	12 import os
amine@411	13 import warnings
amine@411	14 from dataclasses import dataclass, field
amine@411	15 from pathlib import Path
amine@404	16
amine@404	17 from .exceptions import TooSmallBlockDuration
amine@404	18 from .io import check_audio_data, get_audio_source, player_for, to_file
amine@404	19 from .plotting import plot
amine@404	20 from .util import AudioEnergyValidator, AudioReader, DataValidator
amine@263	21
amine@263	22 try:
amine@246	23 from . import signal_numpy as signal
amine@246	24 except ImportError:
amine@246	25 from . import signal
amine@246	26
amine@371	27 __all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
amine@179	28
amine@179	29
# Default duration, in seconds, of one analysis window (0.05 s == 50 ms).
# `split` falls back to it when no `analysis_window`/`aw` keyword is given.
DEFAULT_ANALYSIS_WINDOW = 0.05
# Default energy threshold handed to `AudioEnergyValidator` by `split` when
# neither a validator nor an `energy_threshold`/`eth` keyword is given.
DEFAULT_ENERGY_THRESHOLD = 50
# Small value added to `duration / analysis_window` before flooring so that
# float artefacts such as `0.3 / 0.1 == 2.9999999999999996` do not lose a window.
_EPSILON = 1e-10
amine@179	33
amine@179	34
def load(input, skip=0, max_read=None, **kwargs):
    """Load audio data from a source and return it as an :class:`AudioRegion`.

    This is a module-level shortcut that simply delegates to
    :meth:`AudioRegion.load`.

    Parameters
    ----------
    input : None, str, bytes, AudioSource
        source to read audio data from. If `str`, it should be a path to a
        valid audio file. If `bytes`, it is used as raw audio data. If it is
        "-", raw data will be read from stdin. If None, read audio data from
        the microphone using PyAudio. If of type `bytes` or is a path to a
        raw audio file then `sampling_rate`, `sample_width` and `channels`
        parameters (or their alias) are required. If it's an
        :class:`AudioSource` object it's used directly to read data.
    skip : float, default: 0
        amount, in seconds, of audio data to skip from source. If read from
        a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
    max_read : float, default: None
        amount, in seconds, of audio data to read from source. If read from
        microphone, `max_read` should not be None, otherwise a `ValueError`
        is raised.
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
        be used if `input` is a string path to an audio file. If not given,
        audio type will be guessed from file name extension or from file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio file,
        a `bytes` object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or 4.
        Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    large_file : bool, default: False
        If True, AND if `input` is a path to a wav or a raw audio file
        (and only these two formats) then the audio file is not fully loaded
        to memory in order to create the region (but the portion of data
        needed to create the region is of course loaded to memory). Set to
        True if `max_read` is significantly smaller than the size of a large
        audio file that shouldn't be entirely loaded to memory.

    Returns
    -------
    region: AudioRegion

    Raises
    ------
    ValueError
        raised when reading from the microphone (`input` is None) with a
        non-zero `skip` or without an explicit `max_read`: no data can be
        skipped and the maximum amount of data to read must be provided.
    """
    return AudioRegion.load(input, skip=skip, max_read=max_read, **kwargs)
amine@371	90
amine@371	91
def split(
    input,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """
    Split audio data and return a generator of AudioRegions.

    Arguments are validated eagerly (when `split` is called); detection
    itself is lazy and happens while the returned generator is consumed.

    Parameters
    ----------
    input : str, bytes, AudioSource, AudioReader, AudioRegion or None
        input audio data. If str, it should be a path to an existing audio
        file. "-" is interpreted as standard input. If bytes, input is
        considered as raw audio data. If None, read audio from microphone.
        Every object that is not an `AudioReader` will be transformed into an
        `AudioReader` before processing. If it is a `str` that refers to a
        raw audio file, `bytes` or None, audio parameters should be provided
        using kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or
        their alias).
        If `input` is str then audio format will be guessed from file
        extension. `audio_format` (alias `fmt`) kwarg can also be given to
        specify audio format explicitly. If none of these options is
        available, rely on backend (currently only pydub is supported) to
        load data.
    min_dur : float, default: 0.2
        minimum duration in seconds of a detected audio event. By using large
        values for `min_dur`, very short audio events (e.g., very short
        1-word utterances like 'yes' or 'no') can be missed. Using a very
        small value may result in a high number of too short audio events.
    max_dur : float, default: 5
        maximum duration in seconds of a detected audio event. If an audio
        event lasts more than `max_dur` it will be truncated. If the
        continuation of a truncated audio event is shorter than `min_dur`
        then this continuation is accepted as a valid audio event if
        `strict_min_dur` is False. Otherwise it is rejected.
    max_silence : float, default: 0.3
        maximum duration of continuous silence within an audio event. There
        might be many silent gaps of this duration within one audio event. If
        the continuous silence happens at the end of the event then it's kept
        as part of the event if `drop_trailing_silence` is False (default).
    drop_trailing_silence : bool, default: False
        Whether to remove trailing silence from detected events. To avoid
        abrupt cuts in speech, trailing silence should be kept, therefore
        this parameter should be False.
    strict_min_dur : bool, default: False
        strict minimum duration. Do not accept an audio event if it is
        shorter than `min_dur` even if it is contiguous to the latest valid
        event. This happens if the latest detected event had reached
        `max_dur`.

    Other Parameters
    ----------------
    analysis_window, aw : float, default: 0.05 (50 ms)
        duration of analysis window in seconds. A value between 0.01 (10 ms)
        and 0.1 (100 ms) should be good for most use-cases. Ignored when
        `input` is an `AudioReader` (its `block_dur` is used instead).
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
        be used if `input` is a string path to an audio file. If not given,
        audio type will be guessed from file name extension or from file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio file,
        is a bytes object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or 4.
        Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    use_channel, uc : {None, "mix"} or int
        which channel to use for split if `input` has multiple audio
        channels. Regardless of which channel is used for splitting, returned
        audio events contain data from all channels, just as `input`.
        The following values are accepted:

        - None (alias "any"): accept audio activity from any channel, even if
          other channels are silent. This is the default behavior.

        - "mix" ("avg" or "average"): mix down all channels (i.e. compute
          average channel) and split the resulting channel.

        - int: use a single channel, identified by its integer id, for split.

    large_file : bool, default: False
        If True, AND if `input` is a path to a wav or a raw audio file
        (and only these two formats) then audio data is lazily loaded to
        memory (i.e., one analysis window at a time). Otherwise the whole
        file is loaded to memory before split. Set to True if the size of the
        file is larger than available memory.
    max_read, mr : float, default: None, read until end of stream
        maximum data to read from source in seconds.
    validator, val : callable, DataValidator
        custom data validator. If `None` (default), an `AudioEnergyValidator`
        is used with the given energy threshold. Can be a callable or an
        instance of `DataValidator` that implements `is_valid`. In either
        case, it'll be called with a window of audio data as the first
        parameter.
    energy_threshold, eth : float, default: 50
        energy threshold for audio activity detection. Audio regions that
        have enough windows with a signal energy equal to or above this
        threshold are considered valid audio events. Here we are referring to
        this amount as the energy of the signal but to be more accurate, it
        is the log energy computed as:
        `20 * log10(sqrt(dot(x, x) / len(x)))` (see
        :class:`AudioEnergyValidator` and
        :func:`calculate_energy_single_channel`). If `validator` is given,
        this argument is ignored.

    Yields
    ------
    AudioRegion
        a generator of detected :class:`AudioRegion` s.

    Raises
    ------
    ValueError
        if `min_dur`, `max_dur` or `analysis_window` is not strictly
        positive, if `max_silence` is negative, if the analysis window is too
        small for the sampling rate, or if the number of windows derived from
        `min_dur` / `max_silence` is incompatible with the one derived from
        `max_dur`.
    """
    if min_dur <= 0:
        raise ValueError(f"'min_dur' ({min_dur}) must be > 0")
    if max_dur <= 0:
        raise ValueError(f"'max_dur' ({max_dur}) must be > 0")
    if max_silence < 0:
        raise ValueError(f"'max_silence' ({max_silence}) must be >= 0")

    def option(name, alias, default=None):
        # The full keyword wins over its short alias; `default` is only used
        # when neither of them was supplied.
        if name in kwargs:
            return kwargs[name]
        return kwargs.get(alias, default)

    if isinstance(input, AudioReader):
        # A ready-made reader dictates the analysis window.
        source = input
        analysis_window = source.block_dur
    else:
        analysis_window = option("analysis_window", "aw", DEFAULT_ANALYSIS_WINDOW)
        if analysis_window <= 0:
            raise ValueError(
                f"'analysis_window' ({analysis_window}) must be > 0"
            )

        params = kwargs.copy()
        params["max_read"] = option("max_read", "mr")
        params["audio_format"] = option("audio_format", "fmt")
        if isinstance(input, AudioRegion):
            # A region carries its own audio parameters; feed its raw bytes
            # to the reader.
            params["sampling_rate"] = input.sr
            params["sample_width"] = input.sw
            params["channels"] = input.ch
            input = bytes(input)
        try:
            source = AudioReader(input, block_dur=analysis_window, **params)
        except TooSmallBlockDuration as exc:
            raise ValueError(
                f"Too small 'analysis_window' ({exc.block_dur}) for "
                f"sampling rate ({exc.sampling_rate}). Analysis window "
                f"should at least be 1/{exc.sampling_rate} to cover "
                "one data sample"
            ) from exc

    validator = option("validator", "val")
    if validator is None:
        validator = AudioEnergyValidator(
            option("energy_threshold", "eth", DEFAULT_ENERGY_THRESHOLD),
            source.sw,
            source.ch,
            use_channel=option("use_channel", "uc"),
        )

    if drop_trailing_silence:
        mode = StreamTokenizer.DROP_TRAILING_SILENCE
    else:
        mode = 0
    if strict_min_dur:
        mode |= StreamTokenizer.STRICT_MIN_LENGTH

    # Durations are converted to numbers of analysis windows: the minimum is
    # rounded up, maxima are rounded down (with an epsilon against float
    # artefacts).
    min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
    max_length = _duration_to_nb_windows(
        max_dur, analysis_window, math.floor, _EPSILON
    )
    max_continuous_silence = _duration_to_nb_windows(
        max_silence, analysis_window, math.floor, _EPSILON
    )

    template = (
        "({0} sec.) results in {1} analysis window(s) "
        "({1} == {6}({0} / {2})) which is {5} the number "
        "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
    )
    if min_length > max_length:
        raise ValueError(
            ("'min_dur' " + template).format(
                min_dur,
                min_length,
                analysis_window,
                max_length,
                max_dur,
                "higher than",
                "ceil",
            )
        )
    if max_continuous_silence >= max_length:
        raise ValueError(
            ("'max_silence' " + template).format(
                max_silence,
                max_continuous_silence,
                analysis_window,
                max_length,
                max_dur,
                "higher or equal to",
                "floor",
            )
        )

    tokenizer = StreamTokenizer(
        validator, min_length, max_length, max_continuous_silence, mode=mode
    )
    source.open()
    # Each token starts with (frames, index of first frame); regions are
    # built lazily while the caller iterates.
    return (
        _make_audio_region(
            token[0],
            token[1],
            source.block_dur,
            source.sr,
            source.sw,
            source.ch,
        )
        for token in tokenizer.tokenize(source, generator=True)
    )
amine@179	309
amine@179	310
amine@236	311 def _duration_to_nb_windows(
amine@236	312 duration, analysis_window, round_fn=round, epsilon=0
amine@236	313 ):
amine@179	314 """
amine@215	315 Converts a given duration into a positive integer of analysis windows.
amine@179	316 if `duration / analysis_window` is not an integer, the result will be
amine@179	317 rounded to the closest bigger integer. If `duration == 0`, returns `0`.
amine@215	318 If `duration < analysis_window`, returns 1.
amine@179	319 `duration` and `analysis_window` can be in seconds or milliseconds but
amine@179	320 must be in the same unit.
amine@179	321
amine@351	322 Parameters
amine@351	323 ----------
amine@351	324 duration : float
amine@232	325 a given duration in seconds or ms.
amine@179	326 analysis_window: float
amine@232	327 size of analysis window, in the same unit as `duration`.
amine@351	328 round_fn : callable
amine@232	329 function called to round the result. Default: `round`.
amine@351	330 epsilon : float
amine@232	331 small value to add to the division result before rounding.
amine@232	332 E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
amine@232	333 `round_fn=math.floor` returns `2` instead of `3`. Adding a small value
amine@232	334 to `0.3 / 0.1` avoids this error.
amine@179	335
amine@351	336 Returns
amine@351	337 -------
amine@368	338 nb_windows : int
amine@179	339 minimum number of `analysis_window`'s to cover `durartion`. That means
amine@179	340 that `analysis_window * nb_windows >= duration`.
amine@179	341 """
amine@215	342 if duration < 0 or analysis_window <= 0:
amine@215	343 err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
amine@215	344 raise ValueError(err_msg.format(duration, analysis_window))
amine@179	345 if duration == 0:
amine@179	346 return 0
amine@232	347 return int(round_fn(duration / analysis_window + epsilon))
amine@179	348
amine@179	349
def _make_audio_region(
    data_frames,
    start_frame,
    frame_duration,
    sampling_rate,
    sample_width,
    channels,
):
    """
    Helper function to create an `AudioRegion` from parameters returned by
    the tokenization object. It concatenates the frames and sets the region's
    `start` time.

    Parameters
    ----------
    data_frames : iterable of bytes
        analysis windows (raw audio data) that make up the region
    start_frame : int
        index of the first analysis window
    frame_duration : float
        duration of analysis window in seconds
    sampling_rate : int
        sampling rate of audio data
    sample_width : int
        number of bytes of one audio sample
    channels : int
        number of channels of audio data

    Returns
    -------
    audio_region : AudioRegion
        AudioRegion whose start time is calculated as:
        `start_frame * frame_duration`
    """
    region_start = start_frame * frame_duration
    payload = b"".join(data_frames)
    return AudioRegion(
        payload, sampling_rate, sample_width, channels, start=region_start
    )
amine@81	385
amine@81	386
def _read_chunks_online(max_read, **kwargs):
    """
    Helper function to read audio data from an online blocking source
    (i.e., microphone). Used to build an `AudioRegion` and can intercept
    KeyboardInterrupt so that reading stops as soon as this exception is
    raised. Makes building `AudioRegion`s on [i]python sessions and jupyter
    notebooks more user friendly.

    Parameters
    ----------
    max_read : float
        maximum amount of data to read in seconds.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)` where `data` is the
        concatenation of all blocks read before the source was exhausted or
        reading was interrupted.

    See also
    --------
    `AudioRegion.load`
    """
    reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
    reader.open()
    chunks = []
    try:
        while True:
            frame = reader.read()
            if frame is None:
                break
            chunks.append(frame)
    except KeyboardInterrupt:
        # Stop data acquisition from microphone when pressing
        # Ctrl+C on a [i]python session or a notebook; keep what was read.
        pass
    finally:
        # Always release the audio device, including when `read` fails with
        # an unexpected error.
        reader.close()
    return (
        b"".join(chunks),
        reader.sampling_rate,
        reader.sample_width,
        reader.channels,
    )
amine@308	426
amine@308	427
def _read_offline(input, skip=0, max_read=None, **kwargs):
    """
    Helper function to read audio data from an offline source (i.e., file).
    Used to build `AudioRegion`s.

    Parameters
    ----------
    input : str, bytes
        path to audio file (if str), or a bytes object representing raw audio
        data.
    skip : float, default 0
        amount of data, in seconds, to skip from the beginning of audio
        source. `None` or a non-positive value means nothing is skipped.
    max_read : float, default: None
        maximum amount of audio data to read, in seconds. Default: None,
        means read until end of stream. A negative value is treated as None.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)`.

    See also
    --------
    `AudioRegion.load`

    """
    audio_source = get_audio_source(input, **kwargs)
    audio_source.open()
    try:
        if skip is not None and skip > 0:
            # Durations are converted to a number of samples before reading.
            skip_samples = round(skip * audio_source.sampling_rate)
            audio_source.read(skip_samples)
        if max_read is not None:
            if max_read < 0:
                max_read = None
            else:
                max_read = round(max_read * audio_source.sampling_rate)
        data = audio_source.read(max_read)
    finally:
        # Close the source even if reading fails so the underlying file
        # handle is not leaked.
        audio_source.close()
    return (
        data,
        audio_source.sampling_rate,
        audio_source.sample_width,
        audio_source.channels,
    )
amine@308	469
amine@308	470
amine@228	471 def _check_convert_index(index, types, err_msg):
amine@228	472 if not isinstance(index, slice) or index.step is not None:
amine@228	473 raise TypeError(err_msg)
amine@228	474 start = index.start if index.start is not None else 0
amine@228	475 stop = index.stop
amine@228	476 for index in (start, stop):
amine@228	477 if index is not None and not isinstance(index, types):
amine@228	478 raise TypeError(err_msg)
amine@228	479 return start, stop
amine@228	480
amine@228	481
amine@228	482 class _SecondsView:
amine@351	483 """A class to create a view of `AudioRegion` that can be sliced using
amine@351	484 indices in seconds.
amine@351	485 """
amine@351	486
amine@228	487 def __init__(self, region):
amine@228	488 self._region = region
amine@228	489
amine@228	490 def __getitem__(self, index):
amine@228	491 err_msg = "Slicing AudioRegion by seconds requires indices of type "
amine@228	492 err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
amine@228	493 start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
amine@228	494 sr = self._region.sampling_rate
amine@228	495 start_sample = int(start_s * sr)
amine@228	496 stop_sample = None if stop_s is None else round(stop_s * sr)
amine@228	497 return self._region[start_sample:stop_sample]
amine@228	498
amine@245	499 @property
amine@245	500 def len(self):
amine@245	501 """
amine@245	502 Return region duration in seconds.
amine@245	503 """
amine@245	504 return self._region.duration
amine@245	505
amine@228	506
class _MillisView(_SecondsView):
    """A view of an `AudioRegion` that can be sliced using indices expressed
    in milliseconds (e.g., `view[500:1500]`).
    """

    def __getitem__(self, index):
        """Return the sub-region between two positions given in
        milliseconds.

        Bounds must be integers; they are converted to seconds and the
        slicing itself is delegated to `_SecondsView.__getitem__`.
        """
        # The example in the message refers to the milliseconds view
        # (`region.ms`), not to the seconds view.
        err_msg = (
            "Slicing AudioRegion by milliseconds requires indices of type "
            "'int' without a step (e.g. region.ms[500:1500])"
        )
        start_ms, stop_ms = _check_convert_index(index, int, err_msg)
        start_sec = start_ms / 1000
        stop_sec = None if stop_ms is None else stop_ms / 1000
        return super().__getitem__(slice(start_sec, stop_sec))

    def __len__(self):
        """
        Return region duration in milliseconds.
        """
        return round(self._region.duration * 1000)

    @property
    def len(self):
        """
        Return region duration in milliseconds.
        """
        return len(self)
amine@245	535
amine@228	536
amine@244	537 class _AudioRegionMetadata(dict):
amine@387	538 """A class to store `AudioRegion`'s metadata."""
amine@351	539
amine@244	540 def __getattr__(self, name):
amine@411	541 warnings.warn(
amine@411	542 "`AudioRegion.meta` is deprecated and will be removed in future "
amine@411	543 "versions. For the 'start' and 'end' fields, please use "
amine@411	544 "`AudioRegion.start` and `AudioRegion.end`.",
amine@411	545 DeprecationWarning,
amine@411	546 stacklevel=2,
amine@411	547 )
amine@244	548 if name in self:
amine@244	549 return self[name]
amine@244	550 else:
amine@244	551 err_msg = "AudioRegion metadata has no entry '{}'"
amine@244	552 raise AttributeError(err_msg.format(name))
amine@244	553
amine@244	554 def __setattr__(self, name, value):
amine@244	555 self[name] = value
amine@244	556
amine@244	557 def __str__(self):
amine@244	558 return "\n".join("{}: {}".format(k, v) for k, v in self.items())
amine@244	559
amine@244	560 def __repr__(self):
amine@244	561 return str(self)
amine@244	562
amine@244	563
amine@411	564 @dataclass(frozen=True)
amine@81	565 class AudioRegion(object):
amine@368	566 """
amine@368	567 AudioRegion encapsulates raw audio data and provides an interface to
amine@368	568 perform simple operations on it. Use `AudioRegion.load` to build an
amine@368	569 `AudioRegion` from different types of objects.
amine@368	570
amine@368	571 Parameters
amine@368	572 ----------
amine@368	573 data : bytes
amine@368	574 raw audio data as a bytes object
amine@368	575 sampling_rate : int
amine@368	576 sampling rate of audio data
amine@368	577 sample_width : int
amine@368	578 number of bytes of one audio sample
amine@368	579 channels : int
amine@368	580 number of channels of audio data
amine@412	581 start : float, default: None
amine@412	582 optional start time of the region. This is typically provided by the
amine@412	583 `split` function.
amine@368	584
amine@368	585 See also
amine@368	586 --------
amine@368	587 AudioRegion.load
amine@368	588 """
amine@368	589
amine@411	590 data: bytes
amine@411	591 sampling_rate: int
amine@411	592 sample_width: int
amine@411	593 channels: int
amine@411	594 start: float = field(default=None, repr=None)
amine@411	595
amine@411	596 def __post_init__(self):
amine@244	597
amine@411	598 check_audio_data(self.data, self.sample_width, self.channels)
amine@411	599
amine@411	600 object.__setattr__(self, "splitp", self.split_and_plot)
amine@411	601 object.__setattr__(self, "_samples", None)
amine@411	602
amine@411	603 duration = len(self.data) / (
amine@411	604 self.sampling_rate * self.sample_width * self.channels
amine@411	605 )
amine@411	606 object.__setattr__(self, "duration", duration)
amine@411	607
amine@411	608 if self.start is not None:
amine@411	609 object.__setattr__(self, "end", self.start + self.duration)
amine@411	610 object.__setattr__(
amine@411	611 self,
amine@411	612 "meta",
amine@411	613 _AudioRegionMetadata({"start": self.start, "end": self.end}),
amine@411	614 )
amine@411	615 else:
amine@411	616 object.__setattr__(self, "end", None)
amine@411	617 object.__setattr__(self, "meta", None)
amine@411	618
amine@411	619 # `seconds` and `millis` are defined below as @property with docstring
amine@411	620 object.__setattr__(self, "_seconds_view", _SecondsView(self))
amine@411	621 object.__setattr__(self, "_millis_view", _MillisView(self))
amine@411	622
amine@411	623 object.__setattr__(self, "sec", self.seconds)
amine@411	624 object.__setattr__(self, "s", self.seconds)
amine@411	625 object.__setattr__(self, "ms", self.millis)
amine@244	626
amine@239	627 @classmethod
amine@307	628 def load(cls, input, skip=0, max_read=None, **kwargs):
amine@351	629 """
amine@373	630 Create an `AudioRegion` by loading data from `input`. See :func:`load`
amine@373	631 for parameters descripion.
amine@351	632
amine@351	633 Returns
amine@351	634 -------
amine@351	635 region: AudioRegion
amine@351	636
amine@351	637 Raises
amine@351	638 ------
amine@368	639 ValueError
amine@368	640 raised if `input` is None and `skip` != 0 or `max_read` is None.
amine@351	641 """
amine@308	642 if input is None:
amine@351	643 if skip > 0:
amine@351	644 raise ValueError(
amine@351	645 "'skip' should be 0 when reading from microphone"
amine@351	646 )
amine@308	647 if max_read is None or max_read < 0:
amine@308	648 raise ValueError(
amine@333	649 "'max_read' should not be None when reading from "
amine@333	650 "microphone"
amine@308	651 )
amine@308	652 data, sampling_rate, sample_width, channels = _read_chunks_online(
amine@308	653 max_read, **kwargs
amine@307	654 )
amine@239	655 else:
amine@308	656 data, sampling_rate, sample_width, channels = _read_offline(
amine@308	657 input, skip=skip, max_read=max_read, **kwargs
amine@308	658 )
amine@308	659
amine@308	660 return cls(data, sampling_rate, sample_width, channels)
amine@239	661
amine@228	662 @property
amine@351	663 def seconds(self):
amine@373	664 """
amine@404	665 A view to slice audio region by seconds using
amine@404	666 ``region.seconds[start:end]``.
amine@373	667 """
amine@228	668 return self._seconds_view
amine@228	669
amine@228	670 @property
amine@228	671 def millis(self):
amine@404	672 """A view to slice audio region by milliseconds using
amine@404	673 ``region.millis[start:end]``."""
amine@228	674 return self._millis_view
amine@228	675
amine@81	676 @property
amine@81	677 def sr(self):
leminhnguyen@395	678 """Sampling rate of audio data, alias for `sampling_rate`."""
amine@411	679 return self.sampling_rate
amine@81	680
amine@81	681 @property
amine@81	682 def sw(self):
amine@411	683 """Number of bytes per sample, alias for `sample_width`."""
amine@411	684 return self.sample_width
amine@81	685
amine@81	686 @property
amine@81	687 def ch(self):
amine@387	688 """Number of channels of audio data, alias for `channels`."""
amine@411	689 return self.channels
amine@2	690
amine@270	691 def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
amine@351	692 """
amine@351	693 Play audio region.
amine@201	694
amine@351	695 Parameters
amine@351	696 ----------
amine@351	697 progress_bar : bool, default: False
amine@351	698 whether to use a progress bar while playing audio. Default: False.
amine@351	699 `progress_bar` requires `tqdm`, if not installed, no progress bar
amine@351	700 will be shown.
amine@351	701 player : AudioPalyer, default: None
amine@351	702 audio player to use. if None (default), use `player_for()`
amine@201	703 to get a new audio player.
amine@351	704 progress_bar_kwargs : kwargs
amine@351	705 keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
amine@368	706 use `leave=False` to clean up the screen when play finishes).
amine@201	707 """
amine@199	708 if player is None:
amine@199	709 player = player_for(self)
amine@411	710 player.play(self.data, progress_bar=progress_bar, **progress_bar_kwargs)
amine@199	711
def save(
    self, filename, audio_format=None, exists_ok=True, **audio_parameters
):
    """
    Save audio region to file.

    Parameters
    ----------
    filename : str, Path
        path to output audio file. If of type `str`, it may contain
        `{start}`, `{end}`, `{duration}` and `{meta}` placeholders which
        are filled from the attributes of the same name.
    audio_format : str, default: None
        format used to save audio data, passed to `to_file` unchanged.
    exists_ok : bool, default: True
        If False, raise a `FileExistsError` when the (formatted) output
        path already exists. If True, no existence check is made.
    audio_parameters : dict
        any keyword arguments to be passed to the audio saving backend.

    Returns
    -------
    filename : str, Path
        output path; for `str` input, the name with filled placeholders.

    Raises
    ------
    FileExistsError
        if `filename` exists and `exists_ok` is False.

    Examples
    --------
    >>> region = AudioRegion(b'\\0' * 2 * 24000,
    ...                      sampling_rate=16000,
    ...                      sample_width=2,
    ...                      channels=1,
    ...                      start=2.25)
    >>> assert region.end == 3.75
    >>> assert region.save('audio_{start}-{end}.wav') == "audio_2.25-3.75.wav"
    >>> filename = region.save('audio_{start:.3f}-{end:.3f}_{duration:.3f}.wav')
    >>> assert filename == "audio_2.250-3.750_1.500.wav"
    """
    # Pick the existence probe that matches the type of `filename`; any
    # other type is handed to `to_file` without an existence check.
    if isinstance(filename, Path):
        probe = filename.exists
    elif isinstance(filename, str):
        filename = filename.format(
            duration=self.duration,
            meta=self.meta,
            start=self.start,
            end=self.end,
        )
        target = filename

        def probe():
            return os.path.exists(target)

    else:
        probe = None

    # The probe is only evaluated when overwriting is not allowed.
    if probe is not None and not exists_ok and probe():
        raise FileExistsError(
            "file '{filename}' exists".format(filename=str(filename))
        )

    to_file(
        self.data,
        filename,
        audio_format,
        sr=self.sr,
        sw=self.sw,
        ch=self.ch,
        audio_parameters=audio_parameters,
    )
    return filename
amine@187	788
def split(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """Split audio region. See :func:`auditok.split()` for a comprehensive
    description of split parameters.
    See Also :meth:`AudioRegion.split_and_plot`.

    Raises
    ------
    RuntimeWarning
        if `max_read` (or its alias `mr`) is given with a value other than
        None; slice the region before calling this method instead.
    """
    # Check both spellings independently so that an explicit
    # `max_read=None` cannot hide a non-None `mr`.
    if any(kwargs.get(key) is not None for key in ("max_read", "mr")):
        warn_msg = "'max_read' (or 'mr') should not be used with "
        warn_msg += "AudioRegion.split() or AudioRegion.split_and_plot(). "
        warn_msg += "You should rather "
        warn_msg += "slice audio region before calling this method"
        raise RuntimeWarning(warn_msg)
    return split(
        self,
        min_dur=min_dur,
        max_dur=max_dur,
        max_silence=max_silence,
        drop_trailing_silence=drop_trailing_silence,
        strict_min_dur=strict_min_dur,
        **kwargs,
    )
amine@248	816
def plot(
    self,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
):
    """Plot audio region using one sub-plot per each channel.

    All arguments are forwarded by keyword to the module-level `plot`
    function, with this region as its first argument.

    Parameters
    ----------
    scale_signal : bool, default: True
        if true, scale signal by subtracting its mean and dividing by its
        standard deviation before plotting.
    show : bool
        whether to show plotted signal right after the call.
    figsize : tuple, default: None
        width and height of the figure to pass to `matplotlib`.
    save_as : str, default None.
        if provided, also save plot to file.
    dpi : int, default: 120
        plot dpi to pass to `matplotlib`.
    theme : str or dict, default: "auditok"
        plot theme to use. Currently only "auditok" theme is implemented. To
        provide your own theme see
        :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
    """
    plot_options = {
        "scale_signal": scale_signal,
        "show": show,
        "figsize": figsize,
        "save_as": save_as,
        "dpi": dpi,
        "theme": theme,
    }
    # Inside a method body `plot` resolves to the module-level function,
    # not to this method.
    plot(self, **plot_options)
amine@250	854
def split_and_plot(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
    **kwargs,
):
    """Split region and plot signal and detections. Alias: :meth:`splitp`.
    See :func:`auditok.split()` for a comprehensive description of split
    parameters. Also see :meth:`plot` for plot parameters.

    Returns
    -------
    regions : list
        the regions produced by :meth:`split`, materialized as a list.
    """
    regions = list(
        self.split(
            min_dur=min_dur,
            max_dur=max_dur,
            max_silence=max_silence,
            drop_trailing_silence=drop_trailing_silence,
            strict_min_dur=strict_min_dur,
            **kwargs,
        )
    )
    # Lazily built (start, end) pairs handed to the plotting function.
    detections = ((reg.meta.start, reg.meta.end) for reg in regions)

    # The threshold drawn on the plot is the one given for splitting
    # (full name takes precedence over the `eth` alias), else the default.
    if "energy_threshold" in kwargs:
        threshold = kwargs["energy_threshold"]
    else:
        threshold = kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)

    plot(
        self,
        scale_signal=scale_signal,
        detections=detections,
        energy_threshold=threshold,
        show=show,
        figsize=figsize,
        save_as=save_as,
        dpi=dpi,
        theme=theme,
    )
    return regions
amine@250	899
@property
def samples(self):
    """Audio region as arrays of samples, one array per channel.

    Returns `self._samples` when it is already set; otherwise converts
    `self.data` with `signal.to_array` using the region's sample width and
    number of channels.
    """
    cached = self._samples
    if cached is not None:
        # The original code fell through to an unbound local here and
        # raised UnboundLocalError.
        return cached
    # NOTE(review): the converted array is not stored back on the instance
    # because it is not visible from here whether attribute assignment is
    # allowed on this class (e.g. a frozen dataclass) -- confirm before
    # adding caching.
    return signal.to_array(self.data, self.sample_width, self.channels)
amine@246	908
amine@408	909 def __array__(self):
amine@408	910 return self.samples
amine@408	911
def numpy(self):
    """Return the value of the `samples` property."""
    as_arrays = self.samples
    return as_arrays
amine@408	914
amine@82	915 def __len__(self):
amine@85	916 """
amine@245	917 Return region length in number of samples.
amine@85	918 """
amine@411	919 return len(self.data) // (self.sample_width * self.channels)
amine@245	920
@property
def len(self):
    """
    Region length in number of samples (same value as `len(region)`).
    """
    return self.__len__()
amine@82	927
amine@83	928 def __bytes__(self):
amine@411	929 return self.data
amine@83	930
amine@244	931 def __str__(self):
amine@178	932 return (
amine@244	933 "AudioRegion(duration={:.3f}, "
amine@178	934 "sampling_rate={}, sample_width={}, channels={})".format(
amine@244	935 self.duration, self.sr, self.sw, self.ch
amine@178	936 )
amine@178	937 )
amine@83	938
amine@244	939 def __repr__(self):
amine@409	940 return "<{}>".format(str(self))
amine@83	941
def __add__(self, other):
    """
    Concatenate this region and `other` and return a new region.

    Both regions must have the same sampling rate, sample width and
    number of channels; otherwise a `ValueError` is raised. A `TypeError`
    is raised if `other` is not an `AudioRegion`.
    """
    if not isinstance(other, AudioRegion):
        raise TypeError(
            "Can only concatenate AudioRegion, "
            'not "{}"'.format(type(other))
        )

    # Checked in this order: sampling rate, sample width, channels.
    parameters = (
        ("sampling rate", self.sr, other.sr),
        ("sample width", self.sw, other.sw),
        ("number of channels", self.ch, other.ch),
    )
    for label, mine, theirs in parameters:
        if theirs != mine:
            raise ValueError(
                "Can only concatenate AudioRegions of the same "
                "{} ({} != {})".format(label, mine, theirs)
            )

    return AudioRegion(self.data + other.data, self.sr, self.sw, self.ch)
amine@87	970
amine@87	971 def __radd__(self, other):
amine@87	972 """
amine@87	973 Concatenates `other` and this region. `other` should be an
amine@87	974 `AudioRegion` with the same audio parameters as this region
amine@87	975 but can exceptionally be `0` to make it possible to concatenate
amine@87	976 many regions with `sum`.
amine@87	977 """
amine@87	978 if other == 0:
amine@87	979 return self
amine@87	980 return other.add(self)
amine@87	981
amine@195	982 def __mul__(self, n):
amine@195	983 if not isinstance(n, int):
amine@195	984 err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
amine@195	985 raise TypeError(err_msg.format(type(n)))
amine@411	986 data = self.data * n
amine@244	987 return AudioRegion(data, self.sr, self.sw, self.ch)
amine@195	988
amine@195	989 def __rmul__(self, n):
amine@195	990 return self * n
amine@195	991
amine@247	992 def __truediv__(self, n):
amine@247	993 if not isinstance(n, int) or n <= 0:
amine@387	994 raise TypeError("AudioRegion can only be divided by a positive int")
amine@253	995 samples_per_sub_region, rest = divmod(len(self), n)
amine@253	996 onset = 0
amine@247	997 sub_regions = []
amine@253	998 while onset < len(self):
amine@253	999 offset = 0
amine@253	1000 if rest > 0:
amine@253	1001 offset = 1
amine@253	1002 rest -= 1
amine@253	1003 offset += onset + samples_per_sub_region
amine@253	1004 sub_regions.append(self[onset:offset])
amine@253	1005 onset = offset
amine@247	1006 return sub_regions
amine@247	1007
amine@198	1008 def __eq__(self, other):
amine@198	1009 if other is self:
amine@198	1010 return True
amine@198	1011 if not isinstance(other, AudioRegion):
amine@198	1012 return False
amine@198	1013 return (
amine@411	1014 (self.data == other.data)
amine@198	1015 and (self.sr == other.sr)
amine@198	1016 and (self.sw == other.sw)
amine@198	1017 and (self.ch == other.ch)
amine@198	1018 )
amine@198	1019
def __getitem__(self, index):
    """
    Slice the region by sample indices and return a new `AudioRegion`.

    `index` is checked and unpacked by `_check_convert_index`, which is
    given `int` as the accepted index type and the error message to use.
    Negative start/stop values are counted from the end of the region and
    clamped to 0.
    """
    err_msg = "Slicing AudioRegion by samples requires indices of type "
    # The example used to show `region.sec[...]`, which is the seconds
    # view; sample slicing is done on the region itself.
    err_msg += "'int' without a step (e.g. region[1600:3200])"
    start_sample, stop_sample = _check_convert_index(index, int, err_msg)

    bytes_per_sample = self.sample_width * self.channels
    len_samples = len(self.data) // bytes_per_sample

    if start_sample < 0:
        start_sample = max(start_sample + len_samples, 0)
    onset = start_sample * bytes_per_sample

    if stop_sample is None:
        offset = None
    else:
        if stop_sample < 0:
            stop_sample = max(stop_sample + len_samples, 0)
        # Use the normalized stop; the original code computed it and then
        # ignored it in favour of the raw `index.stop`.
        offset = stop_sample * bytes_per_sample

    data = self.data[onset:offset]
    return AudioRegion(data, self.sr, self.sw, self.ch)
amine@188	1041
amine@2	1042
class StreamTokenizer:
    """
    Class for stream tokenizers. It implements a 4-state automaton scheme
    to extract sub-sequences of interest on the fly.

    Parameters
    ----------
    validator : callable, DataValidator (must implement `is_valid`)
        called with each data frame read from source. Should take one
        positional argument and return True or False for valid and invalid
        frames respectively.

    min_length : int
        Minimum number of frames of a valid token. This includes all
        tolerated non valid frames within the token.

    max_length : int
        Maximum number of frames of a valid token. This includes all
        tolerated non valid frames within the token.

    max_continuous_silence : int
        Maximum number of consecutive non-valid frames within a token.
        Note that, within a valid token, there may be many tolerated
        silent regions that contain each a number of non valid frames up
        to `max_continuous_silence`.

    init_min : int
        Minimum number of consecutive valid frames that must be
        initially gathered before any sequence of non valid frames can
        be tolerated. This option is not always needed, it can be used to
        drop non-valid tokens as early as possible. Default = 0 means
        that the option is by default ineffective.

    init_max_silence : int
        Maximum number of tolerated consecutive non-valid frames if the
        number of already gathered valid frames has not yet reached
        `init_min`. This argument is normally used if `init_min` is used.
        Default = 0, by default this argument is not taken into
        consideration.

    mode : int
        mode can be one of the following:

        1. `StreamTokenizer.NORMAL`: do not drop trailing silence, and
           accept a token shorter than `min_length` if it is the
           continuation of the latest delivered token.

        2. `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
           because `max_length` is reached, and token `i+1` is immediately
           adjacent to token `i` (i.e. token `i` ends at frame `k` and
           token `i+1` starts at frame `k+1`) then accept token `i+1` only
           if it has a size of at least `min_length`. The default behavior
           is to accept token `i+1` even if it is shorter than
           `min_length` (provided that the above conditions are fulfilled
           of course).

        3. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing
           non-valid frames from a token to be delivered if and only if it
           is not truncated. This can be a bit tricky. A token is actually
           delivered if:

           - `max_continuous_silence` is reached.

           - Its length reaches `max_length`. This is referred to as a
             truncated token.

           In the current implementation, a `StreamTokenizer`'s decision
           is only based on already seen data and on incoming data. Thus,
           if a token is truncated at a non-valid but tolerated frame
           (`max_length` is reached but `max_continuous_silence` not yet)
           any trailing silence will be kept because it can potentially be
           part of a valid token (if `max_length` was bigger). But if
           `max_continuous_silence` is reached before `max_length`, the
           delivered token will not be considered as truncated but a
           result of normal end of detection (i.e. no more valid data).
           In that case the trailing silence can be removed if you use the
           `StreamTokenizer.DROP_TRAILING_SILENCE` mode.

        4. `(StreamTokenizer.STRICT_MIN_LENGTH |
           StreamTokenizer.DROP_TRAILING_SILENCE)`: use both options. That
           means: first remove trailing silence, then check if the token
           still has a length of at least `min_length`.

    Raises
    ------
    TypeError
        if `validator` is neither callable nor a `DataValidator`.
    ValueError
        if the length/silence parameters are inconsistent or `mode` is not
        one of the accepted values.

    Examples
    --------

    In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
    accepted although it is shorter than `min_length` (3), because it
    immediately follows the latest delivered token:

    >>> from auditok.core import StreamTokenizer
    >>> from auditok.util import StringDataSource, DataValidator

    >>> class UpperCaseChecker(DataValidator):
    ...     def is_valid(self, frame):
    ...         return frame.isupper()
    >>> dsource = StringDataSource("aaaAAAABBbbb")
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3,
    ...                             max_length=4,
    ...                             max_continuous_silence=0)
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]


    The following tokenizer will however reject the 'BB' token:

    >>> dsource = StringDataSource("aaaAAAABBbbb")
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3, max_length=4,
    ...                             max_continuous_silence=0,
    ...                             mode=StreamTokenizer.STRICT_MIN_LENGTH)
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'A'], 3, 6)]


    >>> tokenizer = StreamTokenizer(
    ...     validator=UpperCaseChecker(),
    ...     min_length=3,
    ...     max_length=6,
    ...     max_continuous_silence=3,
    ...     mode=StreamTokenizer.DROP_TRAILING_SILENCE
    ... )
    >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]

    The first token is delivered with its trailing silence because it is
    truncated while the second one has its trailing frames removed.

    Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:

    .. code:: python

        [
            (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
            (['B', 'B', 'b', 'b', 'b'], 9, 13)
        ]

    """

    # Automaton states.
    SILENCE = 0
    POSSIBLE_SILENCE = 1
    POSSIBLE_NOISE = 2
    NOISE = 3

    # Mode flags (STRICT_MIN_LENGTH and DROP_TRAILING_SILENCE can be OR-ed).
    NORMAL = 0
    STRICT_MIN_LENGTH = 2
    DROP_TRAILING_SILENCE = 4

    def __init__(
        self,
        validator,
        min_length,
        max_length,
        max_continuous_silence,
        init_min=0,
        init_max_silence=0,
        mode=0,
    ):
        # A plain callable is used directly; otherwise fall back to the
        # `is_valid` method of a DataValidator.
        if callable(validator):
            self._is_valid = validator
        elif isinstance(validator, DataValidator):
            self._is_valid = validator.is_valid
        else:
            raise TypeError(
                "'validator' must be a callable or an instance of "
                "DataValidator"
            )

        if max_length <= 0:
            raise ValueError(
                "'max_length' must be > 0 (value={0})".format(max_length)
            )

        if min_length <= 0 or min_length > max_length:
            err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
            raise ValueError(err_msg.format(min_length))

        if max_continuous_silence >= max_length:
            err_msg = "'max_continuous_silence' must be < 'max_length' "
            err_msg += "(value={0})"
            raise ValueError(err_msg.format(max_continuous_silence))

        if init_min >= max_length:
            # Report the offending `init_min` value (the message used to
            # show `max_continuous_silence` by mistake).
            raise ValueError(
                "'init_min' must be < 'max_length' (value={0})".format(
                    init_min
                )
            )

        self.validator = validator
        self.min_length = min_length
        self.max_length = max_length
        self.max_continuous_silence = max_continuous_silence
        self.init_min = init_min
        self.init_max_silent = init_max_silence
        self._set_mode(mode)
        self._deliver = None
        self._tokens = None
        self._state = None
        self._data = None
        self._contiguous_token = False
        self._init_count = 0
        self._silence_length = 0
        self._start_frame = 0
        self._current_frame = 0

    def _set_mode(self, mode):
        """Validate `mode` and derive the two boolean option flags from it."""
        strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
        strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
        if mode not in [
            StreamTokenizer.NORMAL,
            StreamTokenizer.STRICT_MIN_LENGTH,
            StreamTokenizer.DROP_TRAILING_SILENCE,
            strict_min_and_drop_trailing,
        ]:
            raise ValueError("Wrong value for mode")
        self._mode = mode
        self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
        self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0

    def _reinitialize(self):
        """Reset the per-run state before a new tokenization starts."""
        self._contiguous_token = False
        self._data = []
        self._tokens = []
        self._state = self.SILENCE
        # Incremented before each frame is processed, so the first frame
        # gets index 0.
        self._current_frame = -1
        self._deliver = self._append_token

    def tokenize(self, data_source, callback=None, generator=False):
        """
        Read data from `data_source`, one frame a time, and process the read
        frames in order to detect sequences of frames that make up valid
        tokens.

        Parameters
        ----------
        data_source : object with a `read` method
            `read` should return a slice of signal, i.e. a frame (of
            whatever type as long as it can be processed by the validator)
            and None if there is no more signal.
        callback : callable, default: None
            an optional 3-argument function. If given, it is called as
            `callback(data, start, end)` each time a valid token is found
            and nothing is returned.
        generator : bool, default: False
            if True (and no `callback` is given), return a generator of
            tokens instead of a list.

        Returns
        -------
        A list of tokens (or a generator of tokens if `generator` is True)
        when `callback` is not given, None otherwise. Each token is a tuple
        with the following elements:

        .. code:: python

            (data, start, end)

        where `data` is a list of read frames, `start`: index of the first
        frame in the original data and `end` : index of the last frame.
        """
        token_gen = self._iter_tokens(data_source)
        if callback:
            for token in token_gen:
                callback(*token)
            return
        if generator:
            return token_gen
        return list(token_gen)

    def _iter_tokens(self, data_source):
        """Yield tokens as they are detected while reading `data_source`."""
        self._reinitialize()
        while True:
            frame = data_source.read()
            self._current_frame += 1
            if frame is None:
                # End of data: deliver what is pending, if acceptable.
                token = self._post_process()
                if token is not None:
                    yield token
                break
            token = self._process(frame)
            if token is not None:
                yield token

    def _process(self, frame):  # noqa: C901
        """Feed one frame to the automaton; return a token if one is ready."""

        frame_is_valid = self._is_valid(frame)

        if self._state == self.SILENCE:

            if frame_is_valid:
                # seems we got a valid frame after a silence
                self._init_count = 1
                self._silence_length = 0
                self._start_frame = self._current_frame
                self._data.append(frame)

                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        return self._process_end_of_detection(True)
                else:
                    self._state = self.POSSIBLE_NOISE

        elif self._state == self.POSSIBLE_NOISE:

            if frame_is_valid:
                self._silence_length = 0
                self._init_count += 1
                self._data.append(frame)
                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        return self._process_end_of_detection(True)

            else:
                self._silence_length += 1
                if (
                    self._silence_length > self.init_max_silent
                    or len(self._data) + 1 >= self.max_length
                ):
                    # either init_max_silent or max_length is reached
                    # before _init_count, back to silence
                    self._data = []
                    self._state = self.SILENCE
                else:
                    self._data.append(frame)

        elif self._state == self.NOISE:

            if frame_is_valid:
                self._data.append(frame)
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)

            elif self.max_continuous_silence <= 0:
                # max token reached at this frame will _deliver if
                # _contiguous_token and not _strict_min_length
                self._state = self.SILENCE
                return self._process_end_of_detection()
            else:
                # this is the first silent frame following a valid one
                # and it is tolerated
                self._silence_length = 1
                self._data.append(frame)
                self._state = self.POSSIBLE_SILENCE
                if len(self._data) == self.max_length:
                    return self._process_end_of_detection(True)
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames

        elif self._state == self.POSSIBLE_SILENCE:

            if frame_is_valid:
                self._data.append(frame)
                self._silence_length = 0
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)

            else:
                if self._silence_length >= self.max_continuous_silence:
                    self._state = self.SILENCE
                    if self._silence_length < len(self._data):
                        # _deliver only if gathered frames aren't all silent
                        return self._process_end_of_detection()
                    self._data = []
                    self._silence_length = 0
                else:
                    self._data.append(frame)
                    self._silence_length += 1
                    if len(self._data) >= self.max_length:
                        return self._process_end_of_detection(True)
                        # don't reset _silence_length because we still
                        # need to know the total number of silent frames

    def _post_process(self):
        """Deliver the pending token, if any, once the source is exhausted."""
        if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
            if len(self._data) > 0 and len(self._data) > self._silence_length:
                return self._process_end_of_detection()

    def _process_end_of_detection(self, truncated=False):
        """Finalize the gathered frames; return a token or None if rejected.

        `truncated` is True when delivery is triggered by `max_length`.
        """

        if (
            not truncated
            and self._drop_trailing_silence
            and self._silence_length > 0
        ):
            # happens if max_continuous_silence is reached
            # or max_length is reached at a silent frame
            self._data = self._data[0 : -self._silence_length]

        if (len(self._data) >= self.min_length) or (
            len(self._data) > 0
            and not self._strict_min_length
            and self._contiguous_token
        ):

            start_frame = self._start_frame
            end_frame = self._start_frame + len(self._data) - 1
            data = self._data
            self._data = []
            token = (data, start_frame, end_frame)

            if truncated:
                # next token (if any) will start at _current_frame + 1
                self._start_frame = self._current_frame + 1
                # remember that it is contiguous with the just delivered one
                self._contiguous_token = True
            else:
                self._contiguous_token = False
            return token
        else:
            self._contiguous_token = False

        self._data = []

    def _append_token(self, data, start, end):
        """Store a `(data, start, end)` token in the internal token list."""
        self._tokens.append((data, start, end))

Mercurial > hg > auditok

annotate auditok/core.py @ 413:0a6bc66562d3