annotate auditok/core.py @ 413:0a6bc66562d3

Use modern syntax for super everywhere
author Amine Sehili <amine.sehili@gmail.com>
date Fri, 21 Jun 2024 20:12:53 +0200
parents 5a6685f1e42d
children 9f83c1ecb03b
rev   line source
amine@33 1 """
amine@368 2 .. autosummary::
amine@368 3 :toctree: generated/
amine@33 4
amine@371 5 load
amine@368 6 split
amine@368 7 AudioRegion
amine@368 8 StreamTokenizer
amine@33 9 """
amine@404 10
amine@404 11 import math
amine@187 12 import os
amine@411 13 import warnings
amine@411 14 from dataclasses import dataclass, field
amine@411 15 from pathlib import Path
amine@404 16
amine@404 17 from .exceptions import TooSmallBlockDuration
amine@404 18 from .io import check_audio_data, get_audio_source, player_for, to_file
amine@404 19 from .plotting import plot
amine@404 20 from .util import AudioEnergyValidator, AudioReader, DataValidator
amine@263 21
amine@263 22 try:
amine@246 23 from . import signal_numpy as signal
amine@246 24 except ImportError:
amine@246 25 from . import signal
amine@246 26
amine@371 27 __all__ = ["load", "split", "AudioRegion", "StreamTokenizer"]
amine@179 28
amine@179 29
# Default duration of one analysis window, in seconds (50 ms). `split` falls
# back to it when neither `analysis_window` nor `aw` is given.
DEFAULT_ANALYSIS_WINDOW = 0.05

# Default energy threshold `split` hands to `AudioEnergyValidator` when no
# custom validator and no `energy_threshold` / `eth` is given.
DEFAULT_ENERGY_THRESHOLD = 50

# Tiny offset added to `duration / analysis_window` before flooring, so that
# quotients such as 0.3 / 0.1 (== 2.9999999999999996) are not rounded down.
_EPSILON = 1e-10
amine@179 33
amine@179 34
def load(input, skip=0, max_read=None, **kwargs):
    """Read audio data from `input` and return it as an :class:`AudioRegion`.

    Module-level shortcut that forwards all its arguments to
    :meth:`AudioRegion.load`.

    Parameters
    ----------
    input : None, str, bytes, AudioSource
        where to read audio data from. A `str` should be the path to a valid
        audio file ("-" means: read raw data from stdin). A `bytes` object is
        used as raw audio data. None means: read from the microphone using
        PyAudio. An :class:`AudioSource` object is used directly to read
        data. For `bytes` input or a path to a raw audio file,
        `sampling_rate`, `sample_width` and `channels` (or their aliases)
        are required.
    skip : float, default: 0
        amount of audio data, in seconds, to skip from the beginning of the
        source. Must be 0 when reading from the microphone, otherwise a
        `ValueError` is raised.
    max_read : float, default: None
        amount of audio data, in seconds, to read from the source. Must not
        be None when reading from the microphone, otherwise a `ValueError`
        is raised.
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). Only used if
        `input` is a string path to an audio file. If not given, the audio
        type is guessed from the file name extension or from the file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio
        file, a `bytes` object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or
        4. Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    large_file : bool, default: False
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
        (and **only** these two formats), the file is not fully loaded to
        memory to create the region; only the portion needed for the region
        is. Set to True if `max_read` is significantly smaller than the
        size of a large audio file that shouldn't be entirely loaded to
        memory.

    Returns
    -------
    region: AudioRegion

    Raises
    ------
    ValueError
        raised if `input` is None (i.e., read from the microphone) and
        either `skip` != 0 or `max_read` is None: when reading from the
        microphone no data can be skipped and the amount of data to read
        must be given explicitly.
    """
    region = AudioRegion.load(input, skip, max_read, **kwargs)
    return region
amine@371 90
amine@371 91
def split(
    input,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """
    Detect audio events in `input` and return a generator of
    :class:`AudioRegion` objects, one per detected event.

    Arguments are validated and the audio source is opened when `split` is
    called; regions are then produced lazily while the generator is consumed.

    Parameters
    ----------
    input : str, bytes, AudioSource, AudioReader, AudioRegion or None
        input audio data. If str, it should be a path to an existing audio
        file ("-" is interpreted as standard input). If bytes, input is
        considered as raw audio data. If None, read audio from microphone.
        Anything that is not already an `AudioReader` is wrapped into an
        `AudioReader` before processing; an `AudioReader` is used as is and
        its `block_dur` is taken as the analysis window. If `input` is a
        `str` that refers to a raw audio file, `bytes` or None, audio
        parameters must be provided through kwargs (i.e., `sampling_rate`,
        `sample_width` and `channels` or their aliases).
        If `input` is str, the audio format is guessed from the file
        extension. The `audio_format` (alias `fmt`) kwarg can also be given
        to specify the format explicitly. If none of these options is
        available, rely on the backend (currently only pydub is supported)
        to load data.
    min_dur : float, default: 0.2
        minimum duration in seconds of a detected audio event. With large
        values of `min_dur`, very short audio events (e.g., 1-word
        utterances like 'yes' or 'no') can be missed. A very small value may
        result in a high number of too short audio events.
    max_dur : float, default: 5
        maximum duration in seconds of a detected audio event. An event that
        lasts more than `max_dur` is truncated. If the continuation of a
        truncated event is shorter than `min_dur`, it is accepted as a valid
        audio event if `strict_min_dur` is False, and rejected otherwise.
    max_silence : float, default: 0.3
        maximum duration of continuous silence within an audio event. There
        might be many silent gaps of this duration within one audio event.
        If the continuous silence happens at the end of the event then it is
        kept as part of the event if `drop_trailing_silence` is False
        (default).
    drop_trailing_silence : bool, default: False
        whether to remove trailing silence from detected events. To avoid
        abrupt cuts in speech, trailing silence should be kept, therefore
        this parameter should be False.
    strict_min_dur : bool, default: False
        strict minimum duration. Do not accept an audio event shorter than
        `min_dur` even if it is contiguous to the latest valid event. This
        happens if the latest detected event had reached `max_dur`.

    Other Parameters
    ----------------
    analysis_window, aw : float, default: 0.05 (50 ms)
        duration of analysis window in seconds. A value between 0.01 (10 ms)
        and 0.1 (100 ms) should be good for most use-cases. Ignored when
        `input` is an `AudioReader`.
    audio_format, fmt : str
        type of audio data (e.g., wav, ogg, flac, raw, etc.). Only used if
        `input` is a string path to an audio file. If not given, the audio
        type is guessed from the file name extension or from the file
        header.
    sampling_rate, sr : int
        sampling rate of audio data. Required if `input` is a raw audio
        file, a bytes object or None (i.e., read from microphone).
    sample_width, sw : int
        number of bytes used to encode one audio sample, typically 1, 2 or
        4. Required for raw data, see `sampling_rate`.
    channels, ch : int
        number of channels of audio data. Required for raw data, see
        `sampling_rate`.
    use_channel, uc : {None, "mix"} or int
        which channel to use for split if `input` has multiple audio
        channels. Regardless of which channel is used for splitting,
        returned audio events contain data from *all* channels, just as
        `input`. The following values are accepted:

        - None (alias "any"): accept audio activity from any channel, even
          if other channels are silent. This is the default behavior.

        - "mix" ("avg" or "average"): mix down all channels (i.e. compute
          average channel) and split the resulting channel.

        - int (0 <= id < `channels`): use one channel, specified by its
          integer id, for split.

    large_file : bool, default: False
        If True, AND if `input` is a path to a *wav* or a *raw* audio file
        (and only these two formats), audio data is lazily loaded to memory
        (i.e., one analysis window at a time). Otherwise the whole file is
        loaded to memory before split. Set to True if the size of the file
        is larger than available memory.
    max_read, mr : float, default: None, read until end of stream
        maximum data to read from source in seconds.
    validator, val : callable, DataValidator
        custom data validator. If `None` (default), an
        `AudioEnergyValidator` is used with the given energy threshold. Can
        be a callable or an instance of `DataValidator` that implements
        `is_valid`. In either case, it is called with a window of audio data
        as the first parameter.
    energy_threshold, eth : float, default: 50
        energy threshold for audio activity detection. Audio regions that
        have enough windows with a signal energy equal to or above this
        threshold are considered valid audio events. What is called energy
        here is, to be more accurate, the log energy computed as:
        `20 * log10(sqrt(dot(x, x) / len(x)))` (see
        :class:`AudioEnergyValidator` and
        :func:`calculate_energy_single_channel`). If `validator` is given,
        this argument is ignored.

    Returns
    -------
    generator of AudioRegion
        a generator of detected :class:`AudioRegion` s.

    Raises
    ------
    ValueError
        if `min_dur` or `max_dur` is not > 0, if `max_silence` is negative,
        if the analysis window is not > 0 or is too small for the sampling
        rate, if `min_dur` covers more analysis windows than `max_dur`, or
        if `max_silence` covers at least as many analysis windows as
        `max_dur`.
    """
    if min_dur <= 0:
        raise ValueError(f"'min_dur' ({min_dur}) must be > 0")
    if max_dur <= 0:
        raise ValueError(f"'max_dur' ({max_dur}) must be > 0")
    if max_silence < 0:
        raise ValueError(f"'max_silence' ({max_silence}) must be >= 0")

    if isinstance(input, AudioReader):
        # A ready-made reader dictates the analysis window.
        source = input
        analysis_window = source.block_dur
    else:
        analysis_window = kwargs.get(
            "analysis_window", kwargs.get("aw", DEFAULT_ANALYSIS_WINDOW)
        )
        if analysis_window <= 0:
            raise ValueError(
                f"'analysis_window' ({analysis_window}) must be > 0"
            )

        # Resolve aliases on a copy so the caller's kwargs stay untouched.
        params = dict(kwargs)
        params.setdefault("max_read", params.get("mr"))
        params.setdefault("audio_format", params.get("fmt"))
        if isinstance(input, AudioRegion):
            # A region carries its own audio parameters; feed its raw bytes
            # to the reader.
            params.update(
                sampling_rate=input.sr,
                sample_width=input.sw,
                channels=input.ch,
            )
            input = bytes(input)
        try:
            source = AudioReader(input, block_dur=analysis_window, **params)
        except TooSmallBlockDuration as exc:
            err_msg = (
                f"Too small 'analysis_window' ({exc.block_dur}) for "
                f"sampling rate ({exc.sampling_rate}). Analysis window "
                f"should at least be 1/{exc.sampling_rate} to cover "
                "one data sample"
            )
            raise ValueError(err_msg) from exc

    validator = kwargs.get("validator", kwargs.get("val"))
    if validator is None:
        # No custom validator: fall back to an energy-based one.
        energy_threshold = kwargs.get(
            "energy_threshold", kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)
        )
        use_channel = kwargs.get("use_channel", kwargs.get("uc"))
        validator = AudioEnergyValidator(
            energy_threshold, source.sw, source.ch, use_channel=use_channel
        )

    if drop_trailing_silence:
        mode = StreamTokenizer.DROP_TRAILING_SILENCE
    else:
        mode = 0
    if strict_min_dur:
        mode |= StreamTokenizer.STRICT_MIN_LENGTH

    # Durations are converted to numbers of analysis windows: `min_dur` is
    # rounded up, `max_dur` and `max_silence` are rounded down.
    min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil)
    max_length = _duration_to_nb_windows(
        max_dur, analysis_window, math.floor, _EPSILON
    )
    max_continuous_silence = _duration_to_nb_windows(
        max_silence, analysis_window, math.floor, _EPSILON
    )

    template = (
        "({0} sec.) results in {1} analysis window(s) "
        "({1} == {6}({0} / {2})) which is {5} the number "
        "of analysis window(s) for 'max_dur' ({3} == floor({4} / {2}))"
    )
    if min_length > max_length:
        raise ValueError(
            ("'min_dur' " + template).format(
                min_dur,
                min_length,
                analysis_window,
                max_length,
                max_dur,
                "higher than",
                "ceil",
            )
        )

    if max_continuous_silence >= max_length:
        raise ValueError(
            ("'max_silence' " + template).format(
                max_silence,
                max_continuous_silence,
                analysis_window,
                max_length,
                max_dur,
                "higher or equal to",
                "floor",
            )
        )

    tokenizer = StreamTokenizer(
        validator, min_length, max_length, max_continuous_silence, mode=mode
    )
    source.open()
    tokens = tokenizer.tokenize(source, generator=True)
    # token[0] holds the data frames, token[1] the index of the first frame.
    return (
        _make_audio_region(
            token[0],
            token[1],
            source.block_dur,
            source.sr,
            source.sw,
            source.ch,
        )
        for token in tokens
    )
amine@179 309
amine@179 310
amine@236 311 def _duration_to_nb_windows(
amine@236 312 duration, analysis_window, round_fn=round, epsilon=0
amine@236 313 ):
amine@179 314 """
amine@215 315 Converts a given duration into a positive integer of analysis windows.
amine@179 316 if `duration / analysis_window` is not an integer, the result will be
amine@179 317 rounded to the closest bigger integer. If `duration == 0`, returns `0`.
amine@215 318 If `duration < analysis_window`, returns 1.
amine@179 319 `duration` and `analysis_window` can be in seconds or milliseconds but
amine@179 320 must be in the same unit.
amine@179 321
amine@351 322 Parameters
amine@351 323 ----------
amine@351 324 duration : float
amine@232 325 a given duration in seconds or ms.
amine@179 326 analysis_window: float
amine@232 327 size of analysis window, in the same unit as `duration`.
amine@351 328 round_fn : callable
amine@232 329 function called to round the result. Default: `round`.
amine@351 330 epsilon : float
amine@232 331 small value to add to the division result before rounding.
amine@232 332 E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
amine@232 333 `round_fn=math.floor` returns `2` instead of `3`. Adding a small value
amine@232 334 to `0.3 / 0.1` avoids this error.
amine@179 335
amine@351 336 Returns
amine@351 337 -------
amine@368 338 nb_windows : int
amine@179 339 minimum number of `analysis_window`'s to cover `durartion`. That means
amine@179 340 that `analysis_window * nb_windows >= duration`.
amine@179 341 """
amine@215 342 if duration < 0 or analysis_window <= 0:
amine@215 343 err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
amine@215 344 raise ValueError(err_msg.format(duration, analysis_window))
amine@179 345 if duration == 0:
amine@179 346 return 0
amine@232 347 return int(round_fn(duration / analysis_window + epsilon))
amine@179 348
amine@179 349
def _make_audio_region(
    data_frames,
    start_frame,
    frame_duration,
    sampling_rate,
    sample_width,
    channels,
):
    """
    Helper function to create an `AudioRegion` from the parameters returned
    by a tokenization object. The frames are concatenated into one bytes
    object and the region's `start` is derived from the index of its first
    frame.

    Parameters
    ----------
    data_frames : iterable of bytes
        audio frames (one per analysis window) that make up the region.
    start_frame : int
        index of the first analysis window
    frame_duration: float
        duration of analysis window in seconds
    sampling_rate : int
        sampling rate of audio data
    sample_width : int
        number of bytes of one audio sample
    channels : int
        number of channels of audio data

    Returns
    -------
    audio_region : AudioRegion
        AudioRegion whose start time is calculated as:
        `start_frame * frame_duration` (i.e., in the same unit as
        `frame_duration`).
    """
    start = start_frame * frame_duration
    data = b"".join(data_frames)
    return AudioRegion(data, sampling_rate, sample_width, channels, start)
amine@81 385
amine@81 386
def _read_chunks_online(max_read, **kwargs):
    """
    Helper function to read audio data from an online blocking source
    (i.e., microphone). Used to build an `AudioRegion` and can intercept
    KeyboardInterrupt so that reading stops as soon as this exception is
    raised. Makes building `AudioRegion`s on [i]python sessions and jupyter
    notebooks more user friendly.

    Parameters
    ----------
    max_read : float
        maximum amount of data to read in seconds.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)` where `data` is the
        concatenation of all frames read and the remaining items are taken
        from the reader.

    See also
    --------
    `AudioRegion.load`
    """
    reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
    reader.open()
    frames = []
    try:
        while True:
            frame = reader.read()
            if frame is None:
                break
            frames.append(frame)
    except KeyboardInterrupt:
        # Stop data acquisition from microphone when pressing
        # Ctrl+C on a [i]python session or a notebook
        pass
    finally:
        # Always release the input device, including when `read` fails with
        # an error other than KeyboardInterrupt.
        reader.close()
    return (
        b"".join(frames),
        reader.sampling_rate,
        reader.sample_width,
        reader.channels,
    )
amine@308 426
amine@308 427
def _read_offline(input, skip=0, max_read=None, **kwargs):
    """
    Helper function to read audio data from an offline source (i.e., file).
    Used to build `AudioRegion`s.

    Parameters
    ----------
    input : str, bytes
        path to audio file (if str), or a bytes object representing raw audio
        data.
    skip : float, default 0
        amount of data, in seconds, to skip from the beginning of the audio
        source. None or a value <= 0 means: skip nothing.
    max_read : float, default: None
        maximum amount of audio data to read, in seconds. None (default) or
        a negative value means read until end of stream.
    kwargs :
        audio parameters (sampling_rate, sample_width and channels).

    Returns
    -------
    tuple
        `(data, sampling_rate, sample_width, channels)` where `data` is what
        the audio source returned and the remaining items are taken from the
        audio source.

    See also
    --------
    `AudioRegion.load`

    """
    audio_source = get_audio_source(input, **kwargs)
    audio_source.open()
    try:
        if skip is not None and skip > 0:
            # Durations are converted to a number of samples; skipped data
            # is read and discarded.
            skip_samples = round(skip * audio_source.sampling_rate)
            audio_source.read(skip_samples)
        if max_read is not None:
            if max_read < 0:
                max_read = None
            else:
                max_read = round(max_read * audio_source.sampling_rate)
        data = audio_source.read(max_read)
    finally:
        # Close the source even if reading fails so it is not left open.
        audio_source.close()
    return (
        data,
        audio_source.sampling_rate,
        audio_source.sample_width,
        audio_source.channels,
    )
amine@308 469
amine@308 470
amine@228 471 def _check_convert_index(index, types, err_msg):
amine@228 472 if not isinstance(index, slice) or index.step is not None:
amine@228 473 raise TypeError(err_msg)
amine@228 474 start = index.start if index.start is not None else 0
amine@228 475 stop = index.stop
amine@228 476 for index in (start, stop):
amine@228 477 if index is not None and not isinstance(index, types):
amine@228 478 raise TypeError(err_msg)
amine@228 479 return start, stop
amine@228 480
amine@228 481
amine@228 482 class _SecondsView:
amine@351 483 """A class to create a view of `AudioRegion` that can be sliced using
amine@351 484 indices in seconds.
amine@351 485 """
amine@351 486
amine@228 487 def __init__(self, region):
amine@228 488 self._region = region
amine@228 489
amine@228 490 def __getitem__(self, index):
amine@228 491 err_msg = "Slicing AudioRegion by seconds requires indices of type "
amine@228 492 err_msg += "'int' or 'float' without a step (e.g. region.sec[7.5:10])"
amine@228 493 start_s, stop_s = _check_convert_index(index, (int, float), err_msg)
amine@228 494 sr = self._region.sampling_rate
amine@228 495 start_sample = int(start_s * sr)
amine@228 496 stop_sample = None if stop_s is None else round(stop_s * sr)
amine@228 497 return self._region[start_sample:stop_sample]
amine@228 498
amine@245 499 @property
amine@245 500 def len(self):
amine@245 501 """
amine@245 502 Return region duration in seconds.
amine@245 503 """
amine@245 504 return self._region.duration
amine@245 505
amine@228 506
class _MillisView(_SecondsView):
    """A view of an `AudioRegion` that is sliced with indices expressed in
    milliseconds (e.g., `view[500:1500]`).
    """

    def __getitem__(self, index):
        """
        Return the part of the region between `index.start` and `index.stop`
        milliseconds. `index` must be a slice without a step whose bounds
        are `int`, otherwise a `TypeError` is raised.
        """
        err_msg = (
            "Slicing AudioRegion by milliseconds requires indices of type "
            "'int' without a step (e.g. region.ms[500:1500])"
        )
        start_ms, stop_ms = _check_convert_index(index, int, err_msg)
        # Convert to seconds and let the parent view do the actual slicing.
        start_sec = start_ms / 1000
        stop_sec = None if stop_ms is None else stop_ms / 1000
        return super().__getitem__(slice(start_sec, stop_sec))

    def __len__(self):
        """
        Return region duration in milliseconds.
        """
        return round(self._region.duration * 1000)

    @property
    def len(self):
        """
        Return region duration in milliseconds.
        """
        return len(self)
amine@245 535
amine@228 536
amine@244 537 class _AudioRegionMetadata(dict):
amine@387 538 """A class to store `AudioRegion`'s metadata."""
amine@351 539
amine@244 540 def __getattr__(self, name):
amine@411 541 warnings.warn(
amine@411 542 "`AudioRegion.meta` is deprecated and will be removed in future "
amine@411 543 "versions. For the 'start' and 'end' fields, please use "
amine@411 544 "`AudioRegion.start` and `AudioRegion.end`.",
amine@411 545 DeprecationWarning,
amine@411 546 stacklevel=2,
amine@411 547 )
amine@244 548 if name in self:
amine@244 549 return self[name]
amine@244 550 else:
amine@244 551 err_msg = "AudioRegion metadata has no entry '{}'"
amine@244 552 raise AttributeError(err_msg.format(name))
amine@244 553
amine@244 554 def __setattr__(self, name, value):
amine@244 555 self[name] = value
amine@244 556
amine@244 557 def __str__(self):
amine@244 558 return "\n".join("{}: {}".format(k, v) for k, v in self.items())
amine@244 559
amine@244 560 def __repr__(self):
amine@244 561 return str(self)
amine@244 562
amine@244 563
amine@411 564 @dataclass(frozen=True)
amine@81 565 class AudioRegion(object):
amine@368 566 """
amine@368 567 AudioRegion encapsulates raw audio data and provides an interface to
amine@368 568 perform simple operations on it. Use `AudioRegion.load` to build an
amine@368 569 `AudioRegion` from different types of objects.
amine@368 570
amine@368 571 Parameters
amine@368 572 ----------
amine@368 573 data : bytes
amine@368 574 raw audio data as a bytes object
amine@368 575 sampling_rate : int
amine@368 576 sampling rate of audio data
amine@368 577 sample_width : int
amine@368 578 number of bytes of one audio sample
amine@368 579 channels : int
amine@368 580 number of channels of audio data
amine@412 581 start : float, default: None
amine@412 582 optional start time of the region. This is typically provided by the
amine@412 583 `split` function.
amine@368 584
amine@368 585 See also
amine@368 586 --------
amine@368 587 AudioRegion.load
amine@368 588 """
amine@368 589
amine@411 590 data: bytes
amine@411 591 sampling_rate: int
amine@411 592 sample_width: int
amine@411 593 channels: int
amine@411 594 start: float = field(default=None, repr=None)
amine@411 595
amine@411 596 def __post_init__(self):
amine@244 597
amine@411 598 check_audio_data(self.data, self.sample_width, self.channels)
amine@411 599
amine@411 600 object.__setattr__(self, "splitp", self.split_and_plot)
amine@411 601 object.__setattr__(self, "_samples", None)
amine@411 602
amine@411 603 duration = len(self.data) / (
amine@411 604 self.sampling_rate * self.sample_width * self.channels
amine@411 605 )
amine@411 606 object.__setattr__(self, "duration", duration)
amine@411 607
amine@411 608 if self.start is not None:
amine@411 609 object.__setattr__(self, "end", self.start + self.duration)
amine@411 610 object.__setattr__(
amine@411 611 self,
amine@411 612 "meta",
amine@411 613 _AudioRegionMetadata({"start": self.start, "end": self.end}),
amine@411 614 )
amine@411 615 else:
amine@411 616 object.__setattr__(self, "end", None)
amine@411 617 object.__setattr__(self, "meta", None)
amine@411 618
amine@411 619 # `seconds` and `millis` are defined below as @property with docstring
amine@411 620 object.__setattr__(self, "_seconds_view", _SecondsView(self))
amine@411 621 object.__setattr__(self, "_millis_view", _MillisView(self))
amine@411 622
amine@411 623 object.__setattr__(self, "sec", self.seconds)
amine@411 624 object.__setattr__(self, "s", self.seconds)
amine@411 625 object.__setattr__(self, "ms", self.millis)
amine@244 626
amine@239 627 @classmethod
amine@307 628 def load(cls, input, skip=0, max_read=None, **kwargs):
amine@351 629 """
amine@373 630 Create an `AudioRegion` by loading data from `input`. See :func:`load`
amine@373 631 for parameters description.
amine@351 632
amine@351 633 Returns
amine@351 634 -------
amine@351 635 region: AudioRegion
amine@351 636
amine@351 637 Raises
amine@351 638 ------
amine@368 639 ValueError
amine@368 640 raised if `input` is None and `skip` != 0 or `max_read` is None.
amine@351 641 """
amine@308 642 if input is None:
amine@351 643 if skip > 0:
amine@351 644 raise ValueError(
amine@351 645 "'skip' should be 0 when reading from microphone"
amine@351 646 )
amine@308 647 if max_read is None or max_read < 0:
amine@308 648 raise ValueError(
amine@333 649 "'max_read' should not be None when reading from "
amine@333 650 "microphone"
amine@308 651 )
amine@308 652 data, sampling_rate, sample_width, channels = _read_chunks_online(
amine@308 653 max_read, **kwargs
amine@307 654 )
amine@239 655 else:
amine@308 656 data, sampling_rate, sample_width, channels = _read_offline(
amine@308 657 input, skip=skip, max_read=max_read, **kwargs
amine@308 658 )
amine@308 659
amine@308 660 return cls(data, sampling_rate, sample_width, channels)
amine@239 661
amine@228 662 @property
amine@351 663 def seconds(self):
amine@373 664 """
amine@404 665 A view to slice audio region by seconds using
amine@404 666 ``region.seconds[start:end]``.
amine@373 667 """
amine@228 668 return self._seconds_view
amine@228 669
amine@228 670 @property
amine@228 671 def millis(self):
amine@404 672 """A view to slice audio region by milliseconds using
amine@404 673 ``region.millis[start:end]``."""
amine@228 674 return self._millis_view
amine@228 675
amine@81 676 @property
amine@81 677 def sr(self):
leminhnguyen@395 678 """Sampling rate of audio data, alias for `sampling_rate`."""
amine@411 679 return self.sampling_rate
amine@81 680
amine@81 681 @property
amine@81 682 def sw(self):
amine@411 683 """Number of bytes per sample, alias for `sample_width`."""
amine@411 684 return self.sample_width
amine@81 685
amine@81 686 @property
amine@81 687 def ch(self):
amine@387 688 """Number of channels of audio data, alias for `channels`."""
amine@411 689 return self.channels
amine@2 690
amine@270 691 def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
amine@351 692 """
amine@351 693 Play audio region.
amine@201 694
amine@351 695 Parameters
amine@351 696 ----------
amine@351 697 progress_bar : bool, default: False
amine@351 698 whether to use a progress bar while playing audio. Default: False.
amine@351 699 `progress_bar` requires `tqdm`, if not installed, no progress bar
amine@351 700 will be shown.
amine@351 701 player : AudioPlayer, default: None
amine@351 702 audio player to use. if None (default), use `player_for()`
amine@201 703 to get a new audio player.
amine@351 704 progress_bar_kwargs : kwargs
amine@351 705 keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
amine@368 706 use `leave=False` to clean up the screen when play finishes).
amine@201 707 """
amine@199 708 if player is None:
amine@199 709 player = player_for(self)
amine@411 710 player.play(self.data, progress_bar=progress_bar, **progress_bar_kwargs)
amine@199 711
amine@411 712 def save(
amine@411 713 self, filename, audio_format=None, exists_ok=True, **audio_parameters
amine@411 714 ):
amine@351 715 """
amine@351 716 Save audio region to file.
amine@187 717
amine@351 718 Parameters
amine@351 719 ----------
amine@411 720 filename : str, Path
amine@411 721 path to output audio file. If of type `str`, it may contain a
amine@411 722 `{start}`, `{end}` and a `{duration}` placeholders.
amine@411 723 Regions returned by `split` contain `start` and `end`
amine@411 724 attributes that can be used to build output file name as in the
amine@411 725 example.
amine@362 726 audio_format : str, default: None
amine@351 727 format used to save audio data. If None (default), format is guessed
amine@351 728 from file name's extension. If file name has no extension, audio
amine@351 729 data is saved as a raw (headerless) audio file.
amine@351 730 exists_ok : bool, default: True
amine@368 731 If True, overwrite `file` if a file with the same name exists.
amine@368 732 If False, raise an `IOError` if `file` exists.
amine@351 733 audio_parameters: dict
amine@351 734 any keyword arguments to be passed to audio saving backend.
amine@187 735
amine@351 736 Returns
amine@351 737 -------
amine@351 738 file: str
amine@411 739 name of output file with filled placehoders.
amine@351 740 Raises
amine@411 741 IOError if `filename` exists and `exists_ok` is False.
amine@187 742
amine@368 743
amine@368 744 Examples
amine@368 745 --------
amine@411 746 Create and AudioRegion, explicitly passing a value for `start`. `end`
amine@411 747 will be computed based on `start` and the region's duration.
amine@411 748
amine@411 749 >>> region = AudioRegion(b'\0' * 2 * 24000,
amine@368 750 >>> sampling_rate=16000,
amine@368 751 >>> sample_width=2,
amine@411 752 >>> channels=1,
amine@411 753 >>> start=2.25)
amine@411 754 >>> region
amine@411 755 <AudioRegion(duration=1.500, sampling_rate=16000, sample_width=2, channels=1)>
amine@411 756
amine@411 757 >>> assert region.end == 3.75
amine@411 758 >>> assert region.save('audio_{start}-{end}.wav') == "audio_2.25-3.75.wav"
amine@411 759 >>> filename = region.save('audio_{start:.3f}-{end:.3f}_{duration:.3f}.wav')
amine@411 760 >>> assert filename == "audio_2.250-3.750_1.500.wav"
amine@187 761 """
amine@411 762 if isinstance(filename, Path):
amine@411 763 if not exists_ok and filename.exists():
amine@411 764 raise FileExistsError(
amine@411 765 "file '{filename}' exists".format(filename=str(filename))
amine@411 766 )
amine@411 767 if isinstance(filename, str):
amine@411 768 filename = filename.format(
amine@411 769 duration=self.duration,
amine@411 770 meta=self.meta,
amine@411 771 start=self.start,
amine@411 772 end=self.end,
amine@411 773 )
amine@411 774 if not exists_ok and os.path.exists(filename):
amine@411 775 raise FileExistsError(
amine@411 776 "file '{filename}' exists".format(filename=filename)
amine@411 777 )
amine@187 778 to_file(
amine@411 779 self.data,
amine@411 780 filename,
amine@351 781 audio_format,
amine@187 782 sr=self.sr,
amine@187 783 sw=self.sw,
amine@187 784 ch=self.ch,
amine@195 785 audio_parameters=audio_parameters,
amine@187 786 )
amine@411 787 return filename
amine@187 788
def split(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    **kwargs,
):
    """Split audio region. See :func:`auditok.split()` for a comprehensive
    description of split parameters.
    See Also :meth:`AudioRegion.split_and_plot`.

    Raises
    ------
    RuntimeWarning
        if `max_read` (or its alias `mr`) is passed with a non-None value;
        the region should be sliced beforehand instead.
    """
    max_read = kwargs.get("max_read", kwargs.get("mr"))
    if max_read is not None:
        raise RuntimeWarning(
            "'max_read' (or 'mr') should not be used with "
            "AudioRegion.split_and_plot(). You should rather "
            "slice audio region before calling this method"
        )
    # Delegates to the module-level `split` function, with this region as
    # the input.
    return split(
        self,
        min_dur=min_dur,
        max_dur=max_dur,
        max_silence=max_silence,
        drop_trailing_silence=drop_trailing_silence,
        strict_min_dur=strict_min_dur,
        **kwargs,
    )
amine@248 816
def plot(
    self,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
):
    """Plot audio region using one sub-plot per each channel.

    Parameters
    ----------
    scale_signal : bool, default: True
        if true, scale signal by subtracting its mean and dividing by its
        standard deviation before plotting.
    show : bool
        whether to show plotted signal right after the call.
    figsize : tuple, default: None
        width and height of the figure to pass to `matplotlib`.
    save_as : str, default None.
        if provided, also save plot to file.
    dpi : int, default: 120
        plot dpi to pass to `matplotlib`.
    theme : str or dict, default: "auditok"
        plot theme to use. Currently only "auditok" theme is implemented. To
        provide your own theme see
        :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
    """
    plot_options = {
        "scale_signal": scale_signal,
        "show": show,
        "figsize": figsize,
        "save_as": save_as,
        "dpi": dpi,
        "theme": theme,
    }
    # Delegates to the module-level `plot` function; nothing is returned.
    plot(self, **plot_options)
amine@250 854
def split_and_plot(
    self,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_dur=False,
    scale_signal=True,
    show=True,
    figsize=None,
    save_as=None,
    dpi=120,
    theme="auditok",
    **kwargs,
):
    """Split region and plot signal and detections. Alias: :meth:`splitp`.
    See :func:`auditok.split()` for a comprehensive description of split
    parameters. Also see :meth:`plot` for plot parameters.

    Returns
    -------
    regions : list
        the regions produced by :meth:`split`, materialized as a list.
    """
    # Materialize detections so they can be both plotted and returned.
    regions = list(
        self.split(
            min_dur=min_dur,
            max_dur=max_dur,
            max_silence=max_silence,
            drop_trailing_silence=drop_trailing_silence,
            strict_min_dur=strict_min_dur,
            **kwargs,
        )
    )
    detections = ((region.meta.start, region.meta.end) for region in regions)

    # `energy_threshold` takes precedence over its alias `eth`.
    if "energy_threshold" in kwargs:
        eth = kwargs["energy_threshold"]
    else:
        eth = kwargs.get("eth", DEFAULT_ENERGY_THRESHOLD)

    plot(
        self,
        scale_signal=scale_signal,
        detections=detections,
        energy_threshold=eth,
        show=show,
        figsize=figsize,
        save_as=save_as,
        dpi=dpi,
        theme=theme,
    )
    return regions
amine@250 899
@property
def samples(self):
    """Audio region as arrays of samples, one array per channel.

    Returns `self._samples` when it is already populated; otherwise the
    samples are converted from `data` with `signal.to_array`.
    """
    precomputed = self._samples
    if precomputed is not None:
        # The previous code fell through to an unbound local here.
        return precomputed
    # NOTE(review): the result is not stored back on the instance because
    # it is not visible from here whether the instance allows attribute
    # assignment — confirm before adding caching.
    return signal.to_array(self.data, self.sample_width, self.channels)
amine@246 908
amine@408 909 def __array__(self):
amine@408 910 return self.samples
amine@408 911
def numpy(self):
    """Return the region's samples (same value as the `samples` property)."""
    as_array = self.samples
    return as_array
amine@408 914
amine@82 915 def __len__(self):
amine@85 916 """
amine@245 917 Return region length in number of samples.
amine@85 918 """
amine@411 919 return len(self.data) // (self.sample_width * self.channels)
amine@245 920
@property
def len(self):
    """
    Region length in number of samples, i.e. the same value as `len(region)`.
    """
    n_samples = len(self)
    return n_samples
amine@82 927
amine@83 928 def __bytes__(self):
amine@411 929 return self.data
amine@83 930
amine@244 931 def __str__(self):
amine@178 932 return (
amine@244 933 "AudioRegion(duration={:.3f}, "
amine@178 934 "sampling_rate={}, sample_width={}, channels={})".format(
amine@244 935 self.duration, self.sr, self.sw, self.ch
amine@178 936 )
amine@178 937 )
amine@83 938
amine@244 939 def __repr__(self):
amine@409 940 return "<{}>".format(str(self))
amine@83 941
def __add__(self, other):
    """
    Concatenate this region and `other` and return a new region.

    Both regions must have the same sampling rate, sample width and number
    of channels.

    Raises
    ------
    TypeError
        if `other` is not an `AudioRegion`.
    ValueError
        if the audio parameters of the two regions differ.
    """
    if not isinstance(other, AudioRegion):
        raise TypeError(
            "Can only concatenate AudioRegion, "
            'not "{}"'.format(type(other))
        )

    # Checked in this order: sampling rate, sample width, channels.
    compared = (
        ("sr", "sampling rate"),
        ("sw", "sample width"),
        ("ch", "number of channels"),
    )
    for attr, label in compared:
        mine = getattr(self, attr)
        theirs = getattr(other, attr)
        if theirs != mine:
            raise ValueError(
                "Can only concatenate AudioRegions of the same "
                "{} ({} != {})".format(label, mine, theirs)
            )

    joined = self.data + other.data
    return AudioRegion(joined, self.sr, self.sw, self.ch)
amine@87 970
amine@87 971 def __radd__(self, other):
amine@87 972 """
amine@87 973 Concatenates `other` and this region. `other` should be an
amine@87 974 `AudioRegion` with the same audio parameters as this region
amine@87 975 but can exceptionally be `0` to make it possible to concatenate
amine@87 976 many regions with `sum`.
amine@87 977 """
amine@87 978 if other == 0:
amine@87 979 return self
amine@87 980 return other.add(self)
amine@87 981
amine@195 982 def __mul__(self, n):
amine@195 983 if not isinstance(n, int):
amine@195 984 err_msg = "Can't multiply AudioRegion by a non-int of type '{}'"
amine@195 985 raise TypeError(err_msg.format(type(n)))
amine@411 986 data = self.data * n
amine@244 987 return AudioRegion(data, self.sr, self.sw, self.ch)
amine@195 988
amine@195 989 def __rmul__(self, n):
amine@195 990 return self * n
amine@195 991
amine@247 992 def __truediv__(self, n):
amine@247 993 if not isinstance(n, int) or n <= 0:
amine@387 994 raise TypeError("AudioRegion can only be divided by a positive int")
amine@253 995 samples_per_sub_region, rest = divmod(len(self), n)
amine@253 996 onset = 0
amine@247 997 sub_regions = []
amine@253 998 while onset < len(self):
amine@253 999 offset = 0
amine@253 1000 if rest > 0:
amine@253 1001 offset = 1
amine@253 1002 rest -= 1
amine@253 1003 offset += onset + samples_per_sub_region
amine@253 1004 sub_regions.append(self[onset:offset])
amine@253 1005 onset = offset
amine@247 1006 return sub_regions
amine@247 1007
amine@198 1008 def __eq__(self, other):
amine@198 1009 if other is self:
amine@198 1010 return True
amine@198 1011 if not isinstance(other, AudioRegion):
amine@198 1012 return False
amine@198 1013 return (
amine@411 1014 (self.data == other.data)
amine@198 1015 and (self.sr == other.sr)
amine@198 1016 and (self.sw == other.sw)
amine@198 1017 and (self.ch == other.ch)
amine@198 1018 )
amine@198 1019
def __getitem__(self, index):
    """
    Slice the region by sample indices and return a new `AudioRegion`.

    Negative bounds count from the end of the region and are clamped to 0
    when they reach past its beginning. An open-ended stop
    (`region[start:]`) extends to the end of the data.

    `index` is validated by `_check_convert_index`, which is expected to
    raise with `err_msg` for unsupported indices.
    NOTE(review): presumably it only accepts step-less slices with `int`
    bounds and returns `(start, stop)` — confirm against the helper.
    """
    err_msg = "Slicing AudioRegion by samples requires indices of type "
    err_msg += "'int' without a step (e.g. region.sec[1600:3200])"
    start_sample, stop_sample = _check_convert_index(index, (int), err_msg)

    bytes_per_sample = self.sample_width * self.channels
    len_samples = len(self.data) // bytes_per_sample

    if start_sample < 0:
        start_sample = max(start_sample + len_samples, 0)
    onset = start_sample * bytes_per_sample

    if stop_sample is None:
        offset = None
    else:
        if stop_sample < 0:
            stop_sample = max(stop_sample + len_samples, 0)
        # Use the normalized stop (the previous code recomputed the
        # offset from the raw `index.stop`, discarding the clamp above).
        offset = stop_sample * bytes_per_sample

    data = self.data[onset:offset]
    return AudioRegion(data, self.sr, self.sw, self.ch)
amine@188 1041
amine@2 1042
class StreamTokenizer:
    """
    Class for stream tokenizers. It implements a 4-state automaton scheme
    to extract sub-sequences of interest on the fly.

    Parameters
    ----------
    validator : callable, DataValidator (must implement `is_valid`)
        called with each data frame read from source. Should take one
        positional argument and return True or False for valid and invalid
        frames respectively.

    min_length : int
        Minimum number of frames of a valid token. This includes all
        tolerated non valid frames within the token.

    max_length : int
        Maximum number of frames of a valid token. This includes all
        tolerated non valid frames within the token.

    max_continuous_silence : int
        Maximum number of consecutive non-valid frames within a token.
        Note that, within a valid token, there may be many tolerated
        *silent* regions that contain each a number of non valid frames up
        to `max_continuous_silence`.

    init_min : int
        Minimum number of consecutive valid frames that must be
        **initially** gathered before any sequence of non valid frames can
        be tolerated. This option is not always needed, it can be used to
        drop non-valid tokens as early as possible. **Default = 0** means
        that the option is by default ineffective.

    init_max_silence : int
        Maximum number of tolerated consecutive non-valid frames if the
        number of already gathered valid frames has not yet reached
        `init_min`. This argument is normally used if `init_min` is used.
        **Default = 0**, by default this argument is not taken into
        consideration.

    mode : int
        mode can be one of the following:

        - `StreamTokenizer.NORMAL` (0): do not drop trailing silence, and
          accept a token shorter than `min_length` if it is the continuation
          of the latest delivered token.

        - `StreamTokenizer.STRICT_MIN_LENGTH` (2): if token `i` is delivered
          because `max_length` is reached, and token `i+1` is immediately
          adjacent to token `i` (i.e. token `i` ends at frame `k` and token
          `i+1` starts at frame `k+1`) then accept token `i+1` only if it has
          a size of at least `min_length`. The default behavior is to accept
          token `i+1` even if it is shorter than `min_length` (provided that
          the above conditions are fulfilled of course).

        - `StreamTokenizer.DROP_TRAILING_SILENCE` (4): drop all trailing
          non-valid frames from a token to be delivered if and only if it
          is not **truncated**. This can be a bit tricky. A token is actually
          delivered if:

          - `max_continuous_silence` is reached.

          - Its length reaches `max_length`. This is referred to as a
            **truncated** token.

          In the current implementation, a `StreamTokenizer`'s decision is
          only based on already seen data and on incoming data. Thus, if a
          token is truncated at a non-valid but tolerated frame (`max_length`
          is reached but `max_continuous_silence` not yet) any trailing
          silence will be kept because it can potentially be part of valid
          token (if `max_length` was bigger). But if `max_continuous_silence`
          is reached before `max_length`, the delivered token will not be
          considered as truncated but a result of *normal* end of detection
          (i.e. no more valid data). In that case the trailing silence can be
          removed if you use the `StreamTokenizer.DROP_TRAILING_SILENCE`
          mode.

        - `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`: # noqa: B950
          use both options. That means: first remove trailing silence, then
          check if the token still has a length of at least `min_length`.

    Raises
    ------
    TypeError
        if `validator` is neither a callable nor a `DataValidator`.
    ValueError
        if `max_length`, `min_length`, `max_continuous_silence`, `init_min`
        or `mode` has an invalid value.

    Examples
    --------

    In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
    accepted although it is shorter than `min_length` (3), because it
    immediately follows the latest delivered token:

    >>> from auditok.core import StreamTokenizer
    >>> from auditok.util import StringDataSource, DataValidator

    >>> class UpperCaseChecker(DataValidator):
    ...     def is_valid(self, frame):
    ...         return frame.isupper()
    >>> dsource = StringDataSource("aaaAAAABBbbb")
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3,
    ...                             max_length=4,
    ...                             max_continuous_silence=0)
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]


    The following tokenizer will however reject the 'BB' token:

    >>> dsource = StringDataSource("aaaAAAABBbbb")
    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
    ...                             min_length=3, max_length=4,
    ...                             max_continuous_silence=0,
    ...                             mode=StreamTokenizer.STRICT_MIN_LENGTH)
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'A'], 3, 6)]



    >>> tokenizer = StreamTokenizer(
    ...     validator=UpperCaseChecker(),
    ...     min_length=3,
    ...     max_length=6,
    ...     max_continuous_silence=3,
    ...     mode=StreamTokenizer.DROP_TRAILING_SILENCE
    ... )
    >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
    >>> tokenizer.tokenize(dsource)
    [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]

    The first token is delivered with its trailing silence because it is
    truncated while the second one has its trailing frames removed.

    Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:

    .. code:: python

        [
            (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
            (['B', 'B', 'b', 'b', 'b'], 9, 13)
        ]

    """

    # Automaton states.
    SILENCE = 0
    POSSIBLE_SILENCE = 1
    POSSIBLE_NOISE = 2
    NOISE = 3

    # Mode flags (STRICT_MIN_LENGTH and DROP_TRAILING_SILENCE can be OR-ed).
    NORMAL = 0
    STRICT_MIN_LENGTH = 2
    DROP_TRAILING_SILENCE = 4

    def __init__(
        self,
        validator,
        min_length,
        max_length,
        max_continuous_silence,
        init_min=0,
        init_max_silence=0,
        mode=0,
    ):
        if callable(validator):
            self._is_valid = validator
        elif isinstance(validator, DataValidator):
            self._is_valid = validator.is_valid
        else:
            raise TypeError(
                "'validator' must be a callable or an instance of "
                "DataValidator"
            )

        if max_length <= 0:
            raise ValueError(
                "'max_length' must be > 0 (value={0})".format(max_length)
            )

        if min_length <= 0 or min_length > max_length:
            err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})"
            raise ValueError(err_msg.format(min_length))

        if max_continuous_silence >= max_length:
            err_msg = "'max_continuous_silence' must be < 'max_length' "
            err_msg += "(value={0})"
            raise ValueError(err_msg.format(max_continuous_silence))

        if init_min >= max_length:
            # Report the offending `init_min` value (the message used to
            # show `max_continuous_silence` by mistake).
            raise ValueError(
                "'init_min' must be < 'max_length' (value={0})".format(
                    init_min
                )
            )

        self.validator = validator
        self.min_length = min_length
        self.max_length = max_length
        self.max_continuous_silence = max_continuous_silence
        self.init_min = init_min
        self.init_max_silent = init_max_silence
        self._set_mode(mode)
        self._deliver = None
        self._tokens = None
        self._state = None
        self._data = None
        self._contiguous_token = False
        self._init_count = 0
        self._silence_length = 0
        self._start_frame = 0
        self._current_frame = 0

    def _set_mode(self, mode):
        """Validate `mode` and derive the two boolean option flags from it."""
        strict_min_and_drop_trailing = StreamTokenizer.STRICT_MIN_LENGTH
        strict_min_and_drop_trailing |= StreamTokenizer.DROP_TRAILING_SILENCE
        if mode not in [
            StreamTokenizer.NORMAL,
            StreamTokenizer.STRICT_MIN_LENGTH,
            StreamTokenizer.DROP_TRAILING_SILENCE,
            strict_min_and_drop_trailing,
        ]:
            raise ValueError("Wrong value for mode")
        self._mode = mode
        self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
        self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0

    def _reinitialize(self):
        """Reset the automaton before tokenizing a new data source."""
        self._contiguous_token = False
        self._data = []
        self._tokens = []
        self._state = self.SILENCE
        # -1 so that the first frame read gets index 0.
        self._current_frame = -1
        self._deliver = self._append_token

    def tokenize(self, data_source, callback=None, generator=False):
        """
        Read data from `data_source`, one frame a time, and process the read
        frames in order to detect sequences of frames that make up valid
        tokens.

        :Parameters:
           `data_source` : instance of the :class:`DataSource` class that
               implements a `read` method. 'read' should return a slice of
               signal, i.e. frame (of whatever type as long as it can be
               processed by validator) and None if there is no more signal.

           `callback` : an optional 3-argument function.
              If a `callback` function is given, it will be called each time
              a valid token is found, with `(data, start, end)` as
              arguments.

           `generator` : bool, default False.
              If True (and no `callback` is given), return a generator that
              yields tokens as they are detected instead of a list.

        :Returns:
           None if `callback` is given. Otherwise a list of tokens, or a
           generator of tokens if `generator` is True. Each token is a tuple
           with the following elements:

            .. code:: python

                (data, start, end)

           where `data` is a list of read frames, `start`: index of the first
           frame in the original data and `end` : index of the last frame.
        """
        token_gen = self._iter_tokens(data_source)
        if callback:
            for token in token_gen:
                callback(*token)
            return
        if generator:
            return token_gen
        return list(token_gen)

    def _iter_tokens(self, data_source):
        """Yield tokens detected while reading `data_source` until it
        returns None."""
        self._reinitialize()
        while True:
            frame = data_source.read()
            self._current_frame += 1
            if frame is None:
                # End of stream: deliver a pending token, if any.
                token = self._post_process()
                if token is not None:
                    yield token
                break
            token = self._process(frame)
            if token is not None:
                yield token

    def _process(self, frame):  # noqa: C901
        """Feed one frame to the automaton; return a token if this frame
        ends one, None otherwise."""

        frame_is_valid = self._is_valid(frame)

        if self._state == self.SILENCE:

            if frame_is_valid:
                # seems we got a valid frame after a silence
                self._init_count = 1
                self._silence_length = 0
                self._start_frame = self._current_frame
                self._data.append(frame)

                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        return self._process_end_of_detection(True)
                else:
                    self._state = self.POSSIBLE_NOISE

        elif self._state == self.POSSIBLE_NOISE:

            if frame_is_valid:
                self._silence_length = 0
                self._init_count += 1
                self._data.append(frame)
                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        return self._process_end_of_detection(True)

            else:
                self._silence_length += 1
                if (
                    self._silence_length > self.init_max_silent
                    or len(self._data) + 1 >= self.max_length
                ):
                    # either init_max_silent or max_length is reached
                    # before _init_count, back to silence
                    self._data = []
                    self._state = self.SILENCE
                else:
                    self._data.append(frame)

        elif self._state == self.NOISE:

            if frame_is_valid:
                self._data.append(frame)
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)

            elif self.max_continuous_silence <= 0:
                # max token reached at this frame will _deliver if
                # _contiguous_token and not _strict_min_length
                self._state = self.SILENCE
                return self._process_end_of_detection()
            else:
                # this is the first silent frame following a valid one
                # and it is tolerated
                self._silence_length = 1
                self._data.append(frame)
                self._state = self.POSSIBLE_SILENCE
                if len(self._data) == self.max_length:
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames
                    return self._process_end_of_detection(True)

        elif self._state == self.POSSIBLE_SILENCE:

            if frame_is_valid:
                self._data.append(frame)
                self._silence_length = 0
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    return self._process_end_of_detection(True)

            else:
                if self._silence_length >= self.max_continuous_silence:
                    self._state = self.SILENCE
                    if self._silence_length < len(self._data):
                        # _deliver only gathered frames aren't all silent
                        return self._process_end_of_detection()
                    self._data = []
                    self._silence_length = 0
                else:
                    self._data.append(frame)
                    self._silence_length += 1
                    if len(self._data) >= self.max_length:
                        # don't reset _silence_length because we still
                        # need to know the total number of silent frames
                        return self._process_end_of_detection(True)

    def _post_process(self):
        """Called at end of stream: deliver the pending token if it contains
        at least one non-silent frame."""
        if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
            if len(self._data) > 0 and len(self._data) > self._silence_length:
                return self._process_end_of_detection()

    def _process_end_of_detection(self, truncated=False):
        """Finalize the current detection.

        Return a `(data, start_frame, end_frame)` token if the gathered
        frames make up an acceptable token, None otherwise. `truncated`
        tells that detection ended because `max_length` was reached.
        """

        if (
            not truncated
            and self._drop_trailing_silence
            and self._silence_length > 0
        ):
            # happens if max_continuous_silence is reached
            # or max_length is reached at a silent frame
            self._data = self._data[0 : -self._silence_length]

        if (len(self._data) >= self.min_length) or (
            len(self._data) > 0
            and not self._strict_min_length
            and self._contiguous_token
        ):

            start_frame = self._start_frame
            end_frame = self._start_frame + len(self._data) - 1
            data = self._data
            self._data = []
            token = (data, start_frame, end_frame)

            if truncated:
                # next token (if any) will start at _current_frame + 1
                self._start_frame = self._current_frame + 1
                # remember that it is contiguous with the just delivered one
                self._contiguous_token = True
            else:
                self._contiguous_token = False
            return token
        else:
            self._contiguous_token = False

        self._data = []

    def _append_token(self, data, start, end):
        """Store a token in the internal `_tokens` list."""
        self._tokens.append((data, start, end))