# HG changeset patch # User Amine Sehili # Date 1729344103 -7200 # Node ID 12a7f01c633b4ee35832107bc3a3edad8fb2ca9c # Parent 88e99cfd9c4ce07e0e9af8e4d9c6f5695e5c3c32 Update docstrings diff -r 88e99cfd9c4c -r 12a7f01c633b auditok/core.py --- a/auditok/core.py Sat Oct 19 14:13:28 2024 +0200 +++ b/auditok/core.py Sat Oct 19 15:21:43 2024 +0200 @@ -4,6 +4,8 @@ load split + make_silence + split_and_join_with_silence AudioRegion StreamTokenizer """ @@ -40,59 +42,65 @@ def load(input, skip=0, max_read=None, **kwargs): - """Load audio data from a source and return it as an :class:`AudioRegion`. + """ + Load audio data from a specified source and return it as an + :class:`AudioRegion`. Parameters ---------- - input : None, str, bytes, AudioSource - source to read audio data from. If `str`, it should be a path to a - valid audio file. If `bytes`, it is used as raw audio data. If it is - "-", raw data will be read from stdin. If None, read audio data from - the microphone using PyAudio. If of type `bytes` or is a path to a - raw audio file then `sampling_rate`, `sample_width` and `channels` - parameters (or their alias) are required. If it's an - :class:`AudioSource` object it's used directly to read data. + input : None, str, Path, bytes, AudioSource + The source from which to read audio data. If a `str` or `Path`, it + should specify the path to a valid audio file. If `bytes`, it is + treated as raw audio data. If set to "-", raw data will be read from + standard input (stdin). If `None`, audio data is read from the + microphone using PyAudio. For `bytes` data or a raw audio file path, + `sampling_rate`, `sample_width`, and `channels` parameters (or their + aliases) must be specified. If an :class:`AudioSource` object is + provided, it is used directly to read data. skip : float, default: 0 - amount, in seconds, of audio data to skip from source. If read from - a microphone, `skip` must be 0, otherwise a `ValueError` is raised. 
+ Duration in seconds of audio data to skip from the beginning of the + source. When reading from a microphone, `skip` must be 0; otherwise, + a `ValueError` is raised. max_read : float, default: None - amount, in seconds, of audio data to read from source. If read from - microphone, `max_read` should not be None, otherwise a `ValueError` is - raised. + Duration in seconds of audio data to read from the source. When reading + from the microphone, `max_read` must not be `None`; otherwise, a + `ValueError` is raised. audio_format, fmt : str - type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only - be used if `input` is a string path to an audio file. If not given, - audio type will be guessed from file name extension or from file + Format of the audio data (e.g., wav, ogg, flac, raw, etc.). This is + only used if `input` is a string path to an audio file. If not + provided, the audio format is inferred from the file's extension or header. sampling_rate, sr : int - sampling rate of audio data. Required if `input` is a raw audio file, - a `bytes` object or None (i.e., read from microphone). + Sampling rate of the audio data. Required if `input` is a raw audio + file, a `bytes` object, or `None` (i.e., when reading from the + microphone). sample_width, sw : int - number of bytes used to encode one audio sample, typically 1, 2 or 4. - Required for raw data, see `sampling_rate`. + Number of bytes used to encode a single audio sample, typically 1, 2, + or 4. Required for raw audio data; see `sampling_rate`. channels, ch : int - number of channels of audio data. Required for raw data, see - `sampling_rate`. + Number of channels in the audio data. Required for raw audio data; + see `sampling_rate`. 
large_file : bool, default: False - If True, AND if `input` is a path to a *wav* of a *raw* audio file - (and **only** these two formats) then audio file is not fully loaded to - memory in order to create the region (but the portion of data needed to - create the region is of course loaded to memory). Set to True if - `max_read` is significantly smaller then the size of a large audio file - that shouldn't be entirely loaded to memory. + If `True`, and `input` is a path to a *wav* or *raw* audio file, the + file is not fully loaded into memory to create the region (only the + necessary portion of data is loaded). This should be set to `True` + when `max_read` is much smaller than the total size of a large audio + file, to avoid loading the entire file into memory. Returns ------- - region: AudioRegion + region : AudioRegion Raises ------ ValueError - raised if `input` is None (i.e., read data from microphone) and `skip` - != 0 or `input` is None `max_read` is None (meaning that when reading - from the microphone, no data should be skipped, and maximum amount of - data to read should be explicitly provided). + Raised if `input` is `None` (i.e., reading from the microphone) and + `skip` is not 0, or if `max_read` is `None` when `input` is `None`. + This ensures that when reading from the microphone, no data is + skipped, and the maximum amount of data to read is explicitly + specified. """ + return AudioRegion.load(input, skip, max_read, **kwargs) @@ -106,109 +114,105 @@ **kwargs, ): """ - Split audio data and return a generator of AudioRegions + Split audio data and return a generator of :class:`AudioRegion` objects. Parameters ---------- - input : str, bytes, AudioSource, AudioReader, AudioRegion or None - input audio data. If str, it should be a path to an existing audio file. - "-" is interpreted as standard input. If bytes, input is considered as - raw audio data. If None, read audio from microphone. 
- Every object that is not an `AudioReader` will be transformed into an - `AudioReader` before processing. If it is an `str` that refers to a raw - audio file, `bytes` or None, audio parameters should be provided using - kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or their - alias). - If `input` is str then audio format will be guessed from file extension. - `audio_format` (alias `fmt`) kwarg can also be given to specify audio - format explicitly. If none of these options is available, rely on - backend (currently only pydub is supported) to load data. - min_dur : float, default: 0.2 - minimum duration in seconds of a detected audio event. By using large - values for `min_dur`, very short audio events (e.g., very short 1-word - utterances like 'yes' or 'no') can be mis detected. Using a very small - value may result in a high number of too short audio events. - max_dur : float, default: 5 - maximum duration in seconds of a detected audio event. If an audio event - lasts more than `max_dur` it will be truncated. If the continuation of a - truncated audio event is shorter than `min_dur` then this continuation - is accepted as a valid audio event if `strict_min_dur` is False. - Otherwise it is rejected. - max_silence : float, default: 0.3 - maximum duration of continuous silence within an audio event. There - might be many silent gaps of this duration within one audio event. If - the continuous silence happens at the end of the event than it's kept as - part of the event if `drop_trailing_silence` is False (default). - drop_trailing_silence : bool, default: False - Whether to remove trailing silence from detected events. To avoid abrupt - cuts in speech, trailing silence should be kept, therefore this - parameter should be False. - strict_min_dur : bool, default: False - strict minimum duration. Do not accept an audio event if it is shorter - than `min_dur` even if it is contiguous to the latest valid event. 
This - happens if the the latest detected event had reached `max_dur`. + input : str, Path, bytes, AudioSource, AudioReader, AudioRegion, or None + Audio data input. If `str` or `Path`, it should be the path to an audio + file. Use "-" to indicate standard input. If bytes, the input is treated + as raw audio data. If None, audio is read from the microphone. + + Any input not of type `AudioReader` is converted into an `AudioReader` + before processing. If `input` is raw audio data (str, bytes, or None), + specify audio parameters using kwargs (e.g., `sampling_rate`, + `sample_width`, `channels`). + + For string inputs, audio format is inferred from the file extension, or + specify explicitly via `audio_format` or `fmt`. Otherwise, the backend + (currently only `pydub`) handles loading data. + + min_dur : float, default=0.2 + Minimum duration in seconds of a detected audio event. Higher values + can exclude very short utterances (e.g., single words like "yes" or + "no"). Lower values may increase the number of short audio events. + + max_dur : float, default=5 + Maximum duration in seconds for an audio event. Events longer than this + are truncated. If the remainder of a truncated event is shorter than + `min_dur`, it is included as a valid event if `strict_min_dur` is False; + otherwise, it is rejected. + + max_silence : float, default=0.3 + Maximum duration of continuous silence allowed within an audio event. + Multiple silent gaps of this duration may appear in a single event. + Trailing silence at the end of an event is kept if + `drop_trailing_silence` is False. + + drop_trailing_silence : bool, default=False + Whether to remove trailing silence from detected events. To prevent + abrupt speech cuts, it is recommended to keep trailing silence, so + default is False. + + strict_min_dur : bool, default=False + Whether to strictly enforce `min_dur` for all events, rejecting any + event shorter than `min_dur`, even if contiguous with a valid event. 
Other Parameters ---------------- - analysis_window, aw : float, default: 0.05 (50 ms) - duration of analysis window in seconds. A value between 0.01 (10 ms) and - 0.1 (100 ms) should be good for most use-cases. + analysis_window, aw : float, default=0.05 (50 ms) + Duration of analysis window in seconds. Values between 0.01 and 0.1 are + generally effective. + audio_format, fmt : str - type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be - used if `input` is a string path to an audio file. If not given, audio - type will be guessed from file name extension or from file header. + Type of audio data (e.g., wav, ogg, flac, raw). Used if `input` is a + file path. If not specified, audio format is inferred from the file + extension or header. + sampling_rate, sr : int - sampling rate of audio data. Required if `input` is a raw audio file, is - a bytes object or None (i.e., read from microphone). + Sampling rate of audio data, required if `input` is raw data (bytes or + None). + sample_width, sw : int - number of bytes used to encode one audio sample, typically 1, 2 or 4. - Required for raw data, see `sampling_rate`. + Number of bytes per audio sample (typically 1, 2, or 4). Required for + raw audio; see `sampling_rate`. + channels, ch : int - number of channels of audio data. Required for raw data, see - `sampling_rate`. + Number of audio channels. Required for raw data; see `sampling_rate`. + use_channel, uc : {None, "mix"} or int - which channel to use for split if `input` has multiple audio channels. - Regardless of which channel is used for splitting, returned audio events - contain data from *all* channels, just as `input`. - The following values are accepted: + Channel selection for splitting if `input` has multiple channels. All + channels are retained in detected events. Options: - - None (alias "any"): accept audio activity from any channel, even if - other channels are silent. This is the default behavior. 
+ - None or "any" (default): accept activity from any channel. + - "mix" or "average": mix all channels into a single averaged channel. + - int (0 <= value < channels): use the specified channel ID for splitting. - - "mix" ("avg" or "average"): mix down all channels (i.e. compute - average channel) and split the resulting channel. + large_file : bool, default=False + If True and `input` is a path to a wav or raw file, audio is processed + lazily. Otherwise, the entire file is loaded before splitting. Set to + True if file size exceeds available memory. - - int (0 <=, > `channels`): use one channel, specified by integer id, - for split. + max_read, mr : float, default=None + Maximum data read from source in seconds. Default is to read to end. - large_file : bool, default: False - If True, AND if `input` is a path to a *wav* of a *raw* audio file - (and only these two formats) then audio data is lazily loaded to memory - (i.e., one analysis window a time). Otherwise the whole file is loaded - to memory before split. Set to True if the size of the file is larger - than available memory. - max_read, mr : float, default: None, read until end of stream - maximum data to read from source in seconds. - validator, val : callable, DataValidator - custom data validator. If `None` (default), an `AudioEnergyValidator` is - used with the given energy threshold. Can be a callable or an instance - of `DataValidator` that implements `is_valid`. In either case, it'll be - called with with a window of audio data as the first parameter. - energy_threshold, eth : float, default: 50 - energy threshold for audio activity detection. Audio regions that have - enough windows of with a signal energy equal to or above this threshold - are considered valid audio events. 
Here we are referring to this amount - as the energy of the signal but to be more accurate, it is the log - energy of computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see - :class:`AudioEnergyValidator` and - :func:`calculate_energy_single_channel`). If `validator` is given, this - argument is ignored. + validator, val : callable or DataValidator, default=None + Custom validator for audio data. If None, uses `AudioEnergyValidator` + with the given `energy_threshold`. Should be callable or an instance of + `DataValidator` implementing `is_valid`. + + energy_threshold, eth : float, default=50 + Energy threshold for audio activity detection. Audio regions with + sufficient signal energy above this threshold are considered valid. + Calculated as the log energy: `20 * log10(sqrt(dot(x, x) / len(x)))`. + Ignored if `validator` is specified. Yields ------ AudioRegion - a generator of detected :class:`AudioRegion` s. + Generator yielding detected :class:`AudioRegion` instances. """ + if min_dur <= 0: raise ValueError(f"'min_dur' ({min_dur}) must be > 0") if max_dur <= 0: @@ -316,23 +320,24 @@ def make_silence(duration, sampling_rate=16000, sample_width=2, channels=1): - """Generate a silence of a specific duration. + """ + Generate a silence of specified duration. Parameters ---------- duration : float - silence duration in seconds. + Duration of silence in seconds. sampling_rate : int, optional - sampling rate of audio data, by default 16000. + Sampling rate of the audio data, default is 16000. sample_width : int, optional - number of bytes used to encode one audio sample, by default 2. + Number of bytes per audio sample, default is 2. channels : int, optional - number of channels of audio data, by default 1. + Number of audio channels, default is 1. Returns ------- AudioRegion - a "silent" AudioRegion of the desired duration. + A "silent" AudioRegion of the specified duration. 
""" size = round(duration * sampling_rate) * sample_width * channels data = b"\0" * size @@ -341,25 +346,22 @@ def split_and_join_with_silence(input, silence_duration, **kwargs): - """Split input audio and join (i.e. glue) the resulting regions with a - silence of duration `silence_duration`. This can be used to create audio - data with shortened or lengthened silence between audio events. - + """ + Split input audio and join (glue) the resulting regions with a specified + silence duration between them. This can be used to adjust the length of + silence between audio events, either shortening or lengthening pauses. Parameters ---------- silence_duration : float - silence duration in seconds. + Duration of silence in seconds between audio events. Returns ------- - AudioRegion, None - An AudioRegion with the desired between-events silence duration. - None if no audio event could be detected in input data. - - See also - -------- - :func:`load` + AudioRegion or None + An :meth:`AudioRegion` with the specified between-events silence + duration. Returns None if no audio events are detected in the input + data. """ regions = list(split(input, **kwargs)) if regions: @@ -374,33 +376,35 @@ duration, analysis_window, round_fn=round, epsilon=0 ): """ - Converts a given duration into a positive integer of analysis windows. - if `duration / analysis_window` is not an integer, the result will be - rounded to the closest bigger integer. If `duration == 0`, returns `0`. - If `duration < analysis_window`, returns 1. - `duration` and `analysis_window` can be in seconds or milliseconds but - must be in the same unit. + Helper function to convert a given duration into a positive integer + of analysis windows. If `duration / analysis_window` is not an integer, + the result will be rounded up to the nearest integer. If `duration == 0`, + returns 0. If `duration < analysis_window`, returns 1. 
+ + Both `duration` and `analysis_window` should be in the same units, + either seconds or milliseconds. Parameters ---------- duration : float - a given duration in seconds or ms. - analysis_window: float - size of analysis window, in the same unit as `duration`. - round_fn : callable - function called to round the result. Default: `round`. - epsilon : float - small value to add to the division result before rounding. - E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with - `round_fn=math.floor` returns `2` instead of `3`. Adding a small value - to `0.3 / 0.1` avoids this error. + The given duration in seconds or milliseconds. + analysis_window : float + The size of each analysis window, in the same units as `duration`. + round_fn : callable, optional + A function for rounding the result, default is `round`. + epsilon : float, optional + A small value added before rounding to address floating-point + precision issues, ensuring accurate rounding for cases like + `0.3 / 0.1`, where `round_fn=math.floor` would otherwise yield + an incorrect result. Returns ------- nb_windows : int - minimum number of `analysis_window`'s to cover `durartion`. That means - that `analysis_window * nb_windows >= duration`. + The minimum number of `analysis_window` units needed to cover + `duration`, ensuring `analysis_window * nb_windows >= duration`. """ + if duration < 0 or analysis_window <= 0: err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0" raise ValueError(err_msg.format(duration, analysis_window)) @@ -418,28 +422,28 @@ channels, ): """ - Helper function to create an `AudioRegion` from parameters returned by - tokenization object. It takes care of setting up region `start` and `end` - in metadata. + Helper function to create an :class:`AudioRegion` from parameters provided + by a tokenization object. This function handles setting the `start` and `end` + metadata for the region. 
Parameters ---------- - frame_duration: float - duration of analysis window in seconds + frame_duration : float + Duration of each analysis window in seconds. start_frame : int - index of the first analysis window + Index of the first analysis window. sampling_rate : int - sampling rate of audio data + Sampling rate of the audio data. sample_width : int - number of bytes of one audio sample + Number of bytes per audio sample. channels : int - number of channels of audio data + Number of audio channels. Returns ------- audio_region : AudioRegion - AudioRegion whose start time is calculated as: - `1000 * start_frame * frame_duration` + An AudioRegion object with `start` time calculated as: + `1000 * start_frame * frame_duration`. """ start = start_frame * frame_duration data = b"".join(data_frames) @@ -449,17 +453,18 @@ def _read_chunks_online(max_read, **kwargs): """ Helper function to read audio data from an online blocking source - (i.e., microphone). Used to build an `AudioRegion` and can intercept - KeyboardInterrupt so that reading stops as soon as this exception is - raised. Makes building `AudioRegion`s on [i]python sessions and jupyter - notebooks more user friendly. + (e.g., a microphone). This function builds an `AudioRegion` and can + intercept `KeyboardInterrupt` to stop reading immediately when the + exception is raised, making it more user-friendly for [i]Python sessions + and Jupyter notebooks. Parameters ---------- max_read : float - maximum amount of data to read in seconds. + Maximum duration of audio data to read, in seconds. kwargs : - audio parameters (sampling_rate, sample_width and channels). + Additional audio parameters (e.g., `sampling_rate`, `sample_width`, + and `channels`). 
""" reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs) reader.open() @@ -472,7 +477,7 @@ data.append(frame) except KeyboardInterrupt: # Stop data acquisition from microphone when pressing - # Ctrl+C on a [i]python session or a notebook + # Ctrl+C in an [i]python session or a notebook pass reader.close() return ( @@ -485,22 +490,25 @@ def _read_offline(input, skip=0, max_read=None, **kwargs): """ - Helper function to read audio data from an offline (i.e., file). Used to - build `AudioRegion`s. + Helper function to read audio data from an offline source (e.g., file). + This function is used to build :class:`AudioRegion` objects. Parameters ---------- - input : str, bytes - path to audio file (if str), or a bytes object representing raw audio - data. - skip : float, default 0 - amount of data to skip from the begining of audio source. - max_read : float, default: None - maximum amount of audio data to read. Default: None, means read until - end of stream. + input : str or bytes + Path to an audio file (if str) or a bytes object representing raw + audio data. + skip : float, optional, default=0 + Amount of data to skip from the beginning of the audio source, in + seconds. + max_read : float, optional, default=None + Maximum duration of audio data to read, in seconds. Default is None, + which reads until the end of the stream. kwargs : - audio parameters (sampling_rate, sample_width and channels). + Additional audio parameters (e.g., `sampling_rate`, `sample_width`, + and `channels`). """ + audio_source = get_audio_source(input, **kwargs) audio_source.open() if skip is not None and skip > 0: @@ -533,8 +541,9 @@ class _SecondsView: - """A class to create a view of `AudioRegion` that can be sliced using - indices in seconds. + """ + A class to create a view of an :class:`AudioRegion` that supports slicing + with time-based indices in seconds. 
""" def __init__(self, region): @@ -588,7 +597,7 @@ class _AudioRegionMetadata(dict): - """A class to store `AudioRegion`'s metadata.""" + """A class to store :class:`AudioRegion`'s metadata.""" def __getattr__(self, name): warnings.warn( @@ -617,27 +626,23 @@ @dataclass(frozen=True) class AudioRegion(object): """ - AudioRegion encapsulates raw audio data and provides an interface to - perform simple operations on it. Use `AudioRegion.load` to build an - `AudioRegion` from different types of objects. + `AudioRegion` encapsulates raw audio data and provides an interface for + performing basic audio operations. Use :meth:`AudioRegion.load` or + :func:`load` to create an `AudioRegion` from various input types. Parameters ---------- data : bytes - raw audio data as a bytes object + Raw audio data as a bytes object. sampling_rate : int - sampling rate of audio data + Sampling rate of the audio data. sample_width : int - number of bytes of one audio sample + Number of bytes per audio sample. channels : int - number of channels of audio data - start : float, default: None - optional start time of the region. This is typically provided by the - `split` function. - - See also - -------- - AudioRegion.load + Number of audio channels. + start : float, optional, default=None + Optional start time of the region, typically provided by the `split` + function. """ data: bytes @@ -677,18 +682,22 @@ @classmethod def load(cls, input, skip=0, max_read=None, **kwargs): """ - Create an `AudioRegion` by loading data from `input`. See :func:`load` - for parameters descripion. + Create an :class:`AudioRegion` by loading data from `input`. + + See :func:`load` for a full description of parameters. Returns ------- - region: AudioRegion + region : AudioRegion + An AudioRegion instance created from the specified input data. Raises ------ ValueError - raised if `input` is None and `skip` != 0 or `max_read` is None. + Raised if `input` is None and either `skip` is not 0 or `max_read` + is None. 
""" + if input is None: if skip > 0: raise ValueError( @@ -740,21 +749,21 @@ def play(self, progress_bar=False, player=None, **progress_bar_kwargs): """ - Play audio region. + Play the audio region. Parameters ---------- - progress_bar : bool, default: False - whether to use a progress bar while playing audio. Default: False. - `progress_bar` requires `tqdm`, if not installed, no progress bar - will be shown. - player : AudioPalyer, default: None - audio player to use. if None (default), use `player_for()` - to get a new audio player. - progress_bar_kwargs : kwargs - keyword arguments to pass to `tqdm` progress_bar builder (e.g., - use `leave=False` to clean up the screen when play finishes). + progress_bar : bool, optional, default=False + Whether to display a progress bar during playback. Requires `tqdm`, + if not installed, no progress bar will be shown. + player : AudioPlayer, optional, default=None + Audio player to use for playback. If None (default), a new player is + obtained via `player_for()`. + progress_bar_kwargs : dict, optional + Additional keyword arguments to pass to the `tqdm` progress bar + (e.g., `leave=False` to clear the bar from the screen upon completion). """ + if player is None: player = player_for(self) player.play(self.data, progress_bar=progress_bar, **progress_bar_kwargs) @@ -763,38 +772,39 @@ self, filename, audio_format=None, exists_ok=True, **audio_parameters ): """ - Save audio region to file. + Save the audio region to a file. Parameters ---------- - filename : str, Path - path to output audio file. If of type `str`, it may contain a - `{start}`, `{end}` and a `{duration}` placeholders. - Regions returned by `split` contain a `start` and and `end` - attributes that can be used to build output file name as in the - example. - audio_format : str, default: None - format used to save audio data. If None (default), format is guessed - from file name's extension. 
If file name has no extension, audio - data is saved as a raw (headerless) audio file. - exists_ok : bool, default: True - If True, overwrite `file` if a file with the same name exists. - If False, raise an `IOError` if `file` exists. - audio_parameters: dict - any keyword arguments to be passed to audio saving backend. + filename : str or Path + Path to the output audio file. If a string, it may include `{start}`, + `{end}`, and `{duration}` placeholders. Regions created by `split` + contain `start` and `end` attributes that can be used to format the + filename, as shown in the example. + audio_format : str, optional, default=None + Format used to save the audio data. If None (default), the format is + inferred from the file extension. If the filename has no extension, + the audio is saved as a raw (headerless) audio file. + exists_ok : bool, optional, default=True + If True, overwrite the file if it already exists. If False, raise an + `IOError` if the file exists. + audio_parameters : dict, optional + Additional keyword arguments to pass to the audio-saving backend. Returns ------- - file: str - name of output file with filled placehoders. + file : str + The output filename with placeholders filled in. + Raises - IOError if `filename` exists and `exists_ok` is False. - + ------ + IOError + If `filename` exists and `exists_ok` is False. Examples -------- - Create and AudioRegion, explicitly passing a value for `start`. `end` - will be computed based on `start` and the region's duration. + Create an AudioRegion, specifying `start`. The `end` will be computed + based on `start` and the region's duration. 
>>> region = AudioRegion(b'\0' * 2 * 24000, >>> sampling_rate=16000, @@ -809,6 +819,7 @@ >>> filename = region.save('audio_{start:.3f}-{end:.3f}_{duration:.3f}.wav') >>> assert filename == "audio_2.250-3.750_1.500.wav" """ + if isinstance(filename, Path): if not exists_ok and filename.exists(): raise FileExistsError( @@ -845,7 +856,8 @@ strict_min_dur=False, **kwargs, ): - """Split audio region. See :func:`auditok.split()` for a comprehensive + """ + Split audio region. See :func:`auditok.split` for a comprehensive description of split parameters. See Also :meth:`AudioRegio.split_and_plot`. """ @@ -873,25 +885,28 @@ dpi=120, theme="auditok", ): - """Plot audio region using one sub-plot per each channel. + """ + Plot the audio region with one subplot per channel. Parameters ---------- - scale_signal : bool, default: True - if true, scale signal by subtracting its mean and dividing by its + scale_signal : bool, optional, default=True + If True, scale the signal by subtracting its mean and dividing by its standard deviation before plotting. - show : bool - whether to show plotted signal right after the call. - figsize : tuple, default: None - width and height of the figure to pass to `matplotlib`. - save_as : str, default None. - if provided, also save plot to file. - dpi : int, default: 120 - plot dpi to pass to `matplotlib`. - theme : str or dict, default: "auditok" - plot theme to use. Currently only "auditok" theme is implemented. To - provide you own them see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`. + show : bool, optional, default=False + Whether to display the plot immediately after the function call. + figsize : tuple, optional, default=None + Width and height of the figure, passed to `matplotlib`. + save_as : str, optional, default=None + If specified, save the plot to the given filename. + dpi : int, optional, default=120 + Dots per inch (DPI) for the plot, passed to `matplotlib`. + theme : str or dict, optional, default="auditok" + Plot theme to use. 
Only the "auditok" theme is currently implemented. + To define a custom theme, refer to + :attr:`auditok.plotting.AUDITOK_PLOT_THEME`. """ + plot( self, scale_signal=scale_signal, @@ -917,9 +932,15 @@ theme="auditok", **kwargs, ): - """Split region and plot signal and detections. Alias: :meth:`splitp`. - See :func:`auditok.split()` for a comprehensive description of split - parameters. Also see :meth:`plot` for plot parameters. + """ + Split the audio region, then plot the signal and detected regions. + + Alias + ----- + :meth:`splitp` + + Refer to :func:`auditok.split()` for a detailed description of split + parameters, and to :meth:`plot` for plot-specific parameters. """ regions = self.split( min_dur=min_dur, @@ -1001,8 +1022,9 @@ @property def len(self): """ - Return region length in number of samples. + Return the length of the audio region in number of samples. """ + return len(self) def __bytes__(self): @@ -1021,10 +1043,12 @@ def __add__(self, other): """ - Concatenates this region and `other` and return a new region. - Both regions must have the same sampling rate, sample width - and number of channels. If not, raises a `ValueError`. + Concatenate this audio region with `other`, returning a new region. + + Both regions must have the same sampling rate, sample width, and number + of channels. If they differ, a `ValueError` is raised. """ + if not isinstance(other, AudioRegion): raise TypeError( "Can only concatenate AudioRegion, " @@ -1036,10 +1060,18 @@ def __radd__(self, other): """ - Concatenates `other` and this region. `other` should be an - `AudioRegion` with the same audio parameters as this region - but can exceptionally be `0` to make it possible to concatenate - many regions with `sum`. + Concatenate `other` with this audio region. + + Parameters + ---------- + other : AudioRegion or int + An `AudioRegion` with the same audio parameters as this region, or + `0` to enable concatenating multiple regions using `sum`. 
+ + Returns + ------- + AudioRegion + A new `AudioRegion` representing the concatenation result. """ if other == 0: return self @@ -1108,91 +1140,60 @@ class StreamTokenizer: """ - Class for stream tokenizers. It implements a 4-state automaton scheme - to extract sub-sequences of interest on the fly. + Class for stream tokenizers, implementing a 4-state automaton scheme + to extract relevant sub-sequences from a data stream in real-time. Parameters ---------- - validator : callable, DataValidator (must implement `is_valid`) - called with each data frame read from source. Should take one positional - argument and return True or False for valid and invalid frames - respectively. + validator : callable or :class:`DataValidator` (must implement `is_valid`). + Called with each data frame read from the source. Should take a + single argument and return True or False to indicate valid and + invalid frames, respectively. min_length : int - Minimum number of frames of a valid token. This includes all - tolerated non valid frames within the token. + Minimum number of frames in a valid token, including any tolerated + non-valid frames within the token. max_length : int - Maximum number of frames of a valid token. This includes all - tolerated non valid frames within the token. + Maximum number of frames in a valid token, including all tolerated + non-valid frames within the token. max_continuous_silence : int - Maximum number of consecutive non-valid frames within a token. - Note that, within a valid token, there may be many tolerated - *silent* regions that contain each a number of non valid frames up - to `max_continuous_silence` + Maximum number of consecutive non-valid frames within a token. Each + silent region may contain up to `max_continuous_silence` frames. - init_min : int - Minimum number of consecutive valid frames that must be - **initially** gathered before any sequence of non valid frames can - be tolerated. 
This option is not always needed, it can be used to - drop non-valid tokens as early as possible. **Default = 0** means - that the option is by default ineffective. + init_min : int, default=0 + Minimum number of consecutive valid frames required before + tolerating any non-valid frames. Helps discard non-valid tokens + early if needed. - init_max_silence : int - Maximum number of tolerated consecutive non-valid frames if the - number already gathered valid frames has not yet reached - 'init_min'.This argument is normally used if `init_min` is used. - **Default = 0**, by default this argument is not taken into - consideration. + init_max_silence : int, default=0 + Maximum number of tolerated consecutive non-valid frames before + reaching `init_min`. Used if `init_min` is specified. mode : int - mode can be one of the following: + Defines the tokenizer behavior with the following options: - -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and - accept a token shorter than `min_length` if it is the continuation - of the latest delivered token. + - `StreamTokenizer.NORMAL` (0, default): Do not drop trailing silence + and allow tokens shorter than `min_length` if they immediately follow + a delivered token. - -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered - because `max_length` is reached, and token `i+1` is immediately - adjacent to token `i` (i.e. token `i` ends at frame `k` and token - `i+1` starts at frame `k+1`) then accept token `i+1` only of it has - a size of at least `min_length`. The default behavior is to accept - token `i+1` event if it is shorter than `min_length` (provided that - the above conditions are fulfilled of course). + - `StreamTokenizer.STRICT_MIN_LENGTH` (2): If a token `i` is + delivered at `max_length`, any adjacent token `i+1` must meet + `min_length`. - -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing - non-valid frames from a token to be delivered if and only if it - is not **truncated**. 
This can be a bit tricky. A token is actually - delivered if: + - `StreamTokenizer.DROP_TRAILING_SILENCE` (4): Drop all trailing + non-valid frames from a token unless the token is truncated + (e.g., at `max_length`). - - `max_continuous_silence` is reached. - - - Its length reaches `max_length`. This is referred to as a - **truncated** token. - - In the current implementation, a `StreamTokenizer`'s decision is only - based on already seen data and on incoming data. Thus, if a token is - truncated at a non-valid but tolerated frame (`max_length` is reached - but `max_continuous_silence` not yet) any tailing silence will be kept - because it can potentially be part of valid token (if `max_length` was - bigger). But if `max_continuous_silence` is reached before - `max_length`, the delivered token will not be considered as truncated - but a result of *normal* end of detection (i.e. no more valid data). - In that case the trailing silence can be removed if you use the - `StreamTokenizer.DROP_TRAILING_SILENCE` mode. - - -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`: # noqa: B950 - use both options. That means: first remove tailing silence, then - check if the token still has a length of at least `min_length`. - + - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`: + Apply both `STRICT_MIN_LENGTH` and `DROP_TRAILING_SILENCE`. 
Examples -------- - - In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is - accepted although it is shorter than `min_length` (3), because it - immediately follows the latest delivered token: + In the following, without `STRICT_MIN_LENGTH`, the 'BB' token is + accepted even though it is shorter than `min_length` (3) because it + immediately follows the last delivered token: >>> from auditok.core import StreamTokenizer >>> from auditok.util import StringDataSource, DataValidator @@ -1200,42 +1201,43 @@ >>> class UpperCaseChecker(DataValidator): >>> def is_valid(self, frame): return frame.isupper() + >>> dsource = StringDataSource("aaaAAAABBbbb") - >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, - max_length=4, - max_continuous_silence=0) + >>> tokenizer = StreamTokenizer( + >>> validator=UpperCaseChecker(), + >>> min_length=3, + >>> max_length=4, + >>> max_continuous_silence=0 + >>> ) >>> tokenizer.tokenize(dsource) [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)] + Using `STRICT_MIN_LENGTH` mode rejects the 'BB' token: - The following tokenizer will however reject the 'BB' token: - - >>> dsource = StringDataSource("aaaAAAABBbbb") - >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(), - min_length=3, max_length=4, - max_continuous_silence=0, - mode=StreamTokenizer.STRICT_MIN_LENGTH) + >>> tokenizer = StreamTokenizer( + >>> validator=UpperCaseChecker(), + >>> min_length=3, + >>> max_length=4, + >>> max_continuous_silence=0, + >>> mode=StreamTokenizer.STRICT_MIN_LENGTH + >>> ) >>> tokenizer.tokenize(dsource) [(['A', 'A', 'A', 'A'], 3, 6)] - + With `DROP_TRAILING_SILENCE`, trailing silence is removed if not truncated: >>> tokenizer = StreamTokenizer( - >>> validator=UpperCaseChecker(), - >>> min_length=3, - >>> max_length=6, - >>> max_continuous_silence=3, - >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE - >>> ) + >>> validator=UpperCaseChecker(), + >>> min_length=3, + >>> max_length=6, + >>> 
max_continuous_silence=3, + >>> mode=StreamTokenizer.DROP_TRAILING_SILENCE + >>> ) >>> dsource = StringDataSource("aaaAAAaaaBBbbbb") >>> tokenizer.tokenize(dsource) [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)] - The first token is delivered with its tailing silence because it is - truncated while the second one has its tailing frames removed. - - Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be: + Without `DROP_TRAILING_SILENCE`, the output includes trailing frames: .. code:: python @@ -1243,7 +1245,6 @@ (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13) ] - """ SILENCE = 0 @@ -1336,32 +1337,41 @@ def tokenize(self, data_source, callback=None, generator=False): """ - Read data from `data_source`, one frame a time, and process the read - frames in order to detect sequences of frames that make up valid - tokens. + Read data from `data_source` one frame at a time and process each frame + to detect sequences that form valid tokens. - :Parameters: - `data_source` : instance of the :class:`DataSource` class that - implements a `read` method. 'read' should return a slice of - signal, i.e. frame (of whatever type as long as it can be - processed by validator) and None if there is no more signal. + Parameters + ---------- + data_source : DataSource + An instance of the :class:`DataSource` class that implements a `read` + method. `read` should return a slice of the signal (a frame of any + type that can be processed by the validator) or None when there is no + more data in the source. - `callback` : an optional 3-argument function. - If a `callback` function is given, it will be called each time - a valid token is found. + callback : callable, optional + A function that takes three arguments. If provided, `callback` is + called each time a valid token is detected. + generator : bool, optional, default=False + If True, the method yields tokens as they are detected, rather than + returning a list. 
If False, a list of tokens is returned. - :Returns: - A list of tokens if `callback` is None. Each token is tuple with the - following elements: + Returns + ------- + list of tuples or generator + A list of tokens if `generator` is False, or a generator yielding + tokens if `generator` is True. Each token is a tuple with the + following structure: - .. code python + .. code:: python (data, start, end) - where `data` is a list of read frames, `start`: index of the first - frame in the original data and `end` : index of the last frame. + where `data` is a list of frames in the token, `start` is the index + of the first frame in the original data, and `end` is the index of + the last frame. """ + token_gen = self._iter_tokens(data_source) if callback: for token in token_gen: