changeset 424:12a7f01c633b

Update docstrings
author Amine Sehili <amine.sehili@gmail.com>
date Sat, 19 Oct 2024 15:21:43 +0200
parents 88e99cfd9c4c
children 1b78211b7e07
files auditok/core.py
diffstat 1 files changed, 377 insertions(+), 367 deletions(-) [+]
line wrap: on
line diff
--- a/auditok/core.py	Sat Oct 19 14:13:28 2024 +0200
+++ b/auditok/core.py	Sat Oct 19 15:21:43 2024 +0200
@@ -4,6 +4,8 @@
 
     load
     split
+    make_silence
+    split_and_join_with_silence
     AudioRegion
     StreamTokenizer
 """
@@ -40,59 +42,65 @@
 
 
 def load(input, skip=0, max_read=None, **kwargs):
-    """Load audio data from a source and return it as an :class:`AudioRegion`.
+    """
+    Load audio data from a specified source and return it as an
+    :class:`AudioRegion`.
 
     Parameters
     ----------
-    input : None, str, bytes, AudioSource
-        source to read audio data from. If `str`, it should be a path to a
-        valid audio file. If `bytes`, it is used as raw audio data. If it is
-        "-", raw data will be read from stdin. If None, read audio data from
-        the microphone using PyAudio. If of type `bytes` or is a path to a
-        raw audio file then `sampling_rate`, `sample_width` and `channels`
-        parameters (or their alias) are required. If it's an
-        :class:`AudioSource` object it's used directly to read data.
+    input : None, str, Path, bytes, AudioSource
+        The source from which to read audio data. If a `str` or `Path`, it
+        should specify the path to a valid audio file. If `bytes`, it is
+        treated as raw audio data. If set to "-", raw data will be read from
+        standard input (stdin). If `None`, audio data is read from the
+        microphone using PyAudio. For `bytes` data or a raw audio file path,
+        `sampling_rate`, `sample_width`, and `channels` parameters (or their
+        aliases) must be specified. If an :class:`AudioSource` object is
+        provided, it is used directly to read data.
     skip : float, default: 0
-        amount, in seconds, of audio data to skip from source. If read from
-        a microphone, `skip` must be 0, otherwise a `ValueError` is raised.
+        Duration in seconds of audio data to skip from the beginning of the
+        source. When reading from a microphone, `skip` must be 0; otherwise,
+        a `ValueError` is raised.
     max_read : float, default: None
-        amount, in seconds, of audio data to read from source. If read from
-        microphone, `max_read` should not be None, otherwise a `ValueError` is
-        raised.
+        Duration in seconds of audio data to read from the source. When reading
+        from the microphone, `max_read` must not be `None`; otherwise, a
+        `ValueError` is raised.
     audio_format, fmt : str
-        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only
-        be used if `input` is a string path to an audio file. If not given,
-        audio type will be guessed from file name extension or from file
+        Format of the audio data (e.g., wav, ogg, flac, raw, etc.). This is
+        only used if `input` is a string path to an audio file. If not
+        provided, the audio format is inferred from the file's extension or
         header.
     sampling_rate, sr : int
-        sampling rate of audio data. Required if `input` is a raw audio file,
-        a `bytes` object or None (i.e., read from microphone).
+        Sampling rate of the audio data. Required if `input` is a raw audio
+        file, a `bytes` object, or `None` (i.e., when reading from the
+        microphone).
     sample_width, sw : int
-        number of bytes used to encode one audio sample, typically 1, 2 or 4.
-        Required for raw data, see `sampling_rate`.
+        Number of bytes used to encode a single audio sample, typically 1, 2,
+        or 4. Required for raw audio data; see `sampling_rate`.
     channels, ch : int
-        number of channels of audio data. Required for raw data, see
-        `sampling_rate`.
+        Number of channels in the audio data. Required for raw audio data;
+        see `sampling_rate`.
     large_file : bool, default: False
-        If True, AND if `input` is a path to a *wav* of a *raw* audio file
-        (and **only** these two formats) then audio file is not fully loaded to
-        memory in order to create the region (but the portion of data needed to
-        create the region is of course loaded to memory). Set to True if
-        `max_read` is significantly smaller then the size of a large audio file
-        that shouldn't be entirely loaded to memory.
+        If `True`, and `input` is a path to a *wav* or *raw* audio file, the
+        file is not fully loaded into memory to create the region (only the
+        necessary portion of data is loaded). This should be set to `True`
+        when `max_read` is much smaller than the total size of a large audio
+        file, to avoid loading the entire file into memory.
 
     Returns
     -------
-    region: AudioRegion
+    region : AudioRegion
 
     Raises
     ------
     ValueError
-        raised if `input` is None (i.e., read data from microphone) and `skip`
-        != 0 or `input` is None `max_read` is None (meaning that when reading
-        from the microphone, no data should be skipped, and maximum amount of
-        data to read should be explicitly provided).
+        Raised if `input` is `None` (i.e., reading from the microphone) and
+        `skip` is not 0, or if `max_read` is `None` when `input` is `None`.
+        This ensures that when reading from the microphone, no data is
+        skipped, and the maximum amount of data to read is explicitly
+        specified.
     """
+
     return AudioRegion.load(input, skip, max_read, **kwargs)
 
 
@@ -106,109 +114,105 @@
     **kwargs,
 ):
     """
-    Split audio data and return a generator of AudioRegions
+    Split audio data and return a generator of :class:`AudioRegion`s.
 
     Parameters
     ----------
-    input : str, bytes, AudioSource, AudioReader, AudioRegion or None
-        input audio data. If str, it should be a path to an existing audio file.
-        "-" is interpreted as standard input. If bytes, input is considered as
-        raw audio data. If None, read audio from microphone.
-        Every object that is not an `AudioReader` will be transformed into an
-        `AudioReader` before processing. If it is an `str` that refers to a raw
-        audio file, `bytes` or None, audio parameters should be provided using
-        kwargs (i.e., `sampling_rate`, `sample_width` and `channels` or their
-        alias).
-        If `input` is str then audio format will be guessed from file extension.
-        `audio_format` (alias `fmt`) kwarg can also be given to specify audio
-        format explicitly. If none of these options is available, rely on
-        backend (currently only pydub is supported) to load data.
-    min_dur : float, default: 0.2
-        minimum duration in seconds of a detected audio event. By using large
-        values for `min_dur`, very short audio events (e.g., very short 1-word
-        utterances like 'yes' or 'no') can be mis detected. Using a very small
-        value may result in a high number of too short audio events.
-    max_dur : float, default: 5
-        maximum duration in seconds of a detected audio event. If an audio event
-        lasts more than `max_dur` it will be truncated. If the continuation of a
-        truncated audio event is shorter than `min_dur` then this continuation
-        is accepted as a valid audio event if `strict_min_dur` is False.
-        Otherwise it is rejected.
-    max_silence : float, default: 0.3
-        maximum duration of continuous silence within an audio event. There
-        might be many silent gaps of this duration within one audio event. If
-        the continuous silence happens at the end of the event than it's kept as
-        part of the event if `drop_trailing_silence` is False (default).
-    drop_trailing_silence : bool, default: False
-        Whether to remove trailing silence from detected events. To avoid abrupt
-        cuts in speech, trailing silence should be kept, therefore this
-        parameter should be False.
-    strict_min_dur : bool, default: False
-        strict minimum duration. Do not accept an audio event if it is shorter
-        than `min_dur` even if it is contiguous to the latest valid event. This
-        happens if the the latest detected event had reached `max_dur`.
+    input : str, Path, bytes, AudioSource, AudioReader, AudioRegion, or None
+        Audio data input. If `str` or `Path`, it should be the path to an audio
+        file. Use "-" to indicate standard input. If bytes, the input is treated
+        as raw audio data. If None, audio is read from the microphone.
+
+        Any input not of type `AudioReader` is converted into an `AudioReader`
+        before processing. If `input` is raw audio data (str, bytes, or None),
+        specify audio parameters using kwargs (e.g., `sampling_rate`,
+        `sample_width`, `channels`).
+
+        For string inputs, audio format is inferred from the file extension, or
+        specify explicitly via `audio_format` or `fmt`. Otherwise, the backend
+        (currently only `pydub`) handles loading data.
+
+    min_dur : float, default=0.2
+        Minimum duration in seconds of a detected audio event. Higher values
+        can exclude very short utterances (e.g., single words like "yes" or
+        "no"). Lower values may increase the number of short audio events.
+
+    max_dur : float, default=5
+        Maximum duration in seconds for an audio event. Events longer than this
+        are truncated. If the remainder of a truncated event is shorter than
+        `min_dur`, it is included as a valid event if `strict_min_dur` is False;
+        otherwise, it is rejected.
+
+    max_silence : float, default=0.3
+        Maximum duration of continuous silence allowed within an audio event.
+        Multiple silent gaps of this duration may appear in a single event.
+        Trailing silence at the end of an event is kept if
+        `drop_trailing_silence` is False.
+
+    drop_trailing_silence : bool, default=False
+        Whether to remove trailing silence from detected events. To prevent
+        abrupt speech cuts, it is recommended to keep trailing silence, so
+        the default is False.
+
+    strict_min_dur : bool, default=False
+        Whether to strictly enforce `min_dur` for all events, rejecting any
+        event shorter than `min_dur`, even if contiguous with a valid event.
 
     Other Parameters
     ----------------
-    analysis_window, aw : float, default: 0.05 (50 ms)
-        duration of analysis window in seconds. A value between 0.01 (10 ms) and
-        0.1 (100 ms) should be good for most use-cases.
+    analysis_window, aw : float, default=0.05 (50 ms)
+        Duration of analysis window in seconds. Values between 0.01 and 0.1 are
+        generally effective.
+
     audio_format, fmt : str
-        type of audio data (e.g., wav, ogg, flac, raw, etc.). This will only be
-        used if `input` is a string path to an audio file. If not given, audio
-        type will be guessed from file name extension or from file header.
+        Type of audio data (e.g., wav, ogg, flac, raw). Used if `input` is a
+        file path. If not specified, audio format is inferred from the file
+        extension or header.
+
     sampling_rate, sr : int
-        sampling rate of audio data. Required if `input` is a raw audio file, is
-        a bytes object or None (i.e., read from microphone).
+        Sampling rate of the audio data. Required if `input` is a raw audio
+        file, a `bytes` object, or `None` (i.e., when reading from the
+        microphone).
+
     sample_width, sw : int
-        number of bytes used to encode one audio sample, typically 1, 2 or 4.
-        Required for raw data, see `sampling_rate`.
+        Number of bytes per audio sample (typically 1, 2, or 4). Required for
+        raw audio; see `sampling_rate`.
+
     channels, ch : int
-        number of channels of audio data. Required for raw data, see
-        `sampling_rate`.
+        Number of audio channels. Required for raw data; see `sampling_rate`.
+
     use_channel, uc : {None, "mix"} or int
-        which channel to use for split if `input` has multiple audio channels.
-        Regardless of which channel is used for splitting, returned audio events
-        contain data from *all* channels, just as `input`.
-        The following values are accepted:
+        Channel selection for splitting if `input` has multiple channels. All
+        channels are retained in detected events. Options:
 
-        - None (alias "any"): accept audio activity from any channel, even if
-          other channels are silent. This is the default behavior.
+        - None or "any" (default): accept activity from any channel.
+        - "mix" or "average": mix all channels into a single averaged channel.
+        - int (0 <= value < channels): use the specified channel ID for splitting.
 
-        - "mix" ("avg" or "average"): mix down all channels (i.e. compute
-          average channel) and split the resulting channel.
+    large_file : bool, default=False
+        If True and `input` is a path to a wav or raw file, audio is processed
+        lazily. Otherwise, the entire file is loaded before splitting. Set to
+        True if file size exceeds available memory.
 
-        - int (0 <=, > `channels`): use one channel, specified by integer id,
-          for split.
+    max_read, mr : float, default=None
+        Maximum amount of data to read from the source, in seconds. By
+        default, data is read until the end of the stream.
 
-    large_file : bool, default: False
-        If True, AND if `input` is a path to a *wav* of a *raw* audio file
-        (and only these two formats) then audio data is lazily loaded to memory
-        (i.e., one analysis window a time). Otherwise the whole file is loaded
-        to memory before split. Set to True if the size of the file is larger
-        than available memory.
-    max_read, mr : float, default: None, read until end of stream
-        maximum data to read from source in seconds.
-    validator, val : callable, DataValidator
-        custom data validator. If `None` (default), an `AudioEnergyValidator` is
-        used with the given energy threshold. Can be a callable or an instance
-        of `DataValidator` that implements `is_valid`. In either case, it'll be
-        called with with a window of audio data as the first parameter.
-    energy_threshold, eth : float, default: 50
-        energy threshold for audio activity detection. Audio regions that have
-        enough windows of with a signal energy equal to or above this threshold
-        are considered valid audio events. Here we are referring to this amount
-        as the energy of the signal but to be more accurate, it is the log
-        energy of computed as: `20 * log10(sqrt(dot(x, x) / len(x)))` (see
-        :class:`AudioEnergyValidator` and
-        :func:`calculate_energy_single_channel`). If `validator` is given, this
-        argument is ignored.
+    validator, val : callable or DataValidator, default=None
+        Custom validator for audio data. If None, uses `AudioEnergyValidator`
+        with the given `energy_threshold`. Should be callable or an instance of
+        `DataValidator` implementing `is_valid`.
+
+    energy_threshold, eth : float, default=50
+        Energy threshold for audio activity detection. Audio regions with
+        sufficient signal energy above this threshold are considered valid.
+        Calculated as the log energy: `20 * log10(sqrt(dot(x, x) / len(x)))`.
+        Ignored if `validator` is specified.
 
     Yields
     ------
     AudioRegion
-        a generator of detected :class:`AudioRegion` s.
+        Generator yielding detected :class:`AudioRegion` instances.
     """
+
     if min_dur <= 0:
         raise ValueError(f"'min_dur' ({min_dur}) must be > 0")
     if max_dur <= 0:
@@ -316,23 +320,24 @@
 
 
 def make_silence(duration, sampling_rate=16000, sample_width=2, channels=1):
-    """Generate a silence of a specific  duration.
+    """
+    Generate a silence of specified duration.
 
     Parameters
     ----------
     duration : float
-        silence duration in seconds.
+        Duration of silence in seconds.
     sampling_rate : int, optional
-        sampling rate of audio data, by default 16000.
+        Sampling rate of the audio data, default is 16000.
     sample_width : int, optional
-        number of bytes used to encode one audio sample, by default 2.
+        Number of bytes per audio sample, default is 2.
     channels : int, optional
-        number of channels of audio data, by default 1.
+        Number of audio channels, default is 1.
 
     Returns
     -------
     AudioRegion
-        a "silent" AudioRegion of the desired duration.
+        A "silent" AudioRegion of the specified duration.
     """
     size = round(duration * sampling_rate) * sample_width * channels
     data = b"\0" * size
@@ -341,25 +346,22 @@
 
 
 def split_and_join_with_silence(input, silence_duration, **kwargs):
-    """Split input audio and join (i.e. glue) the resulting regions with a
-    silence of duration `silence_duration`. This can be used to create audio
-    data with shortened or lengthened silence between audio events.
-
+    """
+    Split input audio and join (glue) the resulting regions with a specified
+    silence duration between them. This can be used to adjust the length of
+    silence between audio events, either shortening or lengthening pauses.
 
     Parameters
     ----------
     silence_duration : float
-        silence duration in seconds.
+        Duration of silence in seconds between audio events.
 
     Returns
     -------
-    AudioRegion, None
-        An AudioRegion with the desired between-events silence duration.
-        None if no audio event could be detected in input data.
-
-    See also
-    --------
-    :func:`load`
+    AudioRegion or None
+        An :class:`AudioRegion` with the specified between-events silence
+        duration. Returns None if no audio events are detected in the input
+        data.
     """
     regions = list(split(input, **kwargs))
     if regions:
@@ -374,33 +376,35 @@
     duration, analysis_window, round_fn=round, epsilon=0
 ):
     """
-    Converts a given duration into a positive integer of analysis windows.
-    if `duration / analysis_window` is not an integer, the result will be
-    rounded to the closest bigger integer. If `duration == 0`, returns `0`.
-    If `duration < analysis_window`, returns 1.
-    `duration` and `analysis_window` can be in seconds or milliseconds but
-    must be in the same unit.
+    Helper function to convert a given duration into a positive integer
+    of analysis windows. If `duration / analysis_window` is not an integer,
+    the result will be rounded up to the nearest integer. If `duration == 0`,
+    returns 0. If `duration < analysis_window`, returns 1.
+
+    Both `duration` and `analysis_window` should be in the same units,
+    either seconds or milliseconds.
 
     Parameters
     ----------
     duration : float
-        a given duration in seconds or ms.
-    analysis_window: float
-        size of analysis window, in the same unit as `duration`.
-    round_fn : callable
-        function called to round the result. Default: `round`.
-    epsilon : float
-        small value to add to the division result before rounding.
-        E.g., `0.3 / 0.1 = 2.9999999999999996`, when called with
-        `round_fn=math.floor` returns `2` instead of `3`. Adding a small value
-        to `0.3 / 0.1` avoids this error.
+        The given duration in seconds or milliseconds.
+    analysis_window : float
+        The size of each analysis window, in the same units as `duration`.
+    round_fn : callable, optional
+        A function for rounding the result, default is `round`.
+    epsilon : float, optional
+        A small value added before rounding to address floating-point
+        precision issues, ensuring accurate rounding for cases like
+        `0.3 / 0.1`, where `round_fn=math.floor` would otherwise yield
+        an incorrect result.
 
     Returns
     -------
     nb_windows : int
-        minimum number of `analysis_window`'s to cover `durartion`. That means
-        that `analysis_window * nb_windows >= duration`.
+        The minimum number of `analysis_window` units needed to cover
+        `duration`, ensuring `analysis_window * nb_windows >= duration`.
     """
+
     if duration < 0 or analysis_window <= 0:
         err_msg = "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0"
         raise ValueError(err_msg.format(duration, analysis_window))
@@ -418,28 +422,28 @@
     channels,
 ):
     """
-    Helper function to create an `AudioRegion` from parameters returned by
-    tokenization object. It takes care of setting up region `start` and `end`
-    in metadata.
+    Helper function to create an :class:`AudioRegion` from parameters provided
+    by a tokenization object. This function handles setting the `start` and `end`
+    metadata for the region.
 
     Parameters
     ----------
-    frame_duration: float
-        duration of analysis window in seconds
+    frame_duration : float
+        Duration of each analysis window in seconds.
     start_frame : int
-        index of the first analysis window
+        Index of the first analysis window.
     sampling_rate : int
-        sampling rate of audio data
+        Sampling rate of the audio data.
     sample_width : int
-        number of bytes of one audio sample
+        Number of bytes per audio sample.
     channels : int
-        number of channels of audio data
+        Number of audio channels.
 
     Returns
     -------
     audio_region : AudioRegion
-        AudioRegion whose start time is calculated as:
-        `1000 * start_frame * frame_duration`
+        An AudioRegion object with `start` time calculated as:
+        `1000 * start_frame * frame_duration`.
     """
     start = start_frame * frame_duration
     data = b"".join(data_frames)
@@ -449,17 +453,18 @@
 def _read_chunks_online(max_read, **kwargs):
     """
     Helper function to read audio data from an online blocking source
-    (i.e., microphone). Used to build an `AudioRegion` and can intercept
-    KeyboardInterrupt so that reading stops as soon as this exception is
-    raised. Makes building `AudioRegion`s on [i]python sessions and jupyter
-    notebooks more user friendly.
+    (e.g., a microphone). This function builds an `AudioRegion` and can
+    intercept `KeyboardInterrupt` to stop reading immediately when the
+    exception is raised, making it more user-friendly for [i]Python sessions
+    and Jupyter notebooks.
 
     Parameters
     ----------
     max_read : float
-        maximum amount of data to read in seconds.
+        Maximum duration of audio data to read, in seconds.
     kwargs :
-        audio parameters (sampling_rate, sample_width and channels).
+        Additional audio parameters (e.g., `sampling_rate`, `sample_width`,
+        and `channels`).
     """
     reader = AudioReader(None, block_dur=0.5, max_read=max_read, **kwargs)
     reader.open()
@@ -472,7 +477,7 @@
             data.append(frame)
     except KeyboardInterrupt:
         # Stop data acquisition from microphone when pressing
-        # Ctrl+C on a [i]python session or a notebook
+        # Ctrl+C in an [i]python session or a notebook
         pass
     reader.close()
     return (
@@ -485,22 +490,25 @@
 
 def _read_offline(input, skip=0, max_read=None, **kwargs):
     """
-    Helper function to read audio data from an offline (i.e., file). Used to
-    build `AudioRegion`s.
+    Helper function to read audio data from an offline source (e.g., file).
+    This function is used to build :class:`AudioRegion` objects.
 
     Parameters
     ----------
-    input : str, bytes
-        path to audio file (if str), or a bytes object representing raw audio
-        data.
-    skip : float, default 0
-        amount of data to skip from the begining of audio source.
-    max_read : float, default: None
-        maximum amount of audio data to read. Default: None, means read until
-        end of stream.
+    input : str or bytes
+        Path to an audio file (if str) or a bytes object representing raw
+        audio data.
+    skip : float, optional, default=0
+        Amount of data to skip from the beginning of the audio source, in
+        seconds.
+    max_read : float, optional, default=None
+        Maximum duration of audio data to read, in seconds. Default is None,
+        which reads until the end of the stream.
     kwargs :
-        audio parameters (sampling_rate, sample_width and channels).
+        Additional audio parameters (e.g., `sampling_rate`, `sample_width`,
+        and `channels`).
     """
+
     audio_source = get_audio_source(input, **kwargs)
     audio_source.open()
     if skip is not None and skip > 0:
@@ -533,8 +541,9 @@
 
 
 class _SecondsView:
-    """A class to create a view of `AudioRegion` that can be sliced using
-    indices in seconds.
+    """
+    A class to create a view of an :class:`AudioRegion` that supports slicing
+    with time-based indices in seconds.
     """
 
     def __init__(self, region):
@@ -588,7 +597,7 @@
 
 
 class _AudioRegionMetadata(dict):
-    """A class to store `AudioRegion`'s metadata."""
+    """A class to store :class:`AudioRegion`'s metadata."""
 
     def __getattr__(self, name):
         warnings.warn(
@@ -617,27 +626,23 @@
 @dataclass(frozen=True)
 class AudioRegion(object):
     """
-    AudioRegion encapsulates raw audio data and provides an interface to
-    perform simple operations on it. Use `AudioRegion.load` to build an
-    `AudioRegion` from different types of objects.
+    `AudioRegion` encapsulates raw audio data and provides an interface for
+    performing basic audio operations. Use :meth:`AudioRegion.load` or
+    :func:`load` to create an `AudioRegion` from various input types.
 
     Parameters
     ----------
     data : bytes
-        raw audio data as a bytes object
+        Raw audio data as a bytes object.
     sampling_rate : int
-        sampling rate of audio data
+        Sampling rate of the audio data.
     sample_width : int
-        number of bytes of one audio sample
+        Number of bytes per audio sample.
     channels : int
-        number of channels of audio data
-    start : float, default: None
-        optional start time of the region. This is typically provided by the
-        `split` function.
-
-    See also
-    --------
-    AudioRegion.load
+        Number of audio channels.
+    start : float, optional, default=None
+        Optional start time of the region, typically provided by the `split`
+        function.
     """
 
     data: bytes
@@ -677,18 +682,22 @@
     @classmethod
     def load(cls, input, skip=0, max_read=None, **kwargs):
         """
-        Create an `AudioRegion` by loading data from `input`. See :func:`load`
-        for parameters descripion.
+        Create an :class:`AudioRegion` by loading data from `input`.
+
+        See :func:`load` for a full description of parameters.
 
         Returns
         -------
-        region: AudioRegion
+        region : AudioRegion
+            An AudioRegion instance created from the specified input data.
 
         Raises
         ------
         ValueError
-            raised if `input` is None and `skip` != 0 or `max_read` is None.
+            Raised if `input` is None and either `skip` is not 0 or `max_read`
+            is None.
         """
+
         if input is None:
             if skip > 0:
                 raise ValueError(
@@ -740,21 +749,21 @@
 
     def play(self, progress_bar=False, player=None, **progress_bar_kwargs):
         """
-        Play audio region.
+        Play the audio region.
 
         Parameters
         ----------
-        progress_bar : bool, default: False
-            whether to use a progress bar while playing audio. Default: False.
-            `progress_bar` requires `tqdm`, if not installed, no progress bar
-            will be shown.
-        player : AudioPalyer, default: None
-            audio player to use. if None (default), use `player_for()`
-            to get a new audio player.
-        progress_bar_kwargs : kwargs
-            keyword arguments to pass to `tqdm` progress_bar builder (e.g.,
-            use `leave=False` to clean up the screen when play finishes).
+        progress_bar : bool, optional, default=False
+            Whether to display a progress bar during playback. Requires `tqdm`;
+            if not installed, no progress bar will be shown.
+        player : AudioPlayer, optional, default=None
+            Audio player to use for playback. If None (default), a new player is
+            obtained via `player_for()`.
+        progress_bar_kwargs : dict, optional
+            Additional keyword arguments to pass to the `tqdm` progress bar
+            (e.g., `leave=False` to clear the bar from the screen upon completion).
         """
+
         if player is None:
             player = player_for(self)
         player.play(self.data, progress_bar=progress_bar, **progress_bar_kwargs)
@@ -763,38 +772,39 @@
         self, filename, audio_format=None, exists_ok=True, **audio_parameters
     ):
         """
-        Save audio region to file.
+        Save the audio region to a file.
 
         Parameters
         ----------
-        filename : str, Path
-            path to output audio file. If of type `str`, it may contain a
-            `{start}`, `{end}` and a `{duration}` placeholders.
-            Regions returned by `split` contain a `start` and and `end`
-            attributes that can be used to build output file name as in the
-            example.
-        audio_format : str, default: None
-            format used to save audio data. If None (default), format is guessed
-            from file name's extension. If file name has no extension, audio
-            data is saved as a raw (headerless) audio file.
-        exists_ok : bool, default: True
-            If True, overwrite `file` if a file with the same name exists.
-            If False, raise an `IOError` if `file` exists.
-        audio_parameters: dict
-            any keyword arguments to be passed to audio saving backend.
+        filename : str or Path
+            Path to the output audio file. If a string, it may include `{start}`,
+            `{end}`, and `{duration}` placeholders. Regions created by `split`
+            contain `start` and `end` attributes that can be used to format the
+            filename, as shown in the example.
+        audio_format : str, optional, default=None
+            Format used to save the audio data. If None (default), the format is
+            inferred from the file extension. If the filename has no extension,
+            the audio is saved as a raw (headerless) audio file.
+        exists_ok : bool, optional, default=True
+            If True, overwrite the file if it already exists. If False, raise an
+            `IOError` if the file exists.
+        audio_parameters : dict, optional
+            Additional keyword arguments to pass to the audio-saving backend.
 
         Returns
         -------
-        file: str
-            name of output file with filled placehoders.
+        file : str
+            The output filename with placeholders filled in.
+
         Raises
-            IOError if `filename` exists and `exists_ok` is False.
-
+        ------
+        IOError
+            If `filename` exists and `exists_ok` is False.
 
         Examples
         --------
-        Create and AudioRegion, explicitly passing a value for `start`. `end`
-        will be computed based on `start` and the region's duration.
+        Create an AudioRegion, specifying `start`. The `end` will be computed
+        based on `start` and the region's duration.
 
         >>> region = AudioRegion(b'\0' * 2 * 24000,
         >>>                      sampling_rate=16000,
@@ -809,6 +819,7 @@
         >>> filename = region.save('audio_{start:.3f}-{end:.3f}_{duration:.3f}.wav')
         >>> assert filename == "audio_2.250-3.750_1.500.wav"
         """
+
         if isinstance(filename, Path):
             if not exists_ok and filename.exists():
                 raise FileExistsError(
@@ -845,7 +856,8 @@
         strict_min_dur=False,
         **kwargs,
     ):
-        """Split audio region. See :func:`auditok.split()` for a comprehensive
+        """
+        Split audio region. See :func:`auditok.split` for a comprehensive
         description of split parameters.
         See Also :meth:`AudioRegio.split_and_plot`.
         """
@@ -873,25 +885,28 @@
         dpi=120,
         theme="auditok",
     ):
-        """Plot audio region using one sub-plot per each channel.
+        """
+        Plot the audio region with one subplot per channel.
 
         Parameters
         ----------
-        scale_signal : bool, default: True
-            if true, scale signal by subtracting its mean and dividing by its
+        scale_signal : bool, optional, default=True
+            If True, scale the signal by subtracting its mean and dividing by its
             standard deviation before plotting.
-        show : bool
-            whether to show plotted signal right after the call.
-        figsize : tuple, default: None
-            width and height of the figure to pass to `matplotlib`.
-        save_as : str, default None.
-            if provided, also save plot to file.
-        dpi : int, default: 120
-            plot dpi to pass to `matplotlib`.
-        theme : str or dict, default: "auditok"
-            plot theme to use. Currently only "auditok" theme is implemented. To
-            provide you own them see :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
+        show : bool, optional, default=False
+            Whether to display the plot immediately after the function call.
+        figsize : tuple, optional, default=None
+            Width and height of the figure, passed to `matplotlib`.
+        save_as : str, optional, default=None
+            If specified, save the plot to the given filename.
+        dpi : int, optional, default=120
+            Dots per inch (DPI) for the plot, passed to `matplotlib`.
+        theme : str or dict, optional, default="auditok"
+            Plot theme to use. Only the "auditok" theme is currently implemented.
+            To define a custom theme, refer to
+            :attr:`auditok.plotting.AUDITOK_PLOT_THEME`.
         """
+
         plot(
             self,
             scale_signal=scale_signal,
@@ -917,9 +932,15 @@
         theme="auditok",
         **kwargs,
     ):
-        """Split region and plot signal and detections. Alias: :meth:`splitp`.
-        See :func:`auditok.split()` for a comprehensive description of split
-        parameters. Also see :meth:`plot` for plot parameters.
+        """
+        Split the audio region, then plot the signal and detected regions.
+
+        Alias
+        -----
+        :meth:`splitp`
+
+        Refer to :func:`auditok.split` for a detailed description of split
+        parameters, and to :meth:`plot` for plot-specific parameters.
         """
         regions = self.split(
             min_dur=min_dur,
@@ -1001,8 +1022,9 @@
     @property
     def len(self):
         """
-        Return region length in number of samples.
+        Return the length of the audio region in number of samples.
         """
+
         return len(self)
 
     def __bytes__(self):
@@ -1021,10 +1043,12 @@
 
     def __add__(self, other):
         """
-        Concatenates this region and `other` and return a new region.
-        Both regions must have the same sampling rate, sample width
-        and number of channels. If not, raises a `ValueError`.
+        Concatenate this audio region with `other`, returning a new region.
+
+        Both regions must have the same sampling rate, sample width, and number
+        of channels. If they differ, a `ValueError` is raised.
         """
+
         if not isinstance(other, AudioRegion):
             raise TypeError(
                 "Can only concatenate AudioRegion, "
@@ -1036,10 +1060,18 @@
 
     def __radd__(self, other):
         """
-        Concatenates `other` and this region. `other` should be an
-        `AudioRegion` with the same audio parameters as this region
-        but can exceptionally be `0` to make it possible to concatenate
-        many regions with `sum`.
+        Concatenate `other` with this audio region.
+
+        Parameters
+        ----------
+        other : AudioRegion or int
+            An `AudioRegion` with the same audio parameters as this region, or
+            `0` to enable concatenating multiple regions using `sum`.
+
+        Returns
+        -------
+        AudioRegion
+            A new `AudioRegion` representing the concatenation result.
         """
         if other == 0:
             return self
@@ -1108,91 +1140,60 @@
 
 class StreamTokenizer:
     """
-    Class for stream tokenizers. It implements a 4-state automaton scheme
-    to extract sub-sequences of interest on the fly.
+    Class for stream tokenizers, implementing a 4-state automaton scheme
+    to extract relevant sub-sequences from a data stream in real-time.
 
     Parameters
     ----------
-    validator : callable, DataValidator (must implement `is_valid`)
-        called with each data frame read from source. Should take one positional
-        argument and return True or False for valid and invalid frames
-        respectively.
+    validator : callable or :class:`DataValidator` (must implement `is_valid`)
+        Called with each data frame read from the source. Should take a
+        single argument and return True or False to indicate valid and
+        invalid frames, respectively.
 
     min_length : int
-        Minimum number of frames of a valid token. This includes all
-        tolerated non valid frames within the token.
+        Minimum number of frames in a valid token, including any tolerated
+        non-valid frames within the token.
 
     max_length : int
-        Maximum number of frames of a valid token. This includes all
-        tolerated non valid frames within the token.
+        Maximum number of frames in a valid token, including all tolerated
+        non-valid frames within the token.
 
     max_continuous_silence : int
-        Maximum number of consecutive non-valid frames within a token.
-        Note that, within a valid token, there may be many tolerated
-        *silent* regions that contain each a number of non valid frames up
-        to `max_continuous_silence`
+        Maximum number of consecutive non-valid frames within a token. A valid
+        token may contain several silent regions, each up to this many frames.
 
-    init_min : int
-        Minimum number of consecutive valid frames that must be
-        **initially** gathered before any sequence of non valid frames can
-        be tolerated. This option is not always needed, it can be used to
-        drop non-valid tokens as early as possible. **Default = 0** means
-        that the option is by default ineffective.
+    init_min : int, default=0
+        Minimum number of consecutive valid frames required before
+        tolerating any non-valid frames. Helps discard non-valid tokens
+        early if needed.
 
-    init_max_silence : int
-        Maximum number of tolerated consecutive non-valid frames if the
-        number already gathered valid frames has not yet reached
-        'init_min'.This argument is normally used if `init_min` is used.
-        **Default = 0**, by default this argument is not taken into
-        consideration.
+    init_max_silence : int, default=0
+        Maximum number of tolerated consecutive non-valid frames before
+        reaching `init_min`. Used if `init_min` is specified.
 
     mode : int
-        mode can be one of the following:
+        Defines the tokenizer behavior with the following options:
 
-            -1 `StreamTokenizer.NORMAL` : do not drop trailing silence, and
-            accept a token shorter than `min_length` if it is the continuation
-            of the latest delivered token.
+        - `StreamTokenizer.NORMAL` (0, default): Do not drop trailing silence
+          and allow tokens shorter than `min_length` if they immediately follow
+          a delivered token.
 
-            -2 `StreamTokenizer.STRICT_MIN_LENGTH`: if token `i` is delivered
-            because `max_length` is reached, and token `i+1` is immediately
-            adjacent to token `i` (i.e. token `i` ends at frame `k` and token
-            `i+1` starts at frame `k+1`) then accept token `i+1` only of it has
-            a size of at least `min_length`. The default behavior is to accept
-            token `i+1` event if it is shorter than `min_length` (provided that
-            the above conditions are fulfilled of course).
+        - `StreamTokenizer.STRICT_MIN_LENGTH` (2): If a token `i` is
+          delivered at `max_length`, any adjacent token `i+1` must meet
+          `min_length`.
 
-            -3 `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all tailing
-            non-valid frames from a token to be delivered if and only if it
-            is not **truncated**. This can be a bit tricky. A token is actually
-            delivered if:
+        - `StreamTokenizer.DROP_TRAILING_SILENCE` (4): Drop all trailing
+          non-valid frames from a token unless the token is truncated
+          (e.g., at `max_length`).
 
-                - `max_continuous_silence` is reached.
-
-                - Its length reaches `max_length`. This is referred to as a
-                  **truncated** token.
-
-            In the current implementation, a `StreamTokenizer`'s decision is only
-            based on already seen data and on incoming data. Thus, if a token is
-            truncated at a non-valid but tolerated frame (`max_length` is reached
-            but `max_continuous_silence` not yet) any tailing silence will be kept
-            because it can potentially be part of valid token (if `max_length` was
-            bigger). But if `max_continuous_silence` is reached before
-            `max_length`, the delivered token will not be considered as truncated
-            but a result of *normal* end of detection (i.e. no more valid data).
-            In that case the trailing silence can be removed if you use the
-            `StreamTokenizer.DROP_TRAILING_SILENCE` mode.
-
-            -4 `(StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE)`:  # noqa: B950
-            use both options. That means: first remove tailing silence, then
-            check if the token still has a length of at least `min_length`.
-
+        - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
+          Apply both `STRICT_MIN_LENGTH` and `DROP_TRAILING_SILENCE`.
 
     Examples
     --------
-
-    In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
-    accepted although it is shorter than `min_length` (3), because it
-    immediately follows the latest delivered token:
+    In the following, without `STRICT_MIN_LENGTH`, the 'BB' token is
+    accepted even though it is shorter than `min_length` (3) because it
+    immediately follows the last delivered token:
 
     >>> from auditok.core import StreamTokenizer
     >>> from auditok.util import StringDataSource, DataValidator
@@ -1200,42 +1201,43 @@
     >>> class UpperCaseChecker(DataValidator):
     >>>     def is_valid(self, frame):
                 return frame.isupper()
+
     >>> dsource = StringDataSource("aaaAAAABBbbb")
-    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-                                    min_length=3,
-                                    max_length=4,
-                                    max_continuous_silence=0)
+    >>> tokenizer = StreamTokenizer(
+    >>>     validator=UpperCaseChecker(),
+    >>>     min_length=3,
+    >>>     max_length=4,
+    >>>     max_continuous_silence=0
+    >>> )
     >>> tokenizer.tokenize(dsource)
     [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]
 
+    Using `STRICT_MIN_LENGTH` mode rejects the 'BB' token:
 
-    The following tokenizer will however reject the 'BB' token:
-
-    >>> dsource = StringDataSource("aaaAAAABBbbb")
-    >>> tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
-                                    min_length=3, max_length=4,
-                                    max_continuous_silence=0,
-                                    mode=StreamTokenizer.STRICT_MIN_LENGTH)
+    >>> tokenizer = StreamTokenizer(
+    >>>     validator=UpperCaseChecker(),
+    >>>     min_length=3,
+    >>>     max_length=4,
+    >>>     max_continuous_silence=0,
+    >>>     mode=StreamTokenizer.STRICT_MIN_LENGTH
+    >>> )
     >>> tokenizer.tokenize(dsource)
     [(['A', 'A', 'A', 'A'], 3, 6)]
 
-
+    With `DROP_TRAILING_SILENCE`, trailing silence is removed if not truncated:
 
     >>> tokenizer = StreamTokenizer(
-    >>>                validator=UpperCaseChecker(),
-    >>>                min_length=3,
-    >>>                max_length=6,
-    >>>                max_continuous_silence=3,
-    >>>                mode=StreamTokenizer.DROP_TRAILING_SILENCE
-    >>>                )
+    >>>     validator=UpperCaseChecker(),
+    >>>     min_length=3,
+    >>>     max_length=6,
+    >>>     max_continuous_silence=3,
+    >>>     mode=StreamTokenizer.DROP_TRAILING_SILENCE
+    >>> )
     >>> dsource = StringDataSource("aaaAAAaaaBBbbbb")
     >>> tokenizer.tokenize(dsource)
     [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]
 
-    The first token is delivered with its tailing silence because it is
-    truncated while the second one has its tailing frames removed.
-
-    Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:
+    Without `DROP_TRAILING_SILENCE`, the output includes trailing frames:
 
     .. code:: python
 
@@ -1243,7 +1245,6 @@
             (['A', 'A', 'A', 'a', 'a', 'a'], 3, 8),
             (['B', 'B', 'b', 'b', 'b'], 9, 13)
         ]
-
     """
 
     SILENCE = 0
@@ -1336,32 +1337,41 @@
 
     def tokenize(self, data_source, callback=None, generator=False):
         """
-        Read data from `data_source`, one frame a time, and process the read
-        frames in order to detect sequences of frames that make up valid
-        tokens.
+        Read data from `data_source` one frame at a time and process each frame
+        to detect sequences that form valid tokens.
 
-        :Parameters:
-           `data_source` : instance of the :class:`DataSource` class that
-               implements a `read` method. 'read' should return a slice of
-               signal, i.e. frame (of whatever type as long as it can be
-               processed by validator) and None if there is no more signal.
+        Parameters
+        ----------
+        data_source : DataSource
+            An instance of the :class:`DataSource` class that implements a `read`
+            method. `read` should return a slice of the signal (a frame of any
+            type that can be processed by the validator) or None when there is no
+            more data in the source.
 
-           `callback` : an optional 3-argument function.
-               If a `callback` function is given, it will be called each time
-               a valid token is found.
+        callback : callable, optional
+            A function that takes three arguments. If provided, `callback` is
+            called each time a valid token is detected.
 
+        generator : bool, optional, default=False
+            If True, the method yields tokens as they are detected, rather than
+            returning a list. If False, a list of tokens is returned.
 
-        :Returns:
-           A list of tokens if `callback` is None. Each token is tuple with the
-           following elements:
+        Returns
+        -------
+        list of tuples or generator
+            A list of tokens if `generator` is False, or a generator yielding
+            tokens if `generator` is True. Each token is a tuple with the
+            following structure:
 
-            .. code python
+            .. code:: python
 
                 (data, start, end)
 
-           where `data` is a list of read frames, `start`: index of the first
-           frame in the original data and `end` : index of the last frame.
+            where `data` is a list of frames in the token, `start` is the index
+            of the first frame in the original data, and `end` is the index of
+            the last frame.
         """
+
         token_gen = self._iter_tokens(data_source)
         if callback:
             for token in token_gen: