Mercurial > hg > auditok
changeset 387:bd242e80455f
Update documentation and configuration
author | Amine Sehili <amine.sehili@gmail.com> |
---|---|
date | Tue, 02 Mar 2021 20:10:50 +0100 |
parents | c030134b7870 |
children | 5fd9b6b7ff0d 9e143e277d51 |
files | .pre-commit-config.yaml CHANGELOG README.rst auditok/core.py auditok/util.py demos/audio_tokenize_demo.py demos/audio_trim_demo.py demos/echo.py doc/_static/css/custom_style.css doc/conf.py doc/examples.rst pyproject.toml |
diffstat | 12 files changed, 53 insertions(+), 314 deletions(-) [+] |
line wrap: on
line diff
--- a/.pre-commit-config.yaml Mon Mar 01 23:11:49 2021 +0100 +++ b/.pre-commit-config.yaml Tue Mar 02 20:10:50 2021 +0100 @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: stable + rev: 20.8b1 hooks: - id: black language_version: python3.7
--- a/CHANGELOG Mon Mar 01 23:11:49 2021 +0100 +++ b/CHANGELOG Tue Mar 02 20:10:50 2021 +0100 @@ -7,9 +7,11 @@ - Implement split function as a high-level API for tokenization - Implement AudioRegion class for simple audio objects manipulation - Use a much faster energy computation method (based on standard audioop module) +- Make ADSFactory deprecated - Choose which channel(s) to use for tokenization - Save multi-channel audio data - Refactor code in all modules +- Use genty for tests - Improve documentation - Use ArgumentParser instead of OptionParser in command-line script - Clean up command-line script and move functions and workers to dedicated modules
--- a/README.rst Mon Mar 01 23:11:49 2021 +0100 +++ b/README.rst Tue Mar 02 20:10:50 2021 +0100 @@ -10,7 +10,7 @@ ``auditok`` is an **Audio Activity Detection** tool that can process online data (read from an audio device or from standard input) as well as audio files. -It can be used as a command line program or by calling its API. +It can be used as a command-line program or by calling its API. The latest version of the documentation can be found on `readthedocs. <https://readthedocs.org/projects/auditok/badge/?version=latest>`_
--- a/auditok/core.py Mon Mar 01 23:11:49 2021 +0100 +++ b/auditok/core.py Tue Mar 02 20:10:50 2021 +0100 @@ -240,9 +240,7 @@ validator = AudioEnergyValidator( energy_threshold, source.sw, source.ch, use_channel=use_channel ) - mode = ( - StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0 - ) + mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0 if strict_min_dur: mode |= StreamTokenizer.STRICT_MIN_LENGTH min_length = _duration_to_nb_windows(min_dur, analysis_window, math.ceil) @@ -532,8 +530,7 @@ class _AudioRegionMetadata(dict): - """A class to store `AudioRegion`'s metadata. - """ + """A class to store `AudioRegion`'s metadata.""" def __getattr__(self, name): if name in self: @@ -610,8 +607,7 @@ @meta.setter def meta(self, new_meta): - """Meta data of audio region. - """ + """Meta data of audio region.""" self._meta = _AudioRegionMetadata(new_meta) @classmethod @@ -658,8 +654,7 @@ @property def millis(self): - """A view to slice audio region by milliseconds (using ``region.millis[start:end]``). - """ + """A view to slice audio region by milliseconds (using ``region.millis[start:end]``).""" return self._millis_view @property @@ -673,38 +668,32 @@ @property def sampling_rate(self): - """Samling rate of audio data. - """ + """Samling rate of audio data.""" return self._sampling_rate @property def sr(self): - """Samling rate of audio data, alias for `sampling_rate`. - """ + """Samling rate of audio data, alias for `sampling_rate`.""" return self._sampling_rate @property def sample_width(self): - """Number of bytes per sample, one channel considered. - """ + """Number of bytes per sample, one channel considered.""" return self._sample_width @property def sw(self): - """Number of bytes per sample, alias for `sampling_rate`. - """ + """Number of bytes per sample, alias for `sampling_rate`.""" return self._sample_width @property def channels(self): - """Number of channels of audio data. - """ + """Number of channels of audio data.""" return self._channels @property def ch(self): - """Number of channels of audio data, alias for `channels`. - """ + """Number of channels of audio data, alias for `channels`.""" return self._channels def play(self, progress_bar=False, player=None, **progress_bar_kwargs): @@ -730,9 +719,7 @@ self._data, progress_bar=progress_bar, **progress_bar_kwargs ) - def save( - self, file, audio_format=None, exists_ok=True, **audio_parameters - ): + def save(self, file, audio_format=None, exists_ok=True, **audio_parameters): """ Save audio region to file. @@ -918,8 +905,7 @@ @property def samples(self): - """Audio region as arrays of samples, one array per channel. - """ + """Audio region as arrays of samples, one array per channel.""" if self._samples is None: self._samples = signal.to_array( self._data, self.sample_width, self.channels @@ -1005,9 +991,7 @@ def __truediv__(self, n): if not isinstance(n, int) or n <= 0: - raise TypeError( - "AudioRegion can only be divided by a positive int" - ) + raise TypeError("AudioRegion can only be divided by a positive int") samples_per_sub_region, rest = divmod(len(self), n) onset = 0 sub_regions = [] @@ -1232,9 +1216,7 @@ ) if min_length <= 0 or min_length > max_length: - err_msg = ( - "'min_length' must be > 0 and <= 'max_length' (value={0})" - ) + err_msg = "'min_length' must be > 0 and <= 'max_length' (value={0})" raise ValueError(err_msg.format(min_length)) if max_continuous_silence >= max_length:
--- a/auditok/util.py Mon Mar 01 23:11:49 2021 +0100 +++ b/auditok/util.py Tue Mar 02 20:10:50 2021 +0100 @@ -509,9 +509,7 @@ "asrc", None ) kwargs["fn"] = kwargs.pop("filename", None) or kwargs.pop("fn", None) - kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop( - "db", None - ) + kwargs["db"] = kwargs.pop("data_buffer", None) or kwargs.pop("db", None) record = kwargs.pop("record", False) if not record:
--- a/demos/audio_tokenize_demo.py Mon Mar 01 23:11:49 2021 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -""" -@author: Amine SEHILI <amine.sehili@gmail.com> -September, 2015 -""" - -from auditok import ( - ADSFactory, - AudioEnergyValidator, - StreamTokenizer, - player_for, - dataset, -) -import sys - -try: - - # We set the `record` argument to True so that we can rewind the source - asource = ADSFactory.ads( - filename=dataset.one_to_six_arabic_16000_mono_bc_noise, record=True - ) - - validator = AudioEnergyValidator( - sample_width=asource.get_sample_width(), energy_threshold=65 - ) - - # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate()) - # min_length=20 : minimum length of a valid audio activity is 20 * 10 == 200 ms - # max_length=400 : maximum length of a valid audio activity is 400 * 10 == 4000 ms == 4 seconds - # max_continuous_silence=30 : maximum length of a tolerated silence within a valid audio activity is 30 * 30 == 300 ms - tokenizer = StreamTokenizer( - validator=validator, - min_length=20, - max_length=400, - max_continuous_silence=30, - ) - - asource.open() - tokens = tokenizer.tokenize(asource) - - # Play detected regions back - player = player_for(asource) - - # Rewind and read the whole signal - asource.rewind() - original_signal = [] - - while True: - w = asource.read() - if w is None: - break - original_signal.append(w) - - original_signal = b"".join(original_signal) - player.play(original_signal) - - print("\n ** playing detected regions...\n") - for i, t in enumerate(tokens): - print( - "Token [{0}] starts at {1} and ends at {2}".format( - i + 1, t[1], t[2] - ) - ) - data = b"".join(t[0]) - player.play(data) - - assert len(tokens) == 8 - - asource.close() - player.stop() - -except KeyboardInterrupt: - - player.stop() - asource.close() - sys.exit(0) - -except Exception as e: - - sys.stderr.write(str(e) + "\n") - sys.exit(1)
--- a/demos/audio_trim_demo.py Mon Mar 01 23:11:49 2021 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,120 +0,0 @@ -""" -@author: Amine SEHILI <amine.sehili@gmail.com> -September, 2015 -""" - -# Trim leading and trailing silence from a record - -from auditok import ( - ADSFactory, - AudioEnergyValidator, - StreamTokenizer, - player_for, - dataset, -) -import pyaudio -import sys - -""" -The tokenizer in the following example is set up to remove the silence -that precedes the first acoustic activity or follows the last activity -in a record. It preserves whatever it founds between the two activities. -In other words, it removes the leading and trailing silence. - -Sampling rate is 44100 sample per second, we'll use an analysis window of 100 ms -(i.e. bloc_ksize == 4410) - -Energy threshold is 50. - -The tokenizer will start accumulating windows up from the moment it encounters -the first analysis window of an energy >= 50. ALL the following windows will be -kept regardless of their energy. At the end of the analysis, it will drop trailing - windows with an energy below 50. - -This is an interesting example because the audio file we're analyzing contains a very -brief noise that occurs within the leading silence. We certainly do want our tokenizer -to stop at this point and considers whatever it comes after as a useful signal. -To force the tokenizer to ignore that brief event we use two other parameters `init_min` -ans `init_max_silence`. By `init_min`=3 and `init_max_silence`=1 we tell the tokenizer -that a valid event must start with at least 3 noisy windows, between which there -is at most 1 silent window. - -Still with this configuration we can get the tokenizer detect that noise as a valid event -(if it actually contains 3 consecutive noisy frames). To circummvent this we use an enough -large analysis window (here of 100 ms) to ensure that the brief noise be surrounded by a much -longer silence and hence the energy of the overall analysis window will be below 50. - -When using a shorter analysis window (of 10ms for instance, block_size == 441), the brief -noise contributes more to energy calculation which yields an energy of over 50 for the window. -Again we can deal with this situation by using a higher energy threshold (55 for example) - -""" - -try: - # record = True so that we'll be able to rewind the source. - asource = ADSFactory.ads( - filename=dataset.was_der_mensch_saet_mono_44100_lead_tail_silence, - record=True, - block_size=4410, - ) - asource.open() - - original_signal = [] - # Read the whole signal - while True: - w = asource.read() - if w is None: - break - original_signal.append(w) - - original_signal = b"".join(original_signal) - - # rewind source - asource.rewind() - - # Create a validator with an energy threshold of 50 - validator = AudioEnergyValidator( - sample_width=asource.get_sample_width(), energy_threshold=50 - ) - - # Create a tokenizer with an unlimited token length and continuous silence within a token - # Note the DROP_TRAILING_SILENCE mode that will ensure removing trailing silence - trimmer = StreamTokenizer( - validator, - min_length=20, - max_length=99999999, - max_continuous_silence=9999999, - mode=StreamTokenizer.DROP_TRAILING_SILENCE, - init_min=3, - init_max_silence=1, - ) - - tokens = trimmer.tokenize(asource) - - # Make sure we only have one token - assert len(tokens) == 1, "Should have detected one single token" - - trimmed_signal = b"".join(tokens[0][0]) - - player = player_for(asource) - - print( - "\n ** Playing original signal (with leading and trailing silence)..." - ) - player.play(original_signal) - print("\n ** Playing trimmed signal...") - player.play(trimmed_signal) - - player.stop() - asource.close() - -except KeyboardInterrupt: - - player.stop() - asource.close() - sys.exit(0) - -except Exception as e: - - sys.stderr.write(str(e) + "\n") - sys.exit(1)
--- a/demos/echo.py Mon Mar 01 23:11:49 2021 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -from auditok import ( - ADSFactory, - AudioEnergyValidator, - StreamTokenizer, - player_for, -) -import pyaudio -import sys - -try: - - energy_threshold = 45 - duration = 10 # seconds - - if len(sys.argv) > 1: - energy_threshold = float(sys.argv[1]) - - if len(sys.argv) > 2: - duration = float(sys.argv[2]) - - # record = True so that we'll be able to rewind the source. - # max_time = 10: read 10 seconds from the microphone - asource = ADSFactory.ads(record=True, max_time=duration) - - validator = AudioEnergyValidator( - sample_width=asource.get_sample_width(), - energy_threshold=energy_threshold, - ) - tokenizer = StreamTokenizer( - validator=validator, - min_length=20, - max_length=250, - max_continuous_silence=30, - ) - - player = player_for(asource) - - def echo(data, start, end): - print("Acoustic activity at: {0}--{1}".format(start, end)) - player.play(b"".join(data)) - - asource.open() - - print( - "\n ** Make some noise (dur:{}, energy:{})...".format( - duration, energy_threshold - ) - ) - - tokenizer.tokenize(asource, callback=echo) - - asource.close() - player.stop() - -except KeyboardInterrupt: - - player.stop() - asource.close() - sys.exit(0) - -except Exception as e: - - sys.stderr.write(str(e) + "\n") - sys.exit(1)
--- a/doc/_static/css/custom_style.css Mon Mar 01 23:11:49 2021 +0100 +++ b/doc/_static/css/custom_style.css Tue Mar 02 20:10:50 2021 +0100 @@ -1,3 +1,7 @@ +div.wy-side-nav-search { + background-color: #000000; +} + div.wy-side-nav-search .version { color: #DDDDDD; font-weight: bold;
--- a/doc/conf.py Mon Mar 01 23:11:49 2021 +0100 +++ b/doc/conf.py Tue Mar 02 20:10:50 2021 +0100 @@ -137,7 +137,7 @@ html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] html_theme_options = { "logo_only": True, - "style_nav_header_background": "black", + "style_nav_header_background": "#000000", } # Theme options are theme-specific and customize the look and feel of a theme
--- a/doc/examples.rst Mon Mar 01 23:11:49 2021 +0100 +++ b/doc/examples.rst Tue Mar 02 20:10:50 2021 +0100 @@ -1,5 +1,5 @@ -Loading audio data ------------------- +Load audio data +--------------- Audio data is loaded with the :func:`load` function which can read from audio files, the microphone or use raw audio data. @@ -7,15 +7,15 @@ From a file =========== -If the first argument of :func:`load` is a string, it should be a path to an audio -file. +If the first argument of :func:`load` is a string, it should be a path to an +audio file. .. code:: python import auditok region = auditok.load("audio.ogg") -If input file contains a raw (headerless) audio data, passing `audio_format="raw"` +If input file contains raw (headerless) audio data, passing `audio_format="raw"` and other audio parameters (`sampling_rate`, `sample_width` and `channels`) is mandatory. In the following example we pass audio parameters with their short names: @@ -42,6 +42,8 @@ data = b"\0" * sr * sw * ch region = auditok.load(data, sr=sr, sw=sw, ch=ch) print(region) + # alternatively you can use + #region = auditok.AudioRegion(data, sr, sw, ch) output: @@ -74,15 +76,30 @@ Skip part of audio data ======================= -If the `skip` parameter is > 0, :func:`load` will skip that leading amount of audio -data: +If the `skip` parameter is > 0, :func:`load` will skip that amount in seconds +of leading audio data: .. code:: python import auditok region = auditok.load("audio.ogg", skip=2) # skip the first 2 seconds -This argument must be 0 when reading from the microphone. +This argument must be 0 when reading data from the microphone. + + +Limit the amount of read audio +============================== + +If the `max_read` parameter is > 0, :func:`load` will read at most that amount +in seconds of audio data: + +.. code:: python + + import auditok + region = auditok.load("audio.ogg", max_read=5) + assert region.duration <= 5 + +This argument is mandatory when reading data from the microphone. Basic split example @@ -188,8 +205,8 @@ seconds with the `max_read` argument. -Accessing recorded data after split ------------------------------------ +Access recorded data after split +-------------------------------- Using a :class:`Recorder` object you can get hold of acquired audio data: @@ -362,7 +379,7 @@ assert len(samples) == region.channels -If `numpy` is not installed you can use: +If `numpy` is installed you can use: .. code:: python