Mercurial > hg > auditok
changeset 179:45875e458c7e
Implement split function
author | Amine Sehili <amine.sehili@gmail.com> |
---|---|
date | Sun, 17 Mar 2019 18:36:19 +0100 |
parents | 11885f96acb2 |
children | 8ee7f461b82e |
files | auditok/core.py |
diffstat | 1 files changed, 180 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
# Reconstructed from the unified diff shown on this changeset page:
#   --- a/auditok/core.py  Sat Mar 16 18:33:14 2019 +0100
#   +++ b/auditok/core.py  Sun Mar 17 18:36:19 2019 +0100
#   @@ -9,10 +9,188 @@
# Only the post-change side of the hunk is kept. The changeset replaced
#   from auditok.util import DataValidator
#   __all__ = ["AudioRegion", "StreamTokenizer"]
# with the import / __all__ lines below.
# Leading diff context: the end of the module docstring (`StreamTokenizer`
# followed by the closing triple quotes).

import math

from auditok.util import AudioDataSource, DataValidator, AudioEnergyValidator
from auditok.io import check_audio_data

__all__ = ["split", "AudioRegion", "StreamTokenizer"]


DEFAULT_ANALYSIS_WINDOW = 0.05
DEFAULT_ENERGY_THRESHOLD = 50

# Relative tolerance used to absorb floating point noise when a duration is
# (mathematically) an exact multiple of the analysis window.
_WINDOW_COUNT_REL_TOL = 1e-9


def split(
    input,
    min_dur=0.2,
    max_dur=5,
    max_silence=0.3,
    drop_trailing_silence=False,
    strict_min_length=False,
    analysis_window=0.01,
    **kwargs
):
    """Split audio data and return a generator of `AudioRegion`s.

    TODO: implement max_trailing_silence

    :Parameters:

    input: str, bytes, AudioSource, AudioRegion, AudioDataSource
        input audio data. If str, it should be a path to an existing audio
        file. If bytes, input is considered as raw audio data. If it is
        already an `AudioDataSource`, it is used as is and its own
        `block_dur` is used as the analysis window (`analysis_window` and
        the source-related keyword arguments are then ignored).
    min_dur: float
        minimum duration in seconds of a detected audio event. Default: 0.2.
        Using large values, very short audio events (e.g., very short 1-word
        utterances like 'yes' or 'no') can be missed.
        Using very short values might result in a high number of short,
        useless audio events.
    max_dur: float
        maximum duration in seconds of a detected audio event. Default: 5.
    max_silence: float
        maximum duration of consecutive silence within an audio event. There
        might be many silent gaps of this duration within an audio event.
        Default: 0.3.
    drop_trailing_silence: bool
        drop trailing silence from detected events. Default: False.
    strict_min_length: bool
        strict minimum length. Drop an event if it is shorter than `min_dur`
        even if it is contiguous to the latest valid event. This happens if
        the latest event had reached `max_dur`. Default: False.
    analysis_window: float
        duration of analysis window in seconds. Default: 0.01 second (10 ms).
        A value up to 0.1 second (100 ms) should be good for most use-cases.
        You might need a different value, especially if you use a custom
        validator.

    :Keyword parameters:

    audio_format: str
        type of audio data (e.g., wav, ogg, raw, etc.). This will only be used
        if `input` is a string path to an audio file. If not given, audio type
        will be guessed from file name extension or from file header.
    sampling_rate, sr: int
        sampling rate of audio data. Only needed for raw audio files/data.
    sample_width, sw: int
        number of bytes used to encode an audio sample, typically 1, 2 or 4.
        Only needed for raw audio files/data.
    channels, ch: int
        number of channels of audio data. Only needed for raw audio files.
    use_channel, uc: int, str
        which channel to use if input has multichannel audio data. Can be an
        int (0 being the first channel), or one of the following special str
        values:
            - 'left': first channel (equivalent to 0)
            - 'right': second channel (equivalent to 1)
            - 'mix': compute average channel
        Default: 0, use the first channel.
    max_read: float
        maximum data to read in seconds. Default: `None`, read until there is
        no more data to read.
    validator: DataValidator
        custom data validator. If `None` (default), an `AudioEnergyValidator`
        is used with the given energy threshold.
    energy_threshold: float
        energy threshold for audio activity detection, default: 50. If a
        custom validator is given, this argument will be ignored.

    :Returns:

    a generator of `AudioRegion` objects, one per detected audio event. The
    underlying source is opened before the generator is returned.
    """
    if isinstance(input, AudioDataSource):
        source = input
        # The tokenizer counts blocks delivered by the source, so durations
        # must be converted using the source's actual block duration.
        analysis_window = source.block_dur
    else:
        params = kwargs.copy()
        # `max_read` is passed explicitly below; leaving it in `params` would
        # raise "got multiple values for keyword argument 'max_read'".
        max_read = params.pop("max_read", None)
        # These two options are consumed by `split` itself (see below) and
        # are not meant for the audio source.
        params.pop("validator", None)
        params.pop("energy_threshold", None)
        if isinstance(input, AudioRegion):
            # An AudioRegion carries its own audio parameters; read its raw
            # bytes with exactly those parameters.
            params["sampling_rate"] = input.sr
            params["sample_width"] = input.sw
            params["channels"] = input.ch
            input = bytes(input)

        # The source must deliver blocks of exactly `analysis_window` seconds,
        # otherwise the window counts computed below would not match the
        # blocks seen by the tokenizer.
        source = AudioDataSource(
            input, block_dur=analysis_window, max_read=max_read, **params
        )

    validator = kwargs.get("validator")
    if validator is None:
        energy_threshold = kwargs.get(
            "energy_threshold", DEFAULT_ENERGY_THRESHOLD
        )
        validator = AudioEnergyValidator(source.sw, energy_threshold)

    mode = StreamTokenizer.DROP_TRAILING_SILENCE if drop_trailing_silence else 0
    if strict_min_length:
        mode |= StreamTokenizer.STRICT_MIN_LENGTH

    # The tokenizer works with numbers of analysis windows, not seconds.
    min_length = _duration_to_nb_windows(min_dur, analysis_window)
    max_length = _duration_to_nb_windows(max_dur, analysis_window)
    max_continuous_silence = _duration_to_nb_windows(
        max_silence, analysis_window
    )

    tokenizer = StreamTokenizer(
        validator, min_length, max_length, max_continuous_silence, mode=mode
    )
    source.open()
    token_gen = tokenizer.tokenize(source, generator=True)
    # token[0] holds the data frames of the event, token[1] the index of its
    # first analysis window.
    region_gen = (
        _make_audio_region(
            source.block_dur,
            token[1],
            token[0],
            source.sr,
            source.sw,
            source.ch,
        )
        for token in token_gen
    )
    return region_gen


def _duration_to_nb_windows(duration, analysis_window):
    """
    Convert a given duration into a number of analysis windows.
    If `duration / analysis_window` is not an integer, the result is rounded
    up to the next integer. If `duration == 0`, returns `0`. Any strictly
    positive duration yields at least `1`.
    `duration` and `analysis_window` can be in seconds or milliseconds but
    must be in the same unit.

    :Parameters:

    duration: float
        a given duration in seconds or ms, must be >= 0
    analysis_window: float
        size of analysis window, in the same unit as `duration`, must be > 0

    Returns:
    --------
    nb_windows: int
        minimum number of `analysis_window`'s to cover `duration`. That means
        that `analysis_window * nb_windows >= duration` (up to floating point
        rounding error).

    Raises:
    -------
    ValueError
        if `duration` is negative or `analysis_window` is not strictly
        positive.
    """
    if duration < 0 or analysis_window <= 0:
        raise ValueError(
            "'duration' ({}) must be >= 0 and 'analysis_window' ({}) > 0".format(
                duration, analysis_window
            )
        )
    if duration == 0:
        return 0
    ratio = float(duration) / analysis_window
    nearest = round(ratio)
    # Values such as 0.1 / 0.02 are exact multiples mathematically but not in
    # binary floating point; snap to the nearest integer when the difference
    # is only rounding noise so that an extra window is not added.
    if nearest > 0 and abs(ratio - nearest) <= _WINDOW_COUNT_REL_TOL * nearest:
        return int(nearest)
    return int(math.ceil(ratio))


def _make_audio_region(
    frame_duration,
    start_frame,
    data_frames,
    sampling_rate,
    sample_width,
    channels,
):
    """Create and return an `AudioRegion`.

    :Parameters:

    frame_duration: float
        duration of analysis window in seconds
    start_frame: int
        index of the first analysis window
    data_frames: iterable of bytes
        audio data of the consecutive analysis windows that make up the
        region; they are concatenated to build the region's data
    sampling_rate: int
        sampling rate of audio data
    sample_width: int
        number of bytes of one audio sample
    channels: int
        number of channels of audio data

    Returns:
    audio_region: AudioRegion
        AudioRegion whose start time is calculated as
        `start_frame * frame_duration` (same unit as `frame_duration`).
    """
    start = start_frame * frame_duration
    data = b"".join(data_frames)
    return AudioRegion(data, start, sampling_rate, sample_width, channels)


# Trailing diff context: the hunk ends on the header line of
# `class AudioRegion(object):`, whose body is not part of this changeset view.