changeset 177:2acbdbd18327
Implement a generator version for tokenize
author      Amine Sehili <amine.sehili@gmail.com>
date        Sat, 16 Mar 2019 18:28:23 +0100
parents     c2fa3a12058e
children    11885f96acb2
files       auditok/core.py
diffstat    1 files changed, 37 insertions(+), 31 deletions(-)
--- a/auditok/core.py	Wed Mar 13 21:08:20 2019 +0100
+++ b/auditok/core.py	Sat Mar 16 18:28:23 2019 +0100
@@ -342,6 +342,7 @@
         self._current_frame = 0
 
     def set_mode(self, mode):
+        # TODO: use properties and make these deprecated
         """
         :Parameters:
 
@@ -355,7 +356,7 @@
 
            - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`
 
-           - `0`
+           - `0` TODO: this mode should have a name
 
         See `StreamTokenizer.__init__` for more information about the mode.
         """
@@ -389,7 +390,7 @@
         self._current_frame = -1
         self._deliver = self._append_token
 
-    def tokenize(self, data_source, callback=None):
+    def tokenize(self, data_source, callback=None, generator=False):
         """
         Read data from `data_source`, one frame a time, and process the read
         frames in order to detect sequences of frames that make up valid tokens.
@@ -416,25 +417,28 @@
            original data and `end` : index of the last frame.
         """
+        token_gen = self._iter_tokens(data_source)
+        if callback:
+            for token in token_gen:
+                callback(*token)
+            return
+        if generator:
+            return token_gen
+        return list(token_gen)
 
+    def _iter_tokens(self, data_source):
         self._reinitialize()
-
-        if callback is not None:
-            self._deliver = callback
-
         while True:
             frame = data_source.read()
+            self._current_frame += 1
             if frame is None:
+                token = self._post_process()
+                if token is not None:
+                    yield token
                 break
-            self._current_frame += 1
-            self._process(frame)
-
-        self._post_process()
-
-        if callback is None:
-            _ret = self._tokens
-            self._tokens = None
-            return _ret
+            token = self._process(frame)
+            if token is not None:
+                yield token
 
     def _process(self, frame):
@@ -452,7 +456,7 @@
                 if self._init_count >= self.init_min:
                     self._state = self.NOISE
                     if len(self._data) >= self.max_length:
-                        self._process_end_of_detection(True)
+                        return self._process_end_of_detection(True)
                 else:
                     self._state = self.POSSIBLE_NOISE
 
@@ -465,7 +469,7 @@
                 if self._init_count >= self.init_min:
                     self._state = self.NOISE
                     if len(self._data) >= self.max_length:
-                        self._process_end_of_detection(True)
+                        return self._process_end_of_detection(True)
 
             else:
                 self._silence_length += 1
@@ -483,14 +487,13 @@
             if frame_is_valid:
                 self._data.append(frame)
                 if len(self._data) >= self.max_length:
-                    self._process_end_of_detection(True)
+                    return self._process_end_of_detection(True)
 
             elif self.max_continuous_silence <= 0:
                 # max token reached at this frame will _deliver if _contiguous_token
                 # and not _strict_min_length
-                self._process_end_of_detection()
                 self._state = self.SILENCE
-
+                return self._process_end_of_detection()
             else:
                 # this is the first silent frame following a valid one
                 # and it is tolerated
@@ -498,7 +501,7 @@
                 self._data.append(frame)
                 self._state = self.POSSIBLE_SILENCE
                 if len(self._data) == self.max_length:
-                    self._process_end_of_detection(True)
+                    return self._process_end_of_detection(True)
                     # don't reset _silence_length because we still
                     # need to know the total number of silent frames
 
@@ -509,29 +512,28 @@
                 self._silence_length = 0
                 self._state = self.NOISE
                 if len(self._data) >= self.max_length:
-                    self._process_end_of_detection(True)
+                    return self._process_end_of_detection(True)
 
             else:
                 if self._silence_length >= self.max_continuous_silence:
+                    self._state = self.SILENCE
                     if self._silence_length < len(self._data):
                         # _deliver only gathered frames aren't all silent
-                        self._process_end_of_detection()
-                    else:
-                        self._data = []
-                    self._state = self.SILENCE
+                        return self._process_end_of_detection()
+                    self._data = []
                     self._silence_length = 0
                 else:
                     self._data.append(frame)
                     self._silence_length += 1
                     if len(self._data) >= self.max_length:
-                        self._process_end_of_detection(True)
+                        return self._process_end_of_detection(True)
                         # don't reset _silence_length because we still
                         # need to know the total number of silent frames
 
     def _post_process(self):
         if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
             if len(self._data) > 0 and len(self._data) > self._silence_length:
-                self._process_end_of_detection()
+                return self._process_end_of_detection()
 
     def _process_end_of_detection(self, truncated=False):
@@ -544,8 +546,11 @@
           (len(self._data) > 0 and
           not self._strict_min_length and self._contiguous_token):
 
-            _end_frame = self._start_frame + len(self._data) - 1
-            self._deliver(self._data, self._start_frame, _end_frame)
+            start_frame = self._start_frame
+            end_frame = self._start_frame + len(self._data) - 1
+            data = self._data
+            self._data = []
+            token = (data, start_frame, end_frame)
 
             if truncated:
                 # next token (if any) will start at _current_frame + 1
@@ -554,10 +559,11 @@
                 self._contiguous_token = True
            else:
                 self._contiguous_token = False
+            return token
         else:
             self._contiguous_token = False
 
         self._data = []
 
     def _append_token(self, data, start, end):
-        self._tokens.append((data, start, end))
+        self._tokens.append((data, start, end))
\ No newline at end of file
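
After this change, `tokenize()` delegates the state machine to the `_iter_tokens` generator and only chooses a delivery mode: it invokes `callback` per token, hands back the unconsumed generator when `generator=True`, or materializes a list as before (note that `callback` takes precedence over `generator` in the new dispatch). Below is a minimal usage sketch of the three modes; it assumes the `StringDataSource` and `DataValidator` helpers from `auditok.util` as used in the library's documented examples, and the hypothetical `UpperCaseChecker` validator is illustrative only:

    from auditok.core import StreamTokenizer
    from auditok.util import StringDataSource, DataValidator

    class UpperCaseChecker(DataValidator):
        # Treat upper-case characters as valid ("noise") frames,
        # everything else as silence.
        def is_valid(self, frame):
            return frame.isupper()

    tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                min_length=1, max_length=9999,
                                max_continuous_silence=0)

    # Default: eager; returns a list of (data, start_frame, end_frame) tuples.
    tokens = tokenizer.tokenize(StringDataSource("aaAAAbbBBB"))

    # generator=True: returns the underlying generator, tokens are
    # produced lazily as frames are read.
    for data, start, end in tokenizer.tokenize(StringDataSource("aaAAAbbBBB"),
                                               generator=True):
        print(data, start, end)

    # callback: called as callback(data, start_frame, end_frame) per token;
    # tokenize() itself returns None in this mode.
    tokenizer.tokenize(StringDataSource("aaAAAbbBBB"),
                       callback=lambda data, start, end: print(data, start, end))

Since `_process` and `_process_end_of_detection` now return tokens instead of pushing them through `self._deliver`, all three modes flow through the same generator, and tokens can be consumed as they are detected rather than only after the whole stream is read.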