# HG changeset patch # User Amine Sehili # Date 1448406799 -3600 # Node ID c079c57dad692fece1c32a38183867e2a1af1138 # Parent 597494f0fd9aa3c68e78fd94af5da3f481ee57ee auditok as a command line program diff -r 597494f0fd9a -r c079c57dad69 auditok/cmdline.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/auditok/cmdline.py Wed Nov 25 00:13:19 2015 +0100 @@ -0,0 +1,678 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +auditok.auditok -- shortdesc + +auditok.auditok is a description + +It defines classes_and_methods + +@author: Mohamed El Amine SEHILI + +@copyright: 2015 Mohamed El Amine SEHILI + +@license: GPL v3 + +@contact: amine.sehili@gmail.com +@deffield updated: 24 nov 2015 +''' + +import sys +import os + +from optparse import OptionParser, OptionGroup +from threading import Thread +import tempfile +import wave +import time +import threading +from fileinput import filename + +try: + import future + from queue import Queue, Empty +except ImportError: + if sys.version_info >= (3, 0): + from queue import Queue, Empty + else: + from Queue import Queue, Empty + +try: + from pydub import AudioSegment + WITH_PYDUB = True +except ImportError: + WITH_PYDUB = False + + +from .core import StreamTokenizer +from .io import PyAudioSource, BufferAudioSource, StdinAudioSource, player_for +from .util import ADSFactory, AudioEnergyValidator + +__all__ = [] +__version__ = 0.1 +__date__ = '2015-11-23' +__updated__ = '2015-11-23' + +DEBUG = 0 +TESTRUN = 1 +PROFILE = 0 + +class AudioFileFormatError(Exception): + pass + + +class Worker(Thread): + + def __init__(self, timeout=0.2, debug=False): + self.timeout = timeout + self.debug = debug + self._inbox = Queue() + self._stop_request = Queue() + Thread.__init__(self) + + def _stop_requested(self): + + try: + message = self._stop_request.get_nowait() + if message == "stop": + return True + + except Empty: + return False + + def stop(self): + self._stop_request.put("stop") + self.join() + + def send(self, message): + self._inbox.put(message) + + def _get_message(self): + try: + message = self._inbox.get(timeout=self.timeout) + return message + except Empty: + return None + + +class TokenizerWorker(Worker): + + END_OF_PROCESSING = "END_OF_PROCESSING" + + def __init__(self, ads, tokenizer, analysis_window, observers): + Worker.__init__(self) + self.ads = ads + self.tokenizer = tokenizer + self.analysis_window = analysis_window + self.observers = observers + self._inbox = Queue() + self.count = 0 + + def run(self): + + def notify_observers(data, start, end): + audio_data = b''.join(data) + self.count += 1 + # notify observers + for observer in self.observers: + observer.notify({"id" : self.count, + "audio_data" : audio_data, + "start" : start, + "end" : end, + "start_time" : "{:5.2f}".format(start * self.analysis_window), + "end_time" : "{:5.2f}".format((end+1) * self.analysis_window), + "duration" : "{:5.2f}".format((end - start + 1) * self.analysis_window)} + ) + + self.ads.open() + self.tokenizer.tokenize(data_source=self, callback=notify_observers) + for observer in self.observers: + observer.notify(TokenizerWorker.END_OF_PROCESSING) + + def add_observer(self, observer): + self.observers.append(observer) + + def remove_observer(self, observer): + self.observers.remove(observer) + + def read(self): + if self._stop_requested(): + return None + else: + return self.ads.read() + + +class PlayerWorker(Worker): + + def __init__(self, player, timeout=0.2, debug=False): + self.player = player + Worker.__init__(self, timeout=timeout, debug=debug) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + dur = message.pop("duration", None) + _id = message.pop("id", None) + + if audio_data is not None: + self.player.play(audio_data) + if self.debug: + print("[PLAY]: playing detection {id} (start:{start}, end:{end}, dur:{dur})".format(id=_id, + start=start_time, end=end_time, dur=dur)) + + def notify(self, message): + self.send(message) + + +class CommandLineWorker(Worker): + + def __init__(self, command, timeout=0.2, debug=False): + self.command = command + Worker.__init__(self, timeout=timeout, debug=debug) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + _id = message.pop("id", None) + if audio_data is not None: + raw_audio_file = tempfile.NamedTemporaryFile(delete=False) + raw_audio_file.write(audio_data) + cmd = self.command.replace("$", raw_audio_file.name) + if self.debug: + print("[CMD]: (detection {id}) {cmd}".format(id=_id, cmd=cmd)) + os.system(cmd) + os.unlink(raw_audio_file.name) + + def notify(self, message): + self.send(message) + + +class TokenSaverWorker(Worker): + + def __init__(self, name_format, filetype, timeout=0.2, debug=False, **kwargs): + self.name_format = name_format + self.filetype = filetype + self.kwargs = kwargs + Worker.__init__(self, timeout=timeout, debug=debug) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + if message is not None: + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start_time = message.pop("start_time", None) + end_time = message.pop("end_time", None) + _id = message.pop("id", None) + if audio_data is not None and len(audio_data) > 0: + fname = self.name_format.format(N=_id, start = start_time, end = end_time) + try: + save_audio_data(audio_data, fname, filetype=self.filetype, **self.kwargs) + if self.debug: + print("[SAVE]: saving detection {id} to {fname}".format(id=_id, fname=fname)) + except Exception as e: + sys.stderr.write(str(e) + "\n") + + def notify(self, message): + self.send(message) + + +class LogWorker(Worker): + + def __init__(self, timeout=0.2): + self.detections = [] + Worker.__init__(self, timeout=timeout) + + def run(self): + while True: + if self._stop_requested(): + break + + message = self._get_message() + + if message is not None: + + if message == TokenizerWorker.END_OF_PROCESSING: + break + + audio_data = message.pop("audio_data", None) + start = message.pop("start", None) + end = message.pop("end", None) + if audio_data is not None and len(audio_data) > 0: + self.detections.append((start, end)) + + def notify(self, message): + self.send(message) + + +def file_to_audio_source(filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + rawdata = False + + if filetype is not None: + filetype = filetype.lower() + + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required for raw data") + + data = open(filename).read() + rawdata = True + + # try first with pydub + if WITH_PYDUB: + + use_channel = kwargs.pop("use_channel", None) + if use_channel is None: + use_channel = kwargs.pop("uc", None) + + if use_channel is None: + use_channel = 1 + else: + try: + use_channel = int(use_channel) + except ValueError: + pass + + if not isinstance(use_channel, (int)) and not use_channel.lower() in ["left", "right", "mix"] : + raise ValueError("channel must be an integer or one of 'left', 'right' or 'mix'") + + asegment = None + + if rawdata: + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + if filetype in("wave", "wav") or (filetype is None and lower_fname.endswith(".wav")): + asegment = AudioSegment.from_wav(filename) + elif filetype == "mp3" or (filetype is None and lower_fname.endswith(".mp3")): + asegment = AudioSegment.from_mp3(filename) + elif filetype == "ogg" or (filetype is None and lower_fname.endswith(".ogg")): + asegment = AudioSegment.from_ogg(filename) + elif filetype == "flv" or (filetype is None and lower_fname.endswith(".flv")): + asegment = AudioSegment.from_flv(filename) + else: + asegment = AudioSegment.from_file(filename) + + if asegment.channels > 1: + + if isinstance(use_channel, int): + if use_channel > asegment.channels: + raise ValueError("Can not use channel '{0}', audio file has only {1} channels".format(use_channel, asegment.channels)) + else: + asegment = asegment.split_to_mono()[use_channel - 1] + else: + ch_lower = use_channel.lower() + + if ch_lower == "mix": + asegment = asegment.set_channels(1) + + elif use_channel.lower() == "left": + asegment = asegment.split_to_mono()[0] + + elif use_channel.lower() == "right": + asegment = asegment.split_to_mono()[1] + + return BufferAudioSource(data_buffer = asegment._data, + sampling_rate = asegment.frame_rate, + sample_width = asegment.sample_width, + channels = asegment.channels) + # fall back to standard python + else: + if rawdata: + if ch != 1: + raise ValueError("Cannot handle multi-channel audio without pydub") + return BufferAudioSource(data, srate, swidth, ch) + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + + wfp = wave.open(filename) + + ch = wfp.getnchannels() + if ch != 1: + wfp.close() + raise ValueError("Cannot handle multi-channel audio without pydub") + + srate = wfp.getframerate() + swidth = wfp.getsampwidth() + data = wfp.readframes(wfp.getnframes()) + wfp.close() + return BufferAudioSource(data, srate, swidth, ch) + + raise AudioFileFormatError("Cannot read audio file format") + + +def save_audio_data(data, filename, filetype=None, **kwargs): + + lower_fname = filename.lower() + if filetype is not None: + filetype = filetype.lower() + + # save raw data + if filetype == "raw" or (filetype is None and lower_fname.endswith(".raw")): + fp = open(filename, "w") + fp.write(data) + fp.close() + return + + # save other types of data + # requires all audio parameters + srate = kwargs.pop("sampling_rate", None) + if srate is None: + srate = kwargs.pop("sr", None) + + swidth = kwargs.pop("sample_width", None) + if swidth is None: + swidth = kwargs.pop("sw", None) + + ch = kwargs.pop("channels", None) + if ch is None: + ch = kwargs.pop("ch", None) + + if None in (swidth, srate, ch): + raise Exception("All audio parameters are required to save no raw data") + + if filetype in ("wav", "wave") or (filetype is None and lower_fname.endswith(".wav")): + # use standard python's wave module + fp = wave.open(filename, "w") + fp.setnchannels(ch) + fp.setsampwidth(swidth) + fp.setframerate(srate) + fp.writeframes(data) + fp.close() + + elif WITH_PYDUB: + + asegment = AudioSegment(data, sample_width=swidth, frame_rate=srate, channels=ch) + asegment.export(filename, format=filetype) + + else: + raise AudioFileFormatError("cannot write file format {0} (file name: {1})".format(filetype, filename)) + + +def plot_all(signal, sampling_rate, energy_as_amp, detections=[]): + + import matplotlib.pyplot as plt + import numpy as np + t = np.arange(0., np.ceil(float(len(signal))) / sampling_rate, 1./sampling_rate ) + if len(t) > len(signal): + t = t[: len(signal) - len(t)] + + for start, end in detections: + p = plt.axvspan(start, end, facecolor='g', ec = 'r', lw = 2, alpha=0.4) + + line = plt.axhline(y=energy_as_amp, lw=1, ls="--", c="r", label="Energy threshold as normalized amplitude") + plt.plot(t, signal) + legend = plt.legend(["Energy threshold as normalized amplitude"], loc=1, fontsize=28) + ax = plt.gca().add_artist(legend) + + plt.xlabel("Time (s)", fontsize=32) + plt.ylabel("Amplitude (normalized)", fontsize=32) + plt.show() + + +def main(argv=None): + '''Command line options.''' + + program_name = os.path.basename(sys.argv[0]) + program_version = "v0.1" + program_build_date = "%s" % __updated__ + + program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) + #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse + program_longdesc = '''''' # optional - give further explanation about what the program does + program_license = "Copyright 2015 user_name (organization_name) \ + Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0" + + if argv is None: + argv = sys.argv[1:] + try: + # setup option parser + parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) + + group = OptionGroup(parser, "[Input-Outpu options]") + group.add_option("-i", "--input", dest="input", help="Input audio or video file. Use - for stdin [default: read from microphone using pyaudio]", metavar="FILE") + group.add_option("-t", "--input-type", dest="input_type", help="Input audio file type. Mandatory if file name has no extension [default: %default]", type=str, default=None, metavar="String") + group.add_option("-M", "--max_time", dest="max_time", help="Max data (in seconds) to read from microphone/file [default: read until the end of file/stream]", type=float, default=None, metavar="FLOAT") + group.add_option("-O", "--output-main", dest="output_main", help="Save main stream as. If omitted main stream will not be saved [default: omitted]", type=str, default=None, metavar="FILE") + group.add_option("-o", "--output-tokens", dest="output_tokens", help="Output file name format for detections. Use {N} and {start} and {end} to build file names, example: 'Det_{N}_{start}-{end}.wav'", type=str, default=None, metavar="STRING") + group.add_option("-T", "--output-type", dest="output_type", help="Audio type used to save detections and/or main stream. If not supplied will: (1). guess from extension or (2). use wav format", type=str, default=None, metavar="STRING") + group.add_option("-u", "--use-channel", dest="use_channel", help="Choose channel to use from a multi-channel audio file (requires pydub). 'left', 'right' and 'mix' are accepted values. [Default: 1 (i.e. 1st/left channel)]", type=str, default="1", metavar="STRING") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Tokenization options]", "Set tokenizer options and energy threshold.") + group.add_option("-a", "--analysis-window", dest="analysis_window", help="Size of analysis window in seconds [default: %default (10ms)]", type=float, default=0.01, metavar="FLOAT") + group.add_option("-n", "--min-duration", dest="min_duration", help="Min duration of a valid audio event in seconds [default: %default]", type=float, default=0.2, metavar="FLOAT") + group.add_option("-m", "--max-duration", dest="max_duration", help="Max duration of a valid audio event in seconds [default: %default]", type=float, default=5, metavar="FLOAT") + group.add_option("-s", "--max-silence", dest="max_silence", help="Max duration of a consecutive silence within a valid audio event in seconds [default: %default]", type=float, default=0.3, metavar="FLOAT") + parser.add_option("-d", "--drop-trailing-silence", dest="drop_trailing_silence", help="Drop trailing silence from a detection [default: do not play]", action="store_true", default=False) + group.add_option("-e", "--energy-threshold", dest="energy_threshold", help="Log energy threshold for detection [default: %default]", type=float, default=45, metavar="FLOAT") + parser.add_option_group(group) + + + group = OptionGroup(parser, "[Audio parameters]", "Define audio parameters if data is read from a headerless file (raw or stdin) or you want to use different microphone parameters.") + group.add_option("-r", "--rate", dest="sampling_rate", help="Sampling rate of audio data [default: %default]", type=int, default=16000, metavar="INT") + group.add_option("-c", "--channels", dest="channels", help="Number of channels of audio data [default: %default]", type=int, default=1, metavar="INT") + group.add_option("-w", "--width", dest="sample_width", help="Number of bytes per audio sample [default: %default]", type=int, default=2, metavar="INT") + parser.add_option_group(group) + + parser.add_option("-C", "--command", dest="command", help="Command to call when an audio detection occurs. Use $ to represent the file name to use with the command", default=None, type=str, metavar="STRING") + parser.add_option("-E", "--echo", dest="echo", help="Play back each detection immediately using pyaudio [default: do not play]", action="store_true", default=False) + parser.add_option("-p", "--plot", dest="plot", help="Plot audio signal and detections (requires matplotlib)", action="store_true", default=False) + parser.add_option("-D", "--debug", dest="debug", help="Print operations to STDOUT", action="store_true", default=False) + + + # process options + (opts, args) = parser.parse_args(argv) + + + if (opts.output_tokens, opts.command, opts.echo, opts.plot, opts.debug) == (None, None, False, False, False): + # nothing to do with audio data + sys.stderr.write("Nothing to do!\nType -h for more information\n") + sys.exit(1) + + print(opts.debug) + + + if opts.input == "-": + asource = StdinAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels) + #read data from a file + elif opts.input is not None: + asource = file_to_audio_source(filename=opts.input, filetype=opts.input_type, uc=opts.use_channel) + + # read data from microphone via pyaudio + else: + try: + asource = PyAudioSource(sampling_rate = opts.sampling_rate, + sample_width = opts.sample_width, + channels = opts.channels) + except Exception: + sys.stderr.write("Cannot read data from audio device!\n") + sys.stderr.write("You should either install pyaudio or read data from STDIN\n") + sys.exit(2) + + record = opts.output_main is not None or opts.plot + + ads = ADSFactory.ads(audio_source = asource, block_dur = opts.analysis_window, max_time = opts.max_time, record = record) + validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=opts.energy_threshold) + + + if opts.drop_trailing_silence: + mode = StreamTokenizer.DROP_TRAILING_SILENCE + else: + mode = 0 + tokenizer = StreamTokenizer(validator=validator, min_length=opts.min_duration * 100, + max_length=int(opts.max_duration * 100), + max_continuous_silence=opts.max_silence * 100, + mode = mode) + + + observers = [] + tokenizer_worker = None + + if opts.output_tokens is not None: + + try: + # check user format is correct + fname = opts.output_tokens.format(N=0, start=0, end=0) + + # find file type for detections + tok_type = opts.output_type + if tok_type is None: + tok_type = os.path.splitext(opts.output_tokens)[1][1:] + if tok_type == "": + tok_type = "wav" + + token_saver = TokenSaverWorker(name_format = opts.output_tokens, filetype = tok_type, + debug = opts.debug, sr=asource.get_sampling_rate(), + sw = asource.get_sample_width(), + ch = asource.get_channels()) + observers.append(token_saver) + + except Exception: + sys.stderr.write("Wrong format for detections file name: '{0}'\n".format(opts.output_tokens)) + sys.exit(2) + + if opts.echo: + try: + player = player_for(asource) + player_worker = PlayerWorker(player = player, debug = opts.debug) + observers.append(player_worker) + except Exception: + sys.stderr.write("Cannot get a audio player!\n") + sys.stderr.write("You should either install pyaudio or supply a command (-C option) to play audio\n") + sys.exit(2) + + if opts.command is not None and len(opts.command) > 0: + cmd_worker = CommandLineWorker(command = opts.command, debug = opts.debug) + observers.append(cmd_worker) + + if opts.plot: + log_worker = LogWorker() + observers.append(log_worker) + + tokenizer_worker = TokenizerWorker(ads, tokenizer, opts.analysis_window, observers) + + def _save_main_stream(): + # find file type + main_type = opts.output_type + if main_type is None: + main_type = os.path.splitext(opts.output_main)[1][1:] + if main_type == "": + main_type = "wav" + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + if len(data) > 0: + save_audio_data(data=data, filename=opts.output_main, filetype=main_type, sr=asource.get_sampling_rate(), + sw = asource.get_sample_width(), + ch = asource.get_channels()) + + def _plot(): + import numpy as np + ads.close() + ads.rewind() + data = ads.get_audio_source().get_data_buffer() + signal = AudioEnergyValidator._convert(data, asource.get_sample_width()) + detections = [(det[0] * opts.analysis_window, det[1] * opts.analysis_window) for det in log_worker.detections] + max_amplitude = 2**(asource.get_sample_width() * 8 - 1) - 1 + energy_as_amp = np.sqrt(np.exp(opts.energy_threshold * np.log(10) / 10)) / max_amplitude + plot_all(signal / max_amplitude, asource.get_sampling_rate(), energy_as_amp, detections) + + # start observer threads + for obs in observers: + obs.start() + # start tokenization threads + tokenizer_worker.start() + + while True: + time.sleep(1) + if len(threading.enumerate()) == 1: + break + + tokenizer_worker = None + + if opts.output_main is not None: + _save_main_stream() + if opts.plot: + _plot() + + return 0 + + except KeyboardInterrupt: + + if tokenizer_worker is not None: + tokenizer_worker.stop() + for obs in observers: + obs.stop() + + if opts.output_main is not None: + _save_main_stream() + if opts.plot: + _plot() + + return 0 + + except Exception as e: + sys.stderr.write(program_name + ": " + repr(e) + "\n") + sys.stderr.write("for help use -h\n") + + return 2 + + +if __name__ == "__main__": + if DEBUG: + sys.argv.append("-h") + if TESTRUN: + import doctest + doctest.testmod() + if PROFILE: + import cProfile + import pstats + profile_filename = 'auditok.auditok_profile.txt' + cProfile.run('main()', profile_filename) + statsfile = open("profile_stats.txt", "wb") + p = pstats.Stats(profile_filename, stream=statsfile) + stats = p.strip_dirs().sort_stats('cumulative') + stats.print_stats() + statsfile.close() + sys.exit(0) + sys.exit(main()) diff -r 597494f0fd9a -r c079c57dad69 auditok/io.py --- a/auditok/io.py Tue Nov 24 02:40:43 2015 +0100 +++ b/auditok/io.py Wed Nov 25 00:13:19 2015 +0100 @@ -6,7 +6,6 @@ """ from abc import ABCMeta, abstractmethod -from six import with_metaclass import wave import sys diff -r 597494f0fd9a -r c079c57dad69 doc/figures/figure_1.png Binary file doc/figures/figure_1.png has changed diff -r 597494f0fd9a -r c079c57dad69 doc/figures/figure_2.png Binary file doc/figures/figure_2.png has changed diff -r 597494f0fd9a -r c079c57dad69 setup.py --- a/setup.py Tue Nov 24 02:40:43 2015 +0100 +++ b/setup.py Wed Nov 25 00:13:19 2015 +0100 @@ -1,3 +1,4 @@ +import sys import re import ast from setuptools import setup @@ -5,9 +6,18 @@ _version_re = re.compile(r'__version__\s+=\s+(.*)') -with open('auditok/__init__.py', 'rb') as f: - version = str(ast.literal_eval(_version_re.search( - f.read().decode('utf-8')).group(1))) +if sys.version_info >= (3, 0): + with open('auditok/__init__.py', 'rt') as f: + version = str(ast.literal_eval(_version_re.search( + f.read()).group(1))) + long_desc = open('quickstart.rst', 'rt').read() + +else: + print("kll") + with open('auditok/__init__.py', 'rb') as f: + version = str(ast.literal_eval(_version_re.search( + f.read().decode('utf-8')).group(1))) + long_desc = open('quickstart.rst', 'rt').read().decode('utf-8') setup( @@ -18,7 +28,7 @@ author='Amine Sehili', author_email='amine.sehili@gmail.com', description='A module for Audio/Acoustic Activity Detection', - long_description= open('quickstart.rst').read().decode('utf-8'), + long_description= long_desc, packages=['auditok'], include_package_data=True, package_data={'auditok': ['data/*']}, @@ -49,5 +59,6 @@ 'Topic :: Multimedia :: Sound/Audio :: Analysis', 'Topic :: Scientific/Engineering :: Information Analysis' ], + entry_points = {'console_scripts': ['auditok = auditok.cmdline:main']} ) diff -r 597494f0fd9a -r c079c57dad69 tests/test_StreamTokenizer.py --- a/tests/test_StreamTokenizer.py Tue Nov 24 02:40:43 2015 +0100 +++ b/tests/test_StreamTokenizer.py Wed Nov 25 00:13:19 2015 +0100 @@ -32,8 +32,8 @@ data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") - # ^ ^ ^ ^ - # 2 16 20 27 + # ^ ^ ^ ^ + # 2 16 20 27 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens)))