Mercurial > hg > smacpy
changeset 34:31fa7d0361df
Merge pull request #6 from danstowell/uselibrosa
Modernisation: replace scikits-audio dependency with librosa, & python3 compat
author | danstowell <danstowell@users.sourceforge.net> |
---|---|
date | Wed, 15 Mar 2023 07:09:51 +0000 |
parents | 659ebfa334e2 (current diff) 469e69bdc354 (diff) |
children | f094fc50ff04 |
files | |
diffstat | 2 files changed, 14 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/README.md Thu Jul 14 18:32:44 2022 +0200 +++ b/README.md Wed Mar 15 07:09:51 2023 +0000 @@ -12,14 +12,14 @@ 1. to provide a baseline against which to test more advanced audio classifiers; 2. to provide a simple code example of a classifier which people are free to build on. -It uses the very common workflow of taking audio, converting it frame-by-frame into MFCCs, and modelling the MFCC "bag of frames" with a GMM. +It uses a workflow which was very common before the age of deep learning, and might still be useful for low-complexity audio tasks: take an audio clip as input, convert it frame-by-frame into MFCCs, and model the MFCC "bag of frames" with a GMM. Requirements ------------ -* Python 2.7 or later (it uses the 'argparse' module, not available earlier) +* Python 2.7 or later, or Python 3 * Python modules: * numpy - * [scikits.audiolab](http://pypi.python.org/pypi/scikits.audiolab) + * [librosa](http://librosa.org/) * [scikit-learn](http://scikit-learn.sourceforge.net/) It has been tested on python 2.7 (on ubuntu 11.10 and 12.04). Not yet tested on python3 but it should be fine...
--- a/smacpy.py Thu Jul 14 18:32:44 2022 +0200 +++ b/smacpy.py Wed Mar 15 07:09:51 2023 +0000 @@ -14,8 +14,7 @@ import numpy as np import argparse from glob import glob -from scikits.audiolab import Sndfile -from scikits.audiolab import Format +import librosa from sklearn.mixture import GaussianMixture as GMM from MFCC import melScaling @@ -47,7 +46,7 @@ 'wavfolder' is the base folder, to be prepended to all WAV paths. 'trainingdata' is a dictionary of wavpath:label pairs.""" - self.mfccMaker = melScaling(int(fs), framelen/2, 40) + self.mfccMaker = melScaling(int(fs), int(framelen/2), 40) self.mfccMaker.update() allfeatures = {wavpath:self.file_to_features(os.path.join(wavfolder, wavpath)) for wavpath in trainingdata} @@ -102,21 +101,19 @@ "Reads through a mono WAV file, converting each frame to the required features. Returns a 2D array." if verbose: print("Reading %s" % wavpath) if not os.path.isfile(wavpath): raise ValueError("path %s not found" % wavpath) - sf = Sndfile(wavpath, "r") - #if (sf.channels != 1) and verbose: print(" Sound file has multiple channels (%i) - channels will be mixed to mono." % sf.channels) - if sf.samplerate != fs: raise ValueError("wanted sample rate %g - got %g." % (fs, sf.samplerate)) + + audiodata, _ = librosa.load(wavpath, sr=fs, mono=True) window = np.hamming(framelen) features = [] + chunkpos = 0 while(True): try: - chunk = sf.read_frames(framelen, dtype=np.float32) + chunk = audiodata[chunkpos:chunkpos+framelen] if len(chunk) != framelen: - print("Not read sufficient samples - returning") + #print("Not read sufficient samples - assuming end of file") break - if sf.channels != 1: - chunk = np.mean(chunk, 1) # mixdown framespectrum = np.fft.fft(window * chunk) - magspec = abs(framespectrum[:framelen/2]) + magspec = abs(framespectrum[:int(framelen/2)]) # do the frequency warping and MFCC computation melSpectrum = self.mfccMaker.warpSpectrum(magspec) @@ -127,9 +124,11 @@ framefeatures = melCepstrum # todo: include deltas? 
that can be your homework. features.append(framefeatures) + + chunkpos += framelen except RuntimeError: break - sf.close() + if verbose: print(" Data shape: %s" % str(np.array(features).shape)) return np.array(features) #######################################################################