changeset 34:31fa7d0361df

Merge pull request #6 from danstowell/uselibrosa Modernisation: replace scikits-audio dependency with librosa, & python3 compat
author danstowell <danstowell@users.sourceforge.net>
date Wed, 15 Mar 2023 07:09:51 +0000
parents 659ebfa334e2 (current diff) 469e69bdc354 (diff)
children f094fc50ff04
files
diffstat 2 files changed, 14 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/README.md	Thu Jul 14 18:32:44 2022 +0200
+++ b/README.md	Wed Mar 15 07:09:51 2023 +0000
@@ -12,14 +12,14 @@
 1. to provide a baseline against which to test more advanced audio classifiers;
 2. to provide a simple code example of a classifier which people are free to build on.
 
-It uses the very common workflow of taking audio, converting it frame-by-frame into MFCCs, and modelling the MFCC "bag of frames" with a GMM.
+It uses a workflow which was very common before the age of deep learning, and might still be useful for low-complexity audio tasks: take an audio clip as input, convert it frame-by-frame into MFCCs, and model the MFCC "bag of frames" with a GMM.
 
 Requirements
 ------------
-* Python 2.7 or later (it uses the 'argparse' module, not available earlier)
+* Python 2.7 or later, or Python 3
 * Python modules:
     * numpy
-    * [scikits.audiolab](http://pypi.python.org/pypi/scikits.audiolab)
+    * [librosa](http://librosa.org/)
     * [scikit-learn](http://scikit-learn.sourceforge.net/)
 
 It has been tested on python 2.7 (on ubuntu 11.10 and 12.04). Not yet tested on python3 but it should be fine...
--- a/smacpy.py	Thu Jul 14 18:32:44 2022 +0200
+++ b/smacpy.py	Wed Mar 15 07:09:51 2023 +0000
@@ -14,8 +14,7 @@
 import numpy as np
 import argparse
 from glob import glob
-from scikits.audiolab import Sndfile
-from scikits.audiolab import Format
+import librosa
 from sklearn.mixture import GaussianMixture as GMM
 
 from MFCC import melScaling
@@ -47,7 +46,7 @@
 		'wavfolder' is the base folder, to be prepended to all WAV paths.
 		'trainingdata' is a dictionary of wavpath:label pairs."""
 
-		self.mfccMaker = melScaling(int(fs), framelen/2, 40)
+		self.mfccMaker = melScaling(int(fs), int(framelen/2), 40)
 		self.mfccMaker.update()
 
 		allfeatures = {wavpath:self.file_to_features(os.path.join(wavfolder, wavpath)) for wavpath in trainingdata}
@@ -102,21 +101,19 @@
 		"Reads through a mono WAV file, converting each frame to the required features. Returns a 2D array."
 		if verbose: print("Reading %s" % wavpath)
 		if not os.path.isfile(wavpath): raise ValueError("path %s not found" % wavpath)
-		sf = Sndfile(wavpath, "r")
-		#if (sf.channels != 1) and verbose: print(" Sound file has multiple channels (%i) - channels will be mixed to mono." % sf.channels)
-		if sf.samplerate != fs:         raise ValueError("wanted sample rate %g - got %g." % (fs, sf.samplerate))
+		
+		audiodata, _ = librosa.load(wavpath, sr=fs, mono=True)
 		window = np.hamming(framelen)
 		features = []
+		chunkpos = 0
 		while(True):
 			try:
-				chunk = sf.read_frames(framelen, dtype=np.float32)
+				chunk = audiodata[chunkpos:chunkpos+framelen]
 				if len(chunk) != framelen:
-					print("Not read sufficient samples - returning")
+					#print("Not read sufficient samples - assuming end of file")
 					break
-				if sf.channels != 1:
-					chunk = np.mean(chunk, 1) # mixdown
 				framespectrum = np.fft.fft(window * chunk)
-				magspec = abs(framespectrum[:framelen/2])
+				magspec = abs(framespectrum[:int(framelen/2)])
 
 				# do the frequency warping and MFCC computation
 				melSpectrum = self.mfccMaker.warpSpectrum(magspec)
@@ -127,9 +124,11 @@
 				framefeatures = melCepstrum   # todo: include deltas? that can be your homework.
 
 				features.append(framefeatures)
+				
+				chunkpos += framelen
 			except RuntimeError:
 				break
-		sf.close()
+		if verbose: print("  Data shape: %s" % str(np.array(features).shape))
 		return np.array(features)
 
 #######################################################################