changeset 255:9edaa3ce62e8

* Make MFCC able to accept already-FFT'd input, and simplify API a bit * Add log power value to MFCC, restore windowing, and avoid some heap allocs * In HMM, bail out of iteration if loglik hits NaN
author Chris Cannam <c.cannam@qmul.ac.uk>
date Fri, 18 Jan 2008 13:24:12 +0000
parents 52c1a295d775
children 43943a4382ef
files dsp/mfcc/MFCC.cpp dsp/mfcc/MFCC.h dsp/segmentation/ClusterMeltSegmenter.cpp dsp/transforms/FFT.cpp dsp/transforms/FFT.h hmm/hmm.c
diffstat 6 files changed, 104 insertions(+), 73 deletions(-) [+]
line wrap: on
line diff
--- a/dsp/mfcc/MFCC.cpp	Wed Jan 16 18:02:31 2008 +0000
+++ b/dsp/mfcc/MFCC.cpp	Fri Jan 18 13:24:12 2008 +0000
@@ -32,6 +32,7 @@
     fftSize           = config.fftsize;
 
     totalFilters      = linearFilters + logFilters;
+    logPower          = config.logpower;
   
     samplingRate      = config.FS;
   
@@ -50,12 +51,12 @@
  
     /* Allocate space for local vectors */
     mfccDCTMatrix     = (double**)calloc(nceps+1, sizeof(double*));
-    for (i=0;i<nceps+1; i++) {
+    for (i = 0; i < nceps+1; i++) {
         mfccDCTMatrix[i]= (double*)calloc(totalFilters, sizeof(double)); 
     }
 
     mfccFilterWeights = (double**)calloc(totalFilters, sizeof(double*));
-    for (i=0;i<totalFilters; i++) {
+    for (i = 0; i < totalFilters; i++) {
         mfccFilterWeights[i] = (double*)calloc(fftSize, sizeof(double)); 
     }
     
@@ -68,11 +69,11 @@
     triangleHeight = (double*)calloc(totalFilters,sizeof(double));
     fftFreqs       = (double*)calloc(fftSize,sizeof(double));
   
-    for (i=0;i<linearFilters;i++) {
+    for (i = 0; i < linearFilters; i++) {
         freqs[i] = lowestFrequency + ((double)i) * linearSpacing;
     }
   
-    for (i=linearFilters; i<totalFilters+2; i++) {
+    for (i = linearFilters; i < totalFilters+2; i++) {
         freqs[i] = freqs[linearFilters-1] * 
             pow(logSpacing, (double)(i-linearFilters+1));
     }
@@ -109,7 +110,8 @@
 
             if ((fftFreqs[j]>center[i]) && (fftFreqs[j]<upper[i])) {
 
-                mfccFilterWeights[i][j] = mfccFilterWeights[i][j] + triangleHeight[i] * (upper[i]-fftFreqs[j]) 
+                mfccFilterWeights[i][j] = mfccFilterWeights[i][j]
+                    + triangleHeight[i] * (upper[i]-fftFreqs[j]) 
                     / (upper[i]-center[i]);
             }
             else
@@ -127,24 +129,27 @@
 
     const double pi = 3.14159265358979323846264338327950288;
   
-    for (i=0; i<nceps+1; i++) {
-        for (j=0; j<totalFilters; j++) {
+    for (i = 0; i < nceps+1; i++) {
+        for (j = 0; j < totalFilters; j++) {
             mfccDCTMatrix[i][j] = (1./sqrt((double) totalFilters / 2.))  
                 * cos((double) i * ((double) j + 0.5) / (double) totalFilters * pi);
         }
     }
 
-    for (j=0;j<totalFilters;j++){
-        mfccDCTMatrix[0][j]     =  (sqrt(2.)/2.) * mfccDCTMatrix[0][j];
+    for (j = 0; j < totalFilters; j++){
+        mfccDCTMatrix[0][j] = (sqrt(2.)/2.) * mfccDCTMatrix[0][j];
     }
    
     /* The analysis window */
     window = new Window<double>(HammingWindow, fftSize);
 
     /* Allocate memory for the FFT */
-    imagIn      = (double*)calloc(fftSize,sizeof(double));
-    realOut     = (double*)calloc(fftSize,sizeof(double));
-    imagOut     = (double*)calloc(fftSize,sizeof(double));
+    imagIn      = (double*)calloc(fftSize, sizeof(double));
+    realOut     = (double*)calloc(fftSize, sizeof(double));
+    imagOut     = (double*)calloc(fftSize, sizeof(double));
+
+    earMag      = (double*)calloc(totalFilters, sizeof(double));
+    fftMag      = (double*)calloc(fftSize/2, sizeof(double));
   
     free(freqs);
     free(lower);
@@ -159,12 +164,12 @@
     int i;
   
     /* Free the structure */
-    for (i=0;i<nceps+1;i++) {
+    for (i = 0; i < nceps+1; i++) {
         free(mfccDCTMatrix[i]);
     }
     free(mfccDCTMatrix);
     
-    for (i=0;i<totalFilters; i++) {
+    for (i = 0; i < totalFilters; i++) {
         free(mfccFilterWeights[i]);
     }
     free(mfccFilterWeights);
@@ -174,6 +179,9 @@
     
     /* The analysis window */
     delete window;
+
+    free(earMag);
+    free(fftMag);
     
     /* Free the FFT */
     free(imagIn);
@@ -185,48 +193,47 @@
 /*
  * 
  * Extract the MFCC on the input frame 
- *
- * looks like we have to have length = fftSize ??????
  * 
  */ 
-int MFCC::process(int length, double *inframe, double *outceps)
+int MFCC::process(const double *inframe, double *outceps)
 {
-    int i,j;
-  
-    double *fftMag;
-    double *earMag;
+    double *inputData = (double *)malloc(fftSize * sizeof(double));
+    for (int i = 0; i < fftSize; ++i) inputData[i] = inframe[i];
 
-    double *inputData;
-  
-    double tmp;
-
-    earMag    = (double*)calloc(totalFilters, sizeof(double));
-    inputData = (double*)calloc(fftSize, sizeof(double)); 
-    fftMag    = (double*)calloc(fftSize, sizeof(double)); 
-    
-    /* Zero-pad if needed */
-    memcpy(inputData, inframe, length*sizeof(double));
-
-//!!!    window->cut(inputData);
+    window->cut(inputData);
   
     /* Calculate the fft on the input frame */
     FFT::process(fftSize, 0, inputData, imagIn, realOut, imagOut);
 
-    for (i = 0; i < fftSize; ++i) {
-        fftMag[i] = sqrt(realOut[i] * realOut[i] +
-                         imagOut[i] * imagOut[i]);
+    free(inputData);
+
+    return process(realOut, imagOut, outceps);
+}
+
+int MFCC::process(const double *real, const double *imag, double *outceps)
+{
+    int i, j;
+
+    for (i = 0; i < fftSize/2; ++i) {
+        fftMag[i] = sqrt(real[i] * real[i] + imag[i] * imag[i]);
+    }
+
+    for (i = 0; i < totalFilters; ++i) {
+        earMag[i] = 0.0;
     }
 
     /* Multiply by mfccFilterWeights */
-    for (i=0;i<totalFilters;i++) {
-        tmp = 0.;
-        for(j=0;j<fftSize/2; j++) {
-            tmp = tmp + (mfccFilterWeights[i][j]*fftMag[j]);
+    for (i = 0; i < totalFilters; i++) {
+        double tmp = 0.0;
+        for (j = 0; j < fftSize/2; j++) {
+            tmp = tmp + (mfccFilterWeights[i][j] * fftMag[j]);
         }
-        if (tmp>0)
-            earMag[i] = log10(tmp);
-	else
-            earMag[i] = 0.0;
+        if (tmp > 0) earMag[i] = log10(tmp);
+	else earMag[i] = 0.0;
+
+        if (logPower != 1.0) {
+            earMag[i] = pow(earMag[i], logPower);
+        }
     }
 
     /*
@@ -236,31 +243,27 @@
      *
      */
   
-    if (WANT_C0==1) {
+    if (WANT_C0 == 1) {
      
-        for (i=0;i<nceps+1;i++) {
-            tmp = 0.;
-            for (j=0;j<totalFilters;j++){
-                tmp = tmp + mfccDCTMatrix[i][j]*earMag[j];
+        for (i = 0; i < nceps+1; i++) {
+            double tmp = 0.;
+            for (j = 0; j < totalFilters; j++){
+                tmp = tmp + mfccDCTMatrix[i][j] * earMag[j];
             }
             outceps[i] = tmp;
         }
     }
     else 
     {  
-        for (i=1;i<nceps+1;i++) {
-            tmp = 0.;
-            for (j=0;j<totalFilters;j++){
-                tmp = tmp + mfccDCTMatrix[i][j]*earMag[j];
+        for (i = 1; i < nceps+1; i++) {
+            double tmp = 0.;
+            for (j = 0; j < totalFilters; j++){
+                tmp = tmp + mfccDCTMatrix[i][j] * earMag[j];
             }
             outceps[i-1] = tmp;
         }
     }
     
-    free(fftMag);
-    free(earMag);
-    free(inputData);
-
     return nceps;
 }
 
--- a/dsp/mfcc/MFCC.h	Wed Jan 16 18:02:31 2008 +0000
+++ b/dsp/mfcc/MFCC.h	Fri Jan 18 13:24:12 2008 +0000
@@ -17,7 +17,10 @@
     int FS;
     int fftsize;
     int nceps;
+    double logpower;
     bool want_c0;
+    MFCCConfig(int _FS) :
+        FS(_FS), fftsize(2048), nceps(19), logpower(1.0), want_c0(true) { }
 };
 
 class MFCC
@@ -26,7 +29,20 @@
     MFCC(MFCCConfig config);
     virtual ~MFCC();
 
-    int process(int length, double *inframe, double *outceps);
+    /**
+     * Process time-domain input data.  inframe must contain
+     * getfftlength() samples.  outceps must contain space for nceps
+     * values, plus one if want_c0 is specified.
+     */
+    int process(const double *inframe, double *outceps);
+
+    /**
+     * Process time-domain input data.  real and imag must contain
+     * getfftlength()/2+1 elements (i.e. the conjugate half of the FFT
+     * is not expected).  outceps must contain space for nceps values,
+     * plus one if want_c0 is specified.
+     */
+    int process(const double *real, const double *imag, double *outceps);
 
     int getfftlength() const { return fftSize; }
 
@@ -42,6 +58,7 @@
     int     fftSize;
     
     int     totalFilters;
+    double  logPower;
     
     /* Misc. */
     int     samplingRate;
@@ -57,10 +74,12 @@
     Window<double> *window;
     
     /* For the FFT */
-    double* imagIn;		// always zero
-    double* realOut;
-    double* imagOut;
-    
+    double *imagIn;		// always zero
+    double *realOut;
+    double *imagOut;
+    double *fftMag;
+    double *earMag;
+
     /* Set if user want C0 */
     int WANT_C0;
 };
--- a/dsp/segmentation/ClusterMeltSegmenter.cpp	Wed Jan 16 18:02:31 2008 +0000
+++ b/dsp/segmentation/ClusterMeltSegmenter.cpp	Fri Jan 18 13:24:12 2008 +0000
@@ -92,8 +92,7 @@
             decimator = new Decimator(getWindowsize(), decimationFactor);
         }
 
-        MFCCConfig config;
-        config.FS = samplerate / decimationFactor;
+        MFCCConfig config(samplerate / decimationFactor);
         config.fftsize = 2048;
         config.nceps = 19;
         config.want_c0 = true;
@@ -278,7 +277,7 @@
             }
         }
 
-        mfcc->process(fftsize, frame, ccout);
+        mfcc->process(frame, ccout);
 	
         for (int i = 0; i < ncoeff; ++i) {
             cc[i] += ccout[i];
--- a/dsp/transforms/FFT.cpp	Wed Jan 16 18:02:31 2008 +0000
+++ b/dsp/transforms/FFT.cpp	Fri Jan 18 13:24:12 2008 +0000
@@ -24,7 +24,9 @@
 
 }
 
-void FFT::process(unsigned int p_nSamples, bool p_bInverseTransform, double *p_lpRealIn, double *p_lpImagIn, double *p_lpRealOut, double *p_lpImagOut)
+void FFT::process(unsigned int p_nSamples, bool p_bInverseTransform,
+                  const double *p_lpRealIn, const double *p_lpImagIn,
+                  double *p_lpRealOut, double *p_lpImagOut)
 {
 
     if(!p_lpRealIn || !p_lpRealOut || !p_lpImagOut) return;
--- a/dsp/transforms/FFT.h	Wed Jan 16 18:02:31 2008 +0000
+++ b/dsp/transforms/FFT.h	Fri Jan 18 13:24:12 2008 +0000
@@ -13,16 +13,16 @@
 class FFT  
 {
 public:
-    static void process(unsigned int p_nSamples, bool p_bInverseTransform,
-                        double *p_lpRealIn, double *p_lpImagIn,
-                        double *p_lpRealOut, double *p_lpImagOut);
+    static void process(unsigned int nSamples, bool bInverseTransform,
+                        const double *lpRealIn, const double *lpImagIn,
+                        double *lpRealOut, double *lpImagOut);
     FFT();
     virtual ~FFT();
 
 protected:
-    static unsigned int reverseBits(unsigned int p_nIndex, unsigned int p_nBits);
-    static unsigned int numberOfBitsNeeded( unsigned int p_nSamples );
-    static bool isPowerOfTwo( unsigned int p_nX );
+    static unsigned int reverseBits(unsigned int nIndex, unsigned int nBits);
+    static unsigned int numberOfBitsNeeded( unsigned int nSamples );
+    static bool isPowerOfTwo( unsigned int nX );
 };
 
 #endif
--- a/hmm/hmm.c	Wed Jan 16 18:02:31 2008 +0000
+++ b/hmm/hmm.c	Fri Jan 18 13:24:12 2008 +0000
@@ -159,7 +159,9 @@
 	int niter = 50;	
 	int iter = 0;
 	double loglik1, loglik2;
-	while(iter < niter && !(iter > 1 && (loglik - loglik1) < thresh * (loglik1 - loglik2)))	
+	int foundnan = 0;
+
+	while (iter < niter && !foundnan && !(iter > 1 && (loglik - loglik1) < thresh * (loglik1 - loglik2)))	
 	{
 		++iter;
 		
@@ -199,6 +201,11 @@
 		fprintf(stderr, "iteration %d: loglik = %f\n", iter, loglik);		
 		fprintf(stderr, "re-estimation...\n");
 		fflush(stderr);
+
+		if (isnan(loglik)) {
+		    foundnan = 1;
+		    continue;
+		}
 		
 		baum_welch(p0, a, mu, cov, N, T, L, x, xi, gamma);
 			
@@ -278,6 +285,7 @@
 		for (j = 0; j < N; j++)
 		{
 			a[i][j] = 0;
+			if (sum_gamma[i] == 0.) continue;
 			for (t = 0; t < T-1; t++)
 				a[i][j] += xi[t][i][j];
 			//s += a[i][j];