view gmm_baseline_experiments/gmm/GMM_methods.py @ 3:0147bf388eb8

Increase number of feature vectors sampled from 2000 to 20000.
author peterf
date Sun, 19 Jul 2015 22:11:19 +0100
parents cb535b80218a
children 39258b875228
line wrap: on
line source
#
# GMM_methods.py:
#    Estimate GMMs
#
# Author: Adam Stark, Peter Foster
# (c) 2014 Adam Stark
# (c) 2015 Peter Foster
#

import numpy as np
import time
from sklearn import mixture
import pdb

#=================================================================
def scale(X):
	"""Standardise each column of X to zero mean and unit standard deviation.

	Parameters
	----------
	X : array-like
		Data with observations along axis 0 (statistics are taken with
		axis=0).

	Returns
	-------
	list
		``[X_scaled, m, std]`` where ``m`` is the per-column mean and
		``std`` is the divisor that was actually applied, so callers can
		apply the same transform to other data with ``(other - m) / std``.

	Notes
	-----
	Columns with zero standard deviation (constant features) are centred
	but not rescaled: their divisor is reported as 1 instead of 0. This
	avoids 0/0 = NaN both in the returned data and in anything that is
	later scaled with the returned statistics.
	"""
	m = np.mean(X, 0)
	std = np.std(X, 0)

	# A constant column has zero spread; dividing by it would turn the
	# whole column into NaN. Use a neutral divisor of 1 for such columns.
	std = np.where(std == 0, 1.0, std)

	X = (X - m) / std

	return [X, m, std]


#=================================================================
def GMMTrainAndTest(data,numComponents, covarianceType='full', numTrainingExamplesOfEachType=20000):
	"""Fit one GMM per class label and score every test file against each.

	Parameters
	----------
	data : dict
		Must provide:
		'trainingData'    - training feature vectors, one per row (needs a
		                    ``tolist()`` method, i.e. a NumPy array);
		'trainingTargets' - label of each training row (hashable scalars,
		                    also with ``tolist()``);
		'testDataPerFile' - iterable with one feature array per test file;
		                    non-empty arrays are indexed as 2-D
		                    (frames x features).
	numComponents : int
		Number of mixture components of every per-class GMM.
	covarianceType : str
		Passed to the GMM as ``covariance_type``.
	numTrainingExamplesOfEachType : int
		Number of feature vectors drawn per class for training. Drawing is
		done with ``np.random.choice`` using its default of sampling with
		replacement, so classes with fewer rows than this are oversampled.

	Returns
	-------
	dict
		'fileScores'    - {label: [score per test file]}, in test-file order;
		'algorithm'     - the string "GMM";
		'numComponents' - the value passed in;
		'runningTime'   - seconds spent in the scoring phase only (training
		                  is not timed);
		'suffix'        - "_M=<numComponents>", for naming output files.
	"""
	# Score given to a file with no frames, and lower bound / NaN
	# replacement for every other file score.
	floorScore = -100000000

	X = data['trainingData']
	Y = data['trainingTargets']

	# Scale training data; the statistics are reused for the test data.
	[X,trainMean,trainStd] = scale(X)

	X = X.tolist()
	Y = Y.tolist()

	# Get label set
	Labels = set(Y)

	# Partition training rows according to their label
	DataByLabel = {label:[] for label in Labels}
	for x,y in zip(X, Y):
		DataByLabel[y].append(x)

	# Draw the same number of training rows for every class
	for label in Labels:
		DataByLabel[label] = np.array(DataByLabel[label])
		I = np.random.choice(DataByLabel[label].shape[0],numTrainingExamplesOfEachType)
		DataByLabel[label] = DataByLabel[label][I]

	# Train one model per class.
	# NOTE(review): mixture.GMM only exists in old scikit-learn releases
	# (superseded by GaussianMixture) - confirm the pinned version.
	GMMS = {}
	for label in Labels:
		GMMS[label] = mixture.GMM(n_components=numComponents, covariance_type=covarianceType)
		GMMS[label].fit(DataByLabel[label])

	startTime = time.time()

	allFileScores = {label:[] for label in Labels}
	for fileFeatures in data['testDataPerFile']:
		# Apply the training-set scaling to this file's features
		tmp = (fileFeatures - trainMean) / trainStd

		# The usable frames do not depend on the label, so select them
		# once per file: drop every frame that contains a NaN.
		if tmp.size > 0:
			nonNan = np.all(~np.isnan(tmp), axis=1)
			frames = tmp[nonNan,:]
		else:
			frames = None

		for label in Labels:
			if frames is None:
				score = floorScore
			else:
				# Average score across all frames; nanmax both clamps the
				# result at floorScore and replaces a NaN mean (e.g. no
				# frame survived the NaN filter) by floorScore.
				score = np.nanmax((np.mean(GMMS[label].score(frames)), floorScore))
			allFileScores[label].append(score)

	# store the running time
	runningTime = time.time()-startTime

	# create a suffix (for saving files)
	suffix = "_M=" + str(numComponents)

	result = {}
	result['fileScores'] = allFileScores
	result['algorithm'] = "GMM"
	result['numComponents'] = numComponents

	result['runningTime'] = runningTime
	result['suffix'] = suffix

	return result