Mercurial > hg > segmentation

#!/usr/bin/env python
# encoding: utf-8
"""
SegEval.py

The main segmentation program.

Created by mi tian on 2015-04-02.
Copyright (c) 2015 __MyCompanyName__. All rights reserved.
"""

# Load starndard python libs
import sys, os, optparse, csv
from itertools import combinations
from os.path import join, isdir, isfile, abspath, dirname, basename, split, splitext
from copy import copy

import matplotlib
# matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import scipy as sp
from scipy.signal import correlate2d, convolve2d, filtfilt, resample
from scipy.ndimage.filters import *
from sklearn.decomposition import PCA
from sklearn.mixture import GMM
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import pairwise_distances

# Load dependencies
from utils.SegUtil import getMean, getStd, getDelta, getSSM, reduceSSM, upSample, normaliseFeature
from utils.PeakPickerUtil import PeakPicker
from utils.gmmdist import *
from utils.GmmMetrics import GmmDistance
from utils.RankClustering import rClustering
from utils.kmeans import Kmeans
from utils.PathTracker import PathTracker

# Load bourdary retrieval utilities
import cnmf as cnmf_S
import foote as foote_S
import sf as sf_S
import fmc2d as fmc2d_S
import novelty as novelty_S

# Algorithm params
h = 8               # Size of median filter for features in C-NMF
R = 15              # Size of the median filter for the activation matrix C-NMF
rank = 4            # Rank of decomposition for the boundaries
rank_labels = 6     # Rank of decomposition for the labels
R_labels = 6        # Size of the median filter for the labels
# Foote
M = 2           # Median filter for the audio features (in beats)
Mg = 32         # Gaussian kernel size
L = 16          # Size of the median filter for the adaptive threshold
# 2D-FMC
N = 8          # Size of the fixed length segments (for 2D-FMC)


# Define arg parser
def parse_args():
	op = optparse.OptionParser()
	# IO options
	op.add_option('-g', '--gammatonegram-features', action="store", dest="GF", default='/Volumes/c4dm-03/people/mit/features/gammatonegram/qupujicheng/2048', type="str", help="Loading gammatone features from.." )
	op.add_option('-s', '--spectrogram-features', action="store", dest="SF", default='/Volumes/c4dm-03/people/mit/features/spectrogram/qupujicheng/2048', type="str", help="Loading spectral features from.." )
	op.add_option('-t', '--tempogram-features', action="store", dest="TF", default='/Volumes/c4dm-03/people/mit/features/tempogram/qupujicheng/tempo_features_6s', type="str", help="Loading tempogram features from.." )
	op.add_option('-f', '--featureset', action="store", dest="FEATURES", default='[0, 1, 2, 3]', type="str", help="Choose feature subsets (input a list of integers) used for segmentation -- gammtone, chroma, timbre, tempo -- 0, 1, 2, 3." )
	op.add_option('-a', '--annotations', action="store", dest="GT", default='/Volumes/c4dm-03/people/mit/annotation/qupujicheng/lowercase', type="str", help="Loading annotation files from.. ")
	op.add_option('-o', '--ouput', action="store", dest="OUTPUT", default='/Volumes/c4dm-03/people/mit/segmentation/gammatone/qupujicheng', type="str", help="Write segmentation results to ")

	# boundary retrieval options
	op.add_option('-b', '--bounrary-method', action="store", dest="BOUNDARY", type='choice', choices=['novelty', 'cnmf', 'foote', 'sf'], default='novelty', help="Choose boundary retrieval algorithm ('novelty', 'cnmf', 'sf', 'fmc2d')." )
	op.add_option('-l', '--labeling-method', action="store", dest="LABEL", type='choice', choices=['cnmf', 'fmc2d'], default='cnmf', help="Choose boundary labeling algorithm ('cnmf', 'fmc2d')." )

	# Plot/print/mode options
	op.add_option('-p', '--plot', action="store_true", dest="PLOT", default=False, help="Save plots")
	op.add_option('-e', '--test-mode', action="store_true", dest="TEST", default=False, help="Test mode")
	op.add_option('-v', '--verbose-mode', action="store_true", dest="VERBOSE", default=False, help="Print results in verbose mode.")

	return op.parse_args()
options, args = parse_args()

class FeatureObj() :
	__slots__ = ['key', 'audio', 'timestamps', 'gammatone_features', 'tempo_features', 'timbre_features', 'harmonic_features', 'gammatone_ssm', 'tempo_ssm', 'timbre_features', 'harmonic_ssm', 'ssm_timestamps']

class AudioObj():
	__slots__ = ['name', 'feature_list', 'gt', 'label', 'gammatone_features', 'tempo_features', 'timbre_features', 'harmonic_features', 'combined_features',\
	'gammatone_ssm', 'tempo_ssm', 'timbre_ssm', 'harmonic_ssm', 'combined_ssm', 'ssm', 'ssm_timestamps', 'tempo_timestamps']

class EvalObj():
	__slots__ = ['TP', 'FP', 'FN', 'P', 'R', 'F', 'AD', 'DA']


class SSMseg(object):
	'''The main segmentation object'''
	def __init__(self):
		self.SampleRate = 44100
		self.NqHz = self.SampleRate/2
		self.timestamp = []
		self.previousSample = 0.0
		self.featureWindow = 6.0
		self.featureStep = 3.0
		self.kernel_size = 64 # Adjust this param according to the feature resolution.pq
		self.blockSize = 2048
		self.stepSize = 1024

		'''NOTE: Match the following params with those used for feature extraction!'''

		'''NOTE: Unlike spectrogram ones, Gammatone features are extracted without taking an FFT. The windowing is done under the purpose of chunking
		the audio to facilitate the gammatone filtering with the specified blockSize and stepSize. The resulting gammatonegram is aggregated every
		gammatoneLen without overlap.'''
		self.gammatoneLen = 2048
		self.gammatoneBandGroups = [0, 2, 6, 10, 13, 17, 20]
		self.nGammatoneBands = 20
		self.lowFreq = 100
		self.highFreq = self.SampleRate / 4

		'''Settings for extracting tempogram features.'''
		self.tempoWindow = 6.0
		self.bpmBands = [30, 45, 60, 80, 100, 120, 180, 240, 400, 600]

		'''Peak picking settings'''
		self.threshold = 50
		self.confidence_threshold = 0.5
		self.delta_threshold = 0.0
		self.backtracking_threshold = 1.9
		self.polyfitting_on = True
		self.medfilter_on = True
		self.LPfilter_on = True
		self.whitening_on = False
		self.aCoeffs = [1.0000, -0.5949, 0.2348]
		self.bCoeffs = [0.1600,	 0.3200, 0.1600]
		self.cutoff = 0.34
		self.medianWin = 7


	def pairwiseF(self, annotation, detection, tolerance=3.0, combine=1.0):
		'''Pairwise F measure evaluation of detection rates.'''

		# print 'detection', detection
		detection = np.append(detection, annotation[-1])
		res = EvalObj()
		res.TP = 0	# Total number of matched ground truth and experimental data points
		gt = len(annotation)	# Total number of ground truth data points
		dt = len(detection) # Total number of experimental data points
		foundIdx = []
		D_AD = np.zeros(gt)
		D_DA = np.zeros(dt)

		for dtIdx in xrange(dt):
			D_DA[dtIdx] = np.min(abs(detection[dtIdx] - annotation))
		for gtIdx in xrange(gt):
			D_AD[gtIdx] = np.min(abs(annotation[gtIdx] - detection))
			for dtIdx in xrange(dt):
				if (annotation[gtIdx] >= detection[dtIdx] - tolerance/2.0) and (annotation[gtIdx] <= detection[dtIdx] + tolerance/2.0):
					res.TP = res.TP + 1.0
					foundIdx.append(gtIdx)
		foundIdx = list(set(foundIdx))
		res.TP = len(foundIdx)
		res.FP = max(0, dt - res.TP)
		res.FN = max(0, gt - res.TP)

		res.AD = np.mean(D_AD)
		res.DA = np.mean(D_DA)

		res.P, res.R, res.F = 0.0, 0.0, 0.0

		if res.TP == 0:
			return res

		res.P = res.TP / float(dt)
		res.R = res.TP / float(gt)
		res.F = 2 * res.P * res.R / (res.P + res.R)
		return res


	def process(self):
		'''For the aggregated input features, discard a propertion each time as the pairwise distances within the feature space descending.
		In the meanwhile evaluate the segmentation result and track the trend of perfomance changing by measuring the feature selection
		threshold - segmentation f measure curve.
		'''

		peak_picker = PeakPicker()
		peak_picker.params.alpha = 9.0 # Alpha norm
		peak_picker.params.delta = self.delta_threshold # Adaptive thresholding delta
		peak_picker.params.QuadThresh_a = (100 - self.threshold) / 1000.0
		peak_picker.params.QuadThresh_b = 0.0
		peak_picker.params.QuadThresh_c = (100 - self.threshold) / 1500.0
		peak_picker.params.rawSensitivity = 20
		peak_picker.params.aCoeffs = self.aCoeffs
		peak_picker.params.bCoeffs = self.bCoeffs
		peak_picker.params.preWin = self.medianWin
		peak_picker.params.postWin = self.medianWin + 1
		peak_picker.params.LP_on = self.LPfilter_on
		peak_picker.params.Medfilt_on = self.medfilter_on
		peak_picker.params.Polyfit_on = self.polyfitting_on
		peak_picker.params.isMedianPositive = False

		# Settings used for feature extraction
		feature_window_frame = int(self.SampleRate / self.gammatoneLen * self.featureWindow)
		feature_step_frame = int(0.5 * self.SampleRate / self.gammatoneLen * self.featureStep)
		aggregation_window, aggregation_step = 100, 50
		featureRate = float(self.SampleRate) / self.stepSize

		audio_files = [x for x in os.listdir(options.GT) if not x.startswith(".") ]
		# audio_files = audio_files[:2]
		audio_files.sort()
		audio_list = []

		gammatone_feature_list = [i for i in os.listdir(options.GF) if not i.startswith('.')]
		gammatone_feature_list = ['contrast4', 'rolloff', 'dct']
		tempo_feature_list = [i for i in os.listdir(options.TF) if not i.startswith('.')]
		tempo_feature_list = ['intensity_bpm', 'loudness_bpm']
		timbre_feature_list = ['mfcc']
		harmonic_feature_list = ['nnls']

		gammatone_feature_list = [join(options.GF, f) for f in gammatone_feature_list]
		timbre_feature_list = [join(options.SF, f) for f in timbre_feature_list]
		tempo_feature_list = [join(options.TF, f) for f in tempo_feature_list]
		harmonic_feature_list = [join(options.SF, f) for f in harmonic_feature_list]

		fobj_list = []

		# For each audio file, load specific features
		for audio in audio_files:
			ao = AudioObj()
			ao.name = splitext(audio)[0]
			print ao.name
			# annotation_file = join(options.GT, ao.name+'.txt') # iso, salami
			# ao.gt = np.genfromtxt(annotation_file, usecols=0)
			# ao.label = np.genfromtxt(annotation_file, usecols=1, dtype=str)
			annotation_file = join(options.GT, ao.name+'.csv') # qupujicheng
			ao.gt = np.genfromtxt(annotation_file, usecols=0, delimiter=',')
			ao.label = np.genfromtxt(annotation_file, usecols=1, delimiter=',', dtype=str)

			gammatone_featureset, timbre_featureset, tempo_featureset, harmonic_featureset = [], [], [], []
			for feature in gammatone_feature_list:
				for f in os.listdir(feature):
					if f[:f.find('_vamp')]==ao.name:
						gammatone_featureset.append(np.genfromtxt(join(feature, f), delimiter=',',filling_values=0.0)[:,1:])
						break
			if len(gammatone_feature_list) > 1:
				n_frame = np.min([x.shape[0] for x in gammatone_featureset])
				gammatone_featureset = [x[:n_frame,:] for x in gammatone_featureset]
				ao.gammatone_features = np.hstack((gammatone_featureset))
			else:
				ao.gammatone_features = gammatone_featureset[0]

			for feature in timbre_feature_list:
				for f in os.listdir(feature):
					if f[:f.find('_vamp')]==ao.name:
						timbre_featureset.append(np.genfromtxt(join(feature, f), delimiter=',',filling_values=0.0)[:,1:])
						break
			if len(timbre_feature_list) > 1:
				n_frame = np.min([x.shape[0] for x in timbre_featureset])
				timbre_featureset = [x[:n_frame,:] for x in timbre_featureset]
				ao.timbre_features = np.hstack((timbre_featureset))
			else:
				ao.timbre_features = timbre_featureset[0]
			for feature in tempo_feature_list:
				for f in os.listdir(feature):
					if f[:f.find('_vamp')]==ao.name:
						tempo_featureset.append(np.genfromtxt(join(feature, f), delimiter=',',filling_values=0.0)[1:,1:])
						ao.tempo_timestamps = np.genfromtxt(join(feature, f), delimiter=',',filling_values=0.0)[1:,0]
						break
			if len(tempo_feature_list) > 1:
				n_frame = np.min([x.shape[0] for x in tempo_featureset])
				tempo_featureset = [x[:n_frame,:] for x in tempo_featureset]
				ao.tempo_features = np.hstack((tempo_featureset))
			else:
				ao.tempo_features = tempo_featureset[0]
			for feature in harmonic_feature_list:
				for f in os.listdir(feature):
					if f[:f.find('_vamp')]==ao.name:
						harmonic_featureset.append(np.genfromtxt(join(feature, f), delimiter=',',filling_values=0.0)[:,1:])
						break
			if len(harmonic_feature_list) > 1:
				n_frame = np.min([x.shape[0] for x in harmonic_featureset])
				harmonic_featureset = [x[:n_frame,:] for x in harmonic_featureset]
				ao.harmonic_features = np.hstack((harmonic_featureset))
			else:
				ao.harmonic_features = harmonic_featureset[0]

			# Get aggregated features for computing ssm
			aggregation_window, aggregation_step = 1,1
			featureRate = float(self.SampleRate) /self.stepSize
			pca = PCA(n_components=5)

			# Resample and normalise features
			ao.gammatone_features = resample(ao.gammatone_features, step)
			ao.gammatone_features = normaliseFeature(ao.gammatone_features)
			ao.timbre_features = resample(ao.timbre_features, step)
			ao.timbre_features = normaliseFeature(ao.timbre_features)
			ao.harmonic_features = resample(ao.harmonic_features, step)
			ao.harmonic_features = normaliseFeature(ao.harmonic_features)
			ao.tempo_features = normaliseFeature(ao.harmonic_features)

			pca.fit(ao.gammatone_features)
			ao.gammatone_features = pca.transform(ao.gammatone_features)
			ao.gammatone_ssm = getSSM(ao.gammatone_features)

			pca.fit(ao.tempo_features)
			ao.tempo_features = pca.transform(ao.tempo_features)
			ao.tempo_ssm = getSSM(ao.tempo_features)

			pca.fit(ao.timbre_features)
			ao.timbre_features = pca.transform(ao.timbre_features)
			ao.timbre_ssm = getSSM(ao.timbre_features)

			pca.fit(ao.harmonic_features)
			ao.harmonic_features = pca.transform(ao.harmonic_features)
			ao.harmonic_ssm = getSSM(ao.harmonic_features)

			ao.ssm_timestamps = np.array(map(lambda x: ao.tempo_timestamps[aggregation_step*x], np.arange(0, ao.gammatone_ssm.shape[0])))

			audio_list.append(ao)

		# Segment input audio using specified boundary retrieval method.
		print 'Segmenting using %s method' %options.BOUNDARY
		for i,ao in enumerate(audio_list):
			print 'processing: %s' %ao.name

			# Experiment 1: segmentation using individual features.
			if options.BOUNDARY == 'novelty':
				# Peak picking from the novelty curve
				gammatone_novelty, smoothed_gammatone_novelty, gammatone_bound_idxs = novelty_S.process(ao.gammatone_ssm, self.kernel_size, peak_picker)
				timbre_novelty, smoothed_timbre_novelty, timbre_bound_idxs = novelty_S.process(ao.timbre_ssm, self.kernel_size, peak_picker)
				tempo_novelty, smoothed_harmonic_novelty, tempo_bound_idxs = novelty_S.process(ao.tempo_ssm, self.kernel_size, peak_picker)
				harmonic_novelty, smoothed_tempo_novelty, harmonic_bound_idxs = novelty_S.process(ao.harmonic_ssm, self.kernel_size, peak_picker)

			if options.BOUNDARY == 'cnmf':
				gammatone_bound_idxs = cnmf_S.segmentation(ao.gammatone_features, rank=rank, R=R, h=8, niter=300)
				timbre_bound_idxs = cnmf_S.segmentation(ao.timbre_features, rank=rank, R=R, h=h, niter=300)
				tempo_bound_idxs = cnmf_S.segmentation(ao.tempo_features, rank=rank, R=R, h=h, niter=300)
				harmonic_bound_idxs = cnmf_S.segmentation(ao.harmonic_features, rank=rank, R=R, h=h, niter=300)

			if options.BOUNDARY == 'foote':
				gammatone_bound_idxs = foote_S.segmentation(ao.gammatone_features, M=M, Mg=Mg, L=L)
				timbre_bound_idxs = foote_S.segmentation(ao.timbre_features, M=M, Mg=Mg, L=L)
				tempo_bound_idxs = foote_S.segmentation(ao.tempo_features, M=M, Mg=Mg, L=L)
				harmonic_bound_idxs = foote_S.segmentation(ao.harmonic_features, M=M, Mg=Mg, L=L)

			if options.BOUNDARY == 'sf':
				gammatone_bound_idxs = sf_S.segmentation(ao.gammatone_features)
				timbre_bound_idxs = sf_S.segmentation(ao.timbre_features)
				tempo_bound_idxs = sf_S.segmentation(ao.tempo_features)
				harmonic_bound_idxs = sf_S.segmentation(ao.harmonic_features)

			if options.LABEL == 'fmc2d':
				gammatone_bound_labels = fmc2d_S.compute_similarity(gammatone_bound_idxs, xmeans=True, N=N)
				timbre_bound_labels = fmc2d_S.compute_similarity(timbre_bound_idxs, xmeans=True, N=N)
				tempo_bound_labels = fmc2d_S.compute_similarity(tempo_bound_idxs, xmeans=True, N=N)
				harmonic_bound_labels = fmc2d_S.compute_similarity(harmonic_bound_idxs, xmeans=True, N=N)

			if options.LABEL == 'cnmf':
				gammatone_bound_labels = cnmf_S.compute_labels(gammatone_bound_idxs, est_bound_idxs, nFrames)
				timbre_bound_labels = cnmf_S.compute_labels(timbre_bound_idxs, est_bound_idxs, nFrames)
				tempo_bound_labels = cnmf_S.compute_labels(tempo_bound_idxs, est_bound_idxs, nFrames)
				harmonic_bound_labels = cnmf_S.compute_labels(harmonic_bound_idxs, est_bound_idxs, nFrames)

			gammatone_detection = [0.0] + [ao.ssm_timestamps[int(np.rint(i))] for i in gammatone_novelty_peaks]
			timbre_detection = [0.0] + [ao.ssm_timestamps[int(np.rint(i))] for i in timbre_novelty_peaks]
			harmonic_detection = [0.0] + [ao.ssm_timestamps[int(np.rint(i))] for i in harmonic_novelty_peaks]
			tempo_detection = [0.0] + [ao.ssm_timestamps[int(np.rint(i))] for i in tempo_novelty_peaks]

			# Experiment 2: Trying combined features using the best boundary retrieval method
			ao_featureset = [ao.gammatone_features, ao.harmonic_features, ao.timbre_features, ao.tempo_features]
			feature_sel = [int(x) for x in options.FEATURES if x.isdigit()]
			fused_featureset = [ao_featureset[i] for i in feature_sel]


def main():

	segmenter = SSMseg()
	segmenter.process()


if __name__ == '__main__':
	main()
author	mi tian
date	Fri, 03 Apr 2015 15:44:32 +0100
parents	c11ea9e0357f
children	bac230fcd7bd