changeset 16:8b814fe5781d

added segmentation method applying nmf to ssm
author mitian
date Wed, 17 Jun 2015 18:02:33 +0100
parents 289a4b2b2b16
children c01fcb752221
files snmf.py utils/SegUtil.py utils/plotSSM.py
diffstat 3 files changed, 176 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snmf.py	Wed Jun 17 18:02:33 2015 +0100
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+snmf.py
+-- A modified version of C-NMF by Nieto. The input is no longer a feature matrix but an SSM.
+
+C-NMF method for segmentation, modified from here:
+
+Nieto, O., Jehan, T., Convex Non-negative Matrix Factorization For Automatic
+Music Structure Identification. Proc. of the 38th IEEE International Conference
+on Acoustics, Speech, and Signal Processing (ICASSP). Vancouver, Canada, 2013.
+
+"""
+
+import sys
+import os
+import pymf
+from utils import SegUtil
+from utils.SegUtil import median_filter
+import numpy as np
+
+# Algorithm params (module-level values; none of the functions defined in snmf.py read them, each takes its own arguments)
+h = 8				# Size of median filter for SSMs in C-NMF
+R = 16				# Size of the median filter for the activation matrix C-NMF
+rank = 3			# Rank of decomposition for the boundaries
+rank_labels = 16	 # Rank of decomposition for the labels
+R_labels = 4		# Size of the median filter for the labels
+
+
+def cnmf(S, rank, niter=500):
+	"""Factorizes S with pymf's Convex Non-Negative Matrix Factorization (CNMF) model.
+
+	Parameters
+	----------
+	S: np.array(N, N)
+	   SSM.
+	rank: int
+		Rank of decomposition (passed to pymf.CNMF as num_bases)
+	niter: int
+		Number of iterations passed to the model's factorize() call
+
+	Returns
+	-------
+	F: np.array
+		Cluster matrix (the fitted model's W, converted with np.asarray)
+	G: np.array
+		Activation matrix (the fitted model's H, converted with np.asarray)
+		(s.t. S ~= F * G)
+	"""
+	nmf_mdl = pymf.CNMF(S, num_bases=rank)
+	nmf_mdl.factorize(niter=niter)
+	F = np.asarray(nmf_mdl.W)
+	G = np.asarray(nmf_mdl.H)
+	return F, G
+
+def nmf(S, rank, nither=500):  # NOTE(review): 'nither' looks like a typo of 'niter'; segmentation() calls nmf(..., niter=...)
+	nmf_mdl = pymf.NMF(S, num_bases=rank, niter=nither)  # iteration count goes to the constructor here, whereas cnmf() passes it to factorize()
+	nmf_mdl.factorize()
+	F = np.asarray(nmf_mdl.W)  # first factor, returned in the same position as cnmf()'s F
+	G = np.asarray(nmf_mdl.H)  # second factor, returned in the same position as cnmf()'s G
+	return F, G
+	
+	
+def most_frequent(x):
+	"""Returns the most frequent value in x (np.bincount needs non-negative ints; ties go to the smallest value)."""
+	return np.argmax(np.bincount(x))
+
+
+def compute_labels(X, rank, R, bound_idxs, niter=300):
+	"""Computes one label per pair of consecutive bounds from a C-NMF of X.T; returns [1] if the factorization raises."""
+
+	X = X.T
+	try:
+		F, G = cnmf(X, rank, niter=niter)
+	except:  # NOTE(review): bare except also hides programming errors and KeyboardInterrupt
+		return [1]
+
+	label_frames = filter_activation_matrix(G.T, R)  # one median-filtered label per row of G.T
+	label_frames = np.asarray(label_frames, dtype=int)  # ints are required by np.bincount in most_frequent()
+
+	# Get labels from the label frames
+	labels = []
+	bound_inters = zip(bound_idxs[:-1], bound_idxs[1:])  # (start, end) pairs of consecutive bounds
+	for bound_inter in bound_inters:
+		if bound_inter[1] - bound_inter[0] <= 0:
+			labels.append(np.max(label_frames) + 1)  # empty interval: use a value one above the largest frame label
+		else:
+			labels.append(most_frequent(
+				label_frames[bound_inter[0]:bound_inter[1]]))
+
+	return labels
+
+
+def filter_activation_matrix(G, R):
+	"""Reduces each row of G to its 1-based argmax column, median-filters that sequence with R, and returns it flattened. G is overwritten in place."""
+	idx = np.argmax(G, axis=1)  # column of the largest entry in each row
+	max_idx = np.arange(G.shape[0])
+	max_idx = (max_idx, idx.flatten())  # (row, column) index pair selecting each row's maximum
+	G[:, :] = 0  # in-place: the caller's array (or the base of a view such as G.T) is modified too
+	G[max_idx] = idx + 1  # store the 1-based column index so that it differs from the zeros
+	G = np.sum(G, axis=1)  # each row has one non-zero entry, so the sum is that row's label
+	G = SegUtil.median_filter(G[:, np.newaxis], R)
+	return G.flatten()
+
+
+def segmentation(X, rank=4, R=15, h=8, niter=300, CNMF=True):
+	"""
+	Finds boundary indices by factorizing the median-filtered input and
+	locating changes in the filtered activation labels.
+
+	Parameters
+	----------
+	X: np.array()
+		Input matrix (an SSM according to the module docstring); median-filtered with size h, then transposed
+	rank: int
+		Initial rank of decomposition; incremented until more than 2 unique boundaries are found
+	R: int
+		Size of the median filter for activation matrix
+	niter: int
+		Number of iterations for the factorization
+	h: int
+		Size of the median filter applied to X before factorizing
+	CNMF : bool
+		If True, use CNMF; otherwise use NMF
+		
+	Returns
+	-------
+	G: np.array
+		Filtered, flattened activation labels (np.empty(0) if the factorization raised)
+	bound_idxs: np.array
+		Indices where consecutive values of G differ ([1] if the factorization raised)
+	"""
+
+	# Filter
+	X = median_filter(X, M=h)
+	X = X.T
+
+	# Find non filtered boundaries
+	bound_idxs = None
+	while True:
+		if bound_idxs is None:
+			try:
+				if CNMF: F, G = cnmf(X, rank, niter=niter)
+				else: F, G = nmf(X, rank, niter=niter)  # NOTE(review): nmf() spells this parameter 'nither', so this call raises TypeError, which the except below swallows
+			except:  # NOTE(review): bare except turns every failure into the fallback return below
+				return np.empty(0), [1]
+
+			# Filter G
+			G = filter_activation_matrix(G.T, R)
+			if bound_idxs is None:  # always true here: the enclosing if already checked it
+				bound_idxs = np.where(np.diff(G) != 0)[0] + 1  # index of the first frame after each label change
+
+		if len(np.unique(bound_idxs)) <= 2:
+			rank += 1  # too few boundaries: retry with a higher rank (NOTE(review): no upper limit on rank)
+			bound_idxs = None
+		else:
+			break
+
+	return G, bound_idxs
--- a/utils/SegUtil.py	Fri Jun 12 17:45:11 2015 +0100
+++ b/utils/SegUtil.py	Wed Jun 17 18:02:33 2015 +0100
@@ -363,7 +363,7 @@
 			  Use this when homogeneity in the SSM is expressed by SMALL value. 
 			  (eg. When cosine metric and exp normalization and used for distance computation.)'''
 
-	ssm_lp = lp(enhanced_ssm, fc=fc)
+	ssm_lp = lp(ssm, fc=fc)
 	
 	# Use scipy.ndimage.filters.median_filter instead
 	ssm_med = med_filter(ssm_lp, size=med_size)
--- a/utils/plotSSM.py	Fri Jun 12 17:45:11 2015 +0100
+++ b/utils/plotSSM.py	Wed Jun 17 18:02:33 2015 +0100
@@ -34,7 +34,7 @@
 from skimage.morphology import disk
 
 from PeakPickerUtil import PeakPicker
-from SegUtil import getMean, getStd, getDelta, getSSM, reduceSSM, upSample, normaliseFeature
+from SegUtil import getMean, getStd, getDelta, getSSM, enhanceSSM, upSample, normaliseFeature
 
 def parse_args():
 	# define parser
@@ -483,18 +483,18 @@
 		for audio in audio_files:
 			ao = AudioObj()
 			ao.name = splitext(audio)[0]
-			annotation_file = join(options.GT, ao.name+'.txt') # iso, salami
-			ao.gt = np.genfromtxt(annotation_file, usecols=0)	
-			ao.label = np.genfromtxt(annotation_file, usecols=1, dtype=str)
+			# annotation_file = join(options.GT, ao.name+'.txt') # iso, salami
+			# ao.gt = np.genfromtxt(annotation_file, usecols=0)	
+			# ao.label = np.genfromtxt(annotation_file, usecols=1, dtype=str)
 	
 			# annotation_file = join(options.GT, ao.name+'.csv') # qupujicheng
 			# ao.gt = np.genfromtxt(annotation_file, usecols=0, delimiter=',')	
 			# ao.label = np.genfromtxt(annotation_file, usecols=1, delimiter=',', dtype=str)
 
-			# annotation_file = join(options.GT, ao.name+'.lab') # beatles
-			# ao.gt = np.genfromtxt(annotation_file, usecols=(0,1))
-			# ao.gt = np.unique(np.ndarray.flatten(ao.gt))
-			# ao.label = np.genfromtxt(annotation_file, usecols=2, dtype=str)
+			annotation_file = join(options.GT, ao.name+'.lab') # beatles
+			ao.gt = np.genfromtxt(annotation_file, usecols=(0,1))
+			ao.gt = np.unique(np.ndarray.flatten(ao.gt))
+			ao.label = np.genfromtxt(annotation_file, usecols=2, dtype=str)
 
 			gammatone_featureset, timbre_featureset, lpc_featureset, tempo_featureset, harmonic_featureset = [], [], [], [], []
 			for feature in gammatone_feature_list:
@@ -596,16 +596,19 @@
 			pca.fit(ao.gammatone_features)
 			ao.gammatone_features = pca.transform(ao.gammatone_features)
 			ao.gammatone_ssm = getSSM(ao.gammatone_features)
+			ao.gammatone_ssm = enhanceSSM(ao.gammatone_ssm)
 			
 			ao.tempo_features = getMean(ao.tempo_features, winlen=aggregation_window, stepsize=aggregation_step)
 			pca.fit(ao.tempo_features)
 			ao.tempo_features = pca.transform(ao.tempo_features)
 			ao.tempo_ssm = getSSM(ao.tempo_features)
+			ao.tempo_ssm = enhanceSSM(ao.tempo_ssm)
 			
 			ao.timbre_features = getMean(ao.timbre_features, winlen=aggregation_window, stepsize=aggregation_step)
 			pca.fit(ao.timbre_features)
 			ao.timbre_features = pca.transform(ao.timbre_features)
 			ao.timbre_ssm = getSSM(ao.timbre_features)
+			ao.timbre_ssm = enhanceSSM(ao.timbre_ssm)
 
 			# ao.lpc_features = self.getMean(ao.lpc_features, winlen=aggregation_window, stepsize=aggregation_step)
 			# pca.fit(ao.lpc_features)
@@ -616,6 +619,7 @@
 			pca.fit(ao.harmonic_features)
 			ao.harmonic_features = pca.transform(ao.harmonic_features)
 			ao.harmonic_ssm = getSSM(ao.harmonic_features)
+			ao.harmonic_ssm = enhanceSSM(ao.harmonic_ssm)
 			
 			ao.ssm_timestamps = np.array(map(lambda x: ao.tempo_timestamps[aggregation_step*x], np.arange(0, ao.gammatone_ssm.shape[0])))
 			
@@ -624,21 +628,21 @@
 			plt.figure(figsize=(10, 10))
 			plt.vlines(ao.gt / ao.gt[-1] * ao.gammatone_ssm.shape[0], 0, ao.gammatone_ssm.shape[0])
 			plt.imshow(ao.gammatone_ssm)
-			plt.savefig(join(options.OUTPUT, ao.name+'-gammatone.pdf'),format='pdf')
+			plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-gammatone.pdf'),format='pdf')
 			plt.close()
 			
 			# tempo_ssm = self.reduceSSM(ao.tempo_ssm)
 			plt.figure(figsize=(10, 10))
 			plt.vlines(ao.gt / ao.gt[-1] * ao.tempo_ssm.shape[0], 0, ao.tempo_ssm.shape[0])
 			plt.imshow(ao.tempo_ssm)
-			plt.savefig(join(options.OUTPUT, ao.name+'-hpss_tempo.pdf'),format='pdf')
+			plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_tempo.pdf'),format='pdf')
 			plt.close()
 			
 			# timbre_ssm = self.reduceSSM(ao.timbre_ssm)
 			plt.figure(figsize=(10, 10))
 			plt.vlines(ao.gt / ao.gt[-1] * ao.timbre_ssm.shape[0], 0, ao.timbre_ssm.shape[0])
 			plt.imshow(ao.timbre_ssm)
-			plt.savefig(join(options.OUTPUT, ao.name+'-hpss_mfcc.pdf'),format='pdf')
+			plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_mfcc.pdf'),format='pdf')
 			plt.close()
 
 			# # lpc_ssm = self.reduceSSM(ao.lpc_ssm)
@@ -653,7 +657,7 @@
 			plt.figure(figsize=(10, 10))
 			plt.vlines(ao.gt / ao.gt[-1] * ao.harmonic_ssm.shape[0], 0, ao.harmonic_ssm.shape[0])
 			plt.imshow(ao.harmonic_ssm)
-			plt.savefig(join(options.OUTPUT, ao.name+'-hpss_chroma.pdf'),format='pdf')
+			plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_chroma.pdf'),format='pdf')
 			plt.close()
 
 			if options.VERBOSE: