Mercurial > hg > segmentation
changeset 16:8b814fe5781d
added segmentation method applying nmf to ssm
author | mitian |
---|---|
date | Wed, 17 Jun 2015 18:02:33 +0100 |
parents | 289a4b2b2b16 |
children | c01fcb752221 |
files | snmf.py utils/SegUtil.py utils/plotSSM.py |
diffstat | 3 files changed, 176 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# encoding: utf-8
"""
snmf.py
-- A modified version of C-NMF by Nieto. Input is no longer feature matrix but SSM.

C-NMF method for segmentation, modified from here:

Nieto, O., Jehan, T., Convex Non-negative Matrix Factorization For Automatic
Music Structure Identification. Proc. of the 38th IEEE International Conference
on Acoustics, Speech, and Signal Processing (ICASSP). Vancouver, Canada, 2013.

"""
# Source: new file b/snmf.py, added in changeset 16:8b814fe5781d.

import sys
import os
import pymf
from utils import SegUtil
from utils.SegUtil import median_filter
import numpy as np

# Algorithm params
h = 8               # Size of median filter for SSMs in C-NMF
R = 16              # Size of the median filter for the activation matrix C-NMF
rank = 3            # Rank of decomposition for the boundaries
rank_labels = 16    # Rank of decomposition for the labels
R_labels = 4        # Size of the median filter for the labels


def cnmf(S, rank, niter=500):
    """(Convex) Non-Negative Matrix Factorization.

    Parameters
    ----------
    S: np.array(N, N)
        SSM.
    rank: int
        Rank of decomposition
    niter: int
        Number of iterations to be used

    Returns
    -------
    F: np.array
        Cluster matrix (decomposed matrix)
    G: np.array
        Activation matrix (decomposed matrix)
        (s.t. S ~= F * G)
    """
    nmf_mdl = pymf.CNMF(S, num_bases=rank)
    nmf_mdl.factorize(niter=niter)
    F = np.asarray(nmf_mdl.W)
    G = np.asarray(nmf_mdl.H)
    return F, G


def nmf(S, rank, niter=500, nither=None):
    """Non-Negative Matrix Factorization (non-convex counterpart of cnmf).

    Parameters
    ----------
    S: np.array(N, N)
        SSM.
    rank: int
        Rank of decomposition
    niter: int
        Number of iterations to be used
    nither: int or None
        Deprecated misspelling of ``niter``, kept so that existing
        ``nmf(S, rank, nither=...)`` calls keep working. When given, it
        takes precedence over ``niter``.

    Returns
    -------
    F: np.array
        Cluster matrix (``W`` of the pymf model)
    G: np.array
        Activation matrix (``H`` of the pymf model)
    """
    if nither is not None:
        niter = nither
    # NOTE(review): the iteration count is handed to the constructor here,
    # whereas cnmf() hands it to factorize(). Confirm which of the two the
    # installed pymf version actually honours for NMF.
    nmf_mdl = pymf.NMF(S, num_bases=rank, niter=niter)
    nmf_mdl.factorize()
    F = np.asarray(nmf_mdl.W)
    G = np.asarray(nmf_mdl.H)
    return F, G


def most_frequent(x):
    """Returns the most frequent value in x.

    x must be a non-empty sequence of non-negative integers (np.bincount
    requirement); ties are resolved in favour of the smallest value.
    """
    return np.argmax(np.bincount(x))


def compute_labels(X, rank, R, bound_idxs, niter=300):
    """Computes the labels using the bounds.

    Parameters
    ----------
    X: np.array
        Matrix to factorize (it is transposed before being passed to cnmf).
    rank: int
        Rank of decomposition
    R: int
        Size of the median filter for the activation matrix
    bound_idxs: sequence of int
        Frame indices of the boundaries; each consecutive pair delimits
        one segment.
    niter: int
        Number of iterations of the factorization

    Returns
    -------
    labels: list
        One label per segment, or ``[1]`` if the factorization failed.
    """
    X = X.T
    try:
        F, G = cnmf(X, rank, niter=niter)
    except Exception:
        # Best effort: a failed factorization yields a single dummy label.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        return [1]

    label_frames = filter_activation_matrix(G.T, R)
    label_frames = np.asarray(label_frames, dtype=int)

    # Get labels from the label frames
    labels = []
    for start, end in zip(bound_idxs[:-1], bound_idxs[1:]):
        if end - start <= 0:
            # Empty segment: give it a label no frame currently uses.
            labels.append(np.max(label_frames) + 1)
        else:
            labels.append(most_frequent(label_frames[start:end]))

    return labels


def filter_activation_matrix(G, R):
    """Filters the activation matrix G, and returns a flattened copy.

    Every row of G is reduced to the 1-based index of its largest entry,
    and the resulting sequence is median filtered with a window of size R.
    G itself is left untouched.

    Parameters
    ----------
    G: np.array(n_frames, rank)
        Activation matrix, one row per frame.
    R: int
        Size of the median filter

    Returns
    -------
    np.array(n_frames,)
        Filtered per-frame component indices.
    """
    G = np.asarray(G)
    # Equivalent to zeroing G, writing idx + 1 at each row's argmax and
    # summing the rows, but without overwriting the caller's matrix
    # (callers pass G.T, which is a view on their own data).
    dominant = (np.argmax(G, axis=1) + 1).astype(G.dtype)
    filtered = SegUtil.median_filter(dominant[:, np.newaxis], R)
    return filtered.flatten()


def segmentation(X, rank=4, R=15, h=8, niter=300, CNMF=True):
    """
    Finds segment boundaries from the factorization of an SSM.

    The factorization is repeated with an increasing rank until more than
    two distinct boundaries are found.

    Parameters
    ----------
    X: np.array()
        Matrix to factorize (an SSM, according to the module docstring).
        NOTE(review): it is passed through SegUtil.median_filter first;
        check whether that helper modifies its input in place.
    rank: int
        Initial rank of decomposition
    R: int
        Size of the median filter for activation matrix
    h: int
        Size of the median filter applied to X before the factorization
    niter: int
        Number of iterations of the factorization
    CNMF : bool
        If True, use CNMF; otherwise use NMF

    Returns
    -------
    G: np.array
        Filtered, flattened activation sequence (one value per frame).
    bound_idxs: np.array
        Bound indeces found.

    If the factorization fails, or no rank up to the matrix size yields
    more than two boundaries, ``(np.empty(0), [1])`` is returned instead.
    """

    # Filter
    X = median_filter(X, M=h)
    X = X.T

    factorize = cnmf if CNMF else nmf
    # More bases than frames is meaningless; this also bounds the retries.
    max_rank = max(X.shape)

    # Find non filtered boundaries
    while True:
        try:
            F, G = factorize(X, rank, niter=niter)
        except Exception:
            return np.empty(0), [1]

        # Filter G
        G = filter_activation_matrix(G.T, R)
        bound_idxs = np.where(np.diff(G) != 0)[0] + 1

        if len(np.unique(bound_idxs)) > 2:
            return G, bound_idxs

        # Too few boundaries: retry with a finer decomposition.
        rank += 1
        if rank > max_rank:
            return np.empty(0), [1]
--- a/utils/SegUtil.py Fri Jun 12 17:45:11 2015 +0100 +++ b/utils/SegUtil.py Wed Jun 17 18:02:33 2015 +0100 @@ -363,7 +363,7 @@ Use this when homogeneity in the SSM is expressed by SMALL value. (eg. When cosine metric and exp normalization and used for distance computation.)''' - ssm_lp = lp(enhanced_ssm, fc=fc) + ssm_lp = lp(ssm, fc=fc) # Use scipy.ndimage.filters.median_filter instead ssm_med = med_filter(ssm_lp, size=med_size)
--- a/utils/plotSSM.py Fri Jun 12 17:45:11 2015 +0100 +++ b/utils/plotSSM.py Wed Jun 17 18:02:33 2015 +0100 @@ -34,7 +34,7 @@ from skimage.morphology import disk from PeakPickerUtil import PeakPicker -from SegUtil import getMean, getStd, getDelta, getSSM, reduceSSM, upSample, normaliseFeature +from SegUtil import getMean, getStd, getDelta, getSSM, enhanceSSM, upSample, normaliseFeature def parse_args(): # define parser @@ -483,18 +483,18 @@ for audio in audio_files: ao = AudioObj() ao.name = splitext(audio)[0] - annotation_file = join(options.GT, ao.name+'.txt') # iso, salami - ao.gt = np.genfromtxt(annotation_file, usecols=0) - ao.label = np.genfromtxt(annotation_file, usecols=1, dtype=str) + # annotation_file = join(options.GT, ao.name+'.txt') # iso, salami + # ao.gt = np.genfromtxt(annotation_file, usecols=0) + # ao.label = np.genfromtxt(annotation_file, usecols=1, dtype=str) # annotation_file = join(options.GT, ao.name+'.csv') # qupujicheng # ao.gt = np.genfromtxt(annotation_file, usecols=0, delimiter=',') # ao.label = np.genfromtxt(annotation_file, usecols=1, delimiter=',', dtype=str) - # annotation_file = join(options.GT, ao.name+'.lab') # beatles - # ao.gt = np.genfromtxt(annotation_file, usecols=(0,1)) - # ao.gt = np.unique(np.ndarray.flatten(ao.gt)) - # ao.label = np.genfromtxt(annotation_file, usecols=2, dtype=str) + annotation_file = join(options.GT, ao.name+'.lab') # beatles + ao.gt = np.genfromtxt(annotation_file, usecols=(0,1)) + ao.gt = np.unique(np.ndarray.flatten(ao.gt)) + ao.label = np.genfromtxt(annotation_file, usecols=2, dtype=str) gammatone_featureset, timbre_featureset, lpc_featureset, tempo_featureset, harmonic_featureset = [], [], [], [], [] for feature in gammatone_feature_list: @@ -596,16 +596,19 @@ pca.fit(ao.gammatone_features) ao.gammatone_features = pca.transform(ao.gammatone_features) ao.gammatone_ssm = getSSM(ao.gammatone_features) + ao.gammatone_ssm = enhanceSSM(ao.gammatone_ssm) ao.tempo_features = getMean(ao.tempo_features, 
winlen=aggregation_window, stepsize=aggregation_step) pca.fit(ao.tempo_features) ao.tempo_features = pca.transform(ao.tempo_features) ao.tempo_ssm = getSSM(ao.tempo_features) + ao.tempo_ssm = enhanceSSM(ao.tempo_ssm) ao.timbre_features = getMean(ao.timbre_features, winlen=aggregation_window, stepsize=aggregation_step) pca.fit(ao.timbre_features) ao.timbre_features = pca.transform(ao.timbre_features) ao.timbre_ssm = getSSM(ao.timbre_features) + ao.timbre_ssm = enhanceSSM(ao.timbre_ssm) # ao.lpc_features = self.getMean(ao.lpc_features, winlen=aggregation_window, stepsize=aggregation_step) # pca.fit(ao.lpc_features) @@ -616,6 +619,7 @@ pca.fit(ao.harmonic_features) ao.harmonic_features = pca.transform(ao.harmonic_features) ao.harmonic_ssm = getSSM(ao.harmonic_features) + ao.harmonic_ssm = enhanceSSM(ao.harmonic_ssm) ao.ssm_timestamps = np.array(map(lambda x: ao.tempo_timestamps[aggregation_step*x], np.arange(0, ao.gammatone_ssm.shape[0]))) @@ -624,21 +628,21 @@ plt.figure(figsize=(10, 10)) plt.vlines(ao.gt / ao.gt[-1] * ao.gammatone_ssm.shape[0], 0, ao.gammatone_ssm.shape[0]) plt.imshow(ao.gammatone_ssm) - plt.savefig(join(options.OUTPUT, ao.name+'-gammatone.pdf'),format='pdf') + plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-gammatone.pdf'),format='pdf') plt.close() # tempo_ssm = self.reduceSSM(ao.tempo_ssm) plt.figure(figsize=(10, 10)) plt.vlines(ao.gt / ao.gt[-1] * ao.tempo_ssm.shape[0], 0, ao.tempo_ssm.shape[0]) plt.imshow(ao.tempo_ssm) - plt.savefig(join(options.OUTPUT, ao.name+'-hpss_tempo.pdf'),format='pdf') + plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_tempo.pdf'),format='pdf') plt.close() # timbre_ssm = self.reduceSSM(ao.timbre_ssm) plt.figure(figsize=(10, 10)) plt.vlines(ao.gt / ao.gt[-1] * ao.timbre_ssm.shape[0], 0, ao.timbre_ssm.shape[0]) plt.imshow(ao.timbre_ssm) - plt.savefig(join(options.OUTPUT, ao.name+'-hpss_mfcc.pdf'),format='pdf') + plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_mfcc.pdf'),format='pdf') plt.close() # # 
lpc_ssm = self.reduceSSM(ao.lpc_ssm) @@ -653,7 +657,7 @@ plt.figure(figsize=(10, 10)) plt.vlines(ao.gt / ao.gt[-1] * ao.harmonic_ssm.shape[0], 0, ao.harmonic_ssm.shape[0]) plt.imshow(ao.harmonic_ssm) - plt.savefig(join(options.OUTPUT, ao.name+'-hpss_chroma.pdf'),format='pdf') + plt.savefig(join(options.OUTPUT, ao.name+'-enhanced-hpss_chroma.pdf'),format='pdf') plt.close() if options.VERBOSE: