Mercurial > hg > camir-aes2014
diff toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m Tue Feb 10 15:05:51 2015 +0000 @@ -0,0 +1,402 @@ +function [f,p,m,fe] = mirsegment(x,varargin) +% f = mirsegment(a) segments an audio signal. It can also be the name of an +% audio file or 'Folder', for the analysis of the audio files in the +% current folder. The segmentation of audio signal already decomposed +% into frames is not available for the moment. +% f = mirsegment(...,'Novelty') segments using a self-similarity matrix +% (Foote & Cooper, 2003) (by default) +% f = mirsegment(...,feature) bases the segmentation strategy on a +% specific feature. +% 'Spectrum': from FFT spectrum (by default) +% 'MFCC': from MFCCs +% 'Keystrength': from the key strength profile +% 'AutocorPitch': from the autocorrelation function computed as +% for pitch extraction. +% The option related to this feature extraction can be specified. +% Example: mirsegment(...,'Spectrum','Window','bartlett') +% mirsegment(...,'MFCC','Rank',1:10) +% mirsegment(...,'Keystrength','Weight',.5) +% These feature need to be frame-based, in order to appreciate their +% temporal evolution. Therefore, the audio signal x is first +% decomposed into frames. This decomposition can be controled +% using the 'Frame' keyword. +% The options available for the chosen strategies can be specified +% directly as options of the segment function. +% Example: mirsegment(a,'Novelty','KernelSize',10) +% f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection +% Function (Harte & Sandler, 2006) +% f = mirsegment(...,'RMS') segments at positions of long silences. A +% frame decomposed RMS is computed using mirrms (with default +% options), and segments are selected from temporal positions +% where the RMS rises to a given 'On' threshold, until temporal +% positions where the RMS drops back to a given 'Off' threshold. +% f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold. +% Default value: t1 = .01 +% f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold. +% Default value: t2 = .02 +% +% f = mirsegment(a,s) segments a using the results of a segmentation +% analysis s. s can be the peaks detected on an analysis of the +% audio for instance. +% +% f = mirsegment(a,v) where v is an array of numbers, segments a using +% the temporal positions specified in v (in s.) +% +% Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity +% Decomposition,. In Proc. SPIE Storage and Retrieval for Multimedia +% Databases, Vol. 5021, pp. 167-75. +% Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in +% musical audio, in Proceedings of Audio and Music Computing for +% Multimedia Workshop, Santa Barbara, CA. + + +% [f,p] = mirsegment(...) also displays the analysis produced by the chosen +% strategy. +% For 'Novelty', p is the novelty curve. +% For 'HCDF', p is the Harmonic Change Detection Function. +% [f,p,m] = mirsegment(...) also displays the preliminary analysis +% undertaken in the chosen strategy. +% For 'Novelty', m is the similarity matrix. +% For 'HCDF', m is the tonal centroid. +% [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the +% feature used for the analysis. + +% f = mirsegment(...,'Novelty') + + mfc.key = {'Rank','MFCC'}; + mfc.type = 'Integers'; + mfc.default = 0; + mfc.keydefault = 1:13; + option.mfc = mfc; + + K.key = 'KernelSize'; + K.type = 'Integer'; + K.default = 128; + option.K = K; + + distance.key = 'Distance'; + distance.type = 'String'; + distance.default = 'cosine'; + option.distance = distance; + + measure.key = {'Measure','Similarity'}; + measure.type = 'String'; + measure.default = 'exponential'; + option.measure = measure; + + tot.key = 'Total'; + tot.type = 'Integer'; + tot.default = Inf; + option.tot = tot; + + cthr.key = 'Contrast'; + cthr.type = 'Integer'; + cthr.default = .1; + option.cthr = cthr; + + frame.key = 'Frame'; + frame.type = 'Integer'; + frame.number = 2; + frame.default = [0 0]; + frame.keydefault = [3 .1]; + option.frame = frame; + + ana.type = 'String'; + ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'}; + ana.default = 0; + option.ana = ana; + +% f = mirsegment(...,'Spectrum') + + band.choice = {'Mel','Bark','Freq'}; + band.type = 'String'; + band.default = 'Freq'; + option.band = band; + + mi.key = 'Min'; + mi.type = 'Integer'; + mi.default = 0; + option.mi = mi; + + ma.key = 'Max'; + ma.type = 'Integer'; + ma.default = 0; + option.ma = ma; + + norm.key = 'Normal'; + norm.type = 'Boolean'; + norm.default = 0; + option.norm = norm; + + win.key = 'Window'; + win.type = 'String'; + win.default = 'hamming'; + option.win = win; + +% f = mirsegment(...,'Silence') + + throff.key = 'Off'; + throff.type = 'Integer'; + throff.default = .01; + option.throff = throff; + + thron.key = 'On'; + thron.type = 'Integer'; + thron.default = .02; + option.thron = thron; + + strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field + strat.default = 'Novelty'; + strat.position = 2; + option.strat = strat; + +specif.option = option; + + +p = {}; +m = {}; +fe = {}; + +if isa(x,'mirdesign') + if not(get(x,'Eval')) + % During bottom-up construction of the general design + + [unused option] = miroptions(@mirframe,x,specif,varargin); + type = get(x,'Type'); + f = mirdesign(@mirsegment,x,option,{},struct,type); + + sg = get(x,'Segment'); + if not(isempty(sg)) + f = set(f,'Segment',sg); + else + f = set(f,'Segment',option.strat); + end + + else + % During top-down evaluation initiation + + f = evaleach(x); + if iscell(f) + f = f{1}; + end + p = x; + end +elseif isa(x,'mirdata') + [unused option] = miroptions(@mirframe,x,specif,varargin); + if ischar(option.strat) + dx = get(x,'Data'); + if size(dx{1},2) > 1 + error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.'); + end + if strcmpi(option.strat,'Novelty') + if not(option.frame.length.val) + if strcmpi(option.ana,'Keystrength') + option.frame.length.val = .5; + option.frame.hop.val = .2; + elseif strcmpi(option.ana,'AutocorPitch') ... + || strcmpi(option.ana,'Pitch') + option.frame.length.val = .05; + option.frame.hop.val = .01; + else + option.frame.length.val = .05; + option.frame.hop.val = 1; + end + end + fr = mirframenow(x,option); + if not(isequal(option.mfc,0)) + fe = mirmfcc(fr,'Rank',option.mfc); + elseif strcmpi(option.ana,'Spectrum') + fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,... + 'Normal',option.norm,option.band,... + 'Window',option.win); + elseif strcmpi(option.ana,'Keystrength') + fe = mirkeystrength(fr); + elseif strcmpi(option.ana,'AutocorPitch') ... + || strcmpi(option.ana,'Pitch') + [unused,fe] = mirpitch(x,'Frame'); + else + fe = fr; + end + [n m] = mirnovelty(fe,'Distance',option.distance,... + 'Measure',option.measure,... + 'KernelSize',option.K); + p = mirpeaks(n,'Total',option.tot,... + 'Contrast',option.cthr,... + 'Chrono','NoBegin','NoEnd'); + elseif strcmpi(option.strat,'HCDF') + if not(option.frame.length.val) + option.frame.length.val = .743; + option.frame.hop.val = 1/8; + end + fr = mirframenow(x,option); + %[df m fe] = mirhcdf(fr); + df = mirhcdf(fr); + p = mirpeaks(df); + elseif strcmpi(option.strat,'RMS') + if not(option.frame.length.val) + option.frame.length.val = .05; + option.frame.hop.val = .5; + end + fr = mirframenow(x,option); + %[df m fe] = mirhcdf(fr); + df = mirrms(fr); + fp = get(df,'FramePos'); + p = mircompute(@findsilence,df,fp,option.throff,option.thron); + end + f = mirsegment(x,p); + else + dx = get(x,'Data'); + dt = get(x,'Time'); + + if isa(option.strat,'mirscalar') + ds = get(option.strat,'PeakPos'); + fp = get(option.strat,'FramePos'); + elseif isa(option.strat,'mirdata') + ds = get(option.strat,'AttackPos'); + if isempty(ds) || isempty(ds{1}) + ds = get(option.strat,'PeakPos'); + end + xx = get(option.strat,'Pos'); + else + ds = option.strat; + fp = cell(1,length(dx)); + end + st = cell(1,length(dx)); + sx = cell(1,length(dx)); + cl = cell(1,length(dx)); + for k = 1:length(dx) + dxk = dx{k}{1}; % values in kth audio file + dtk = dt{k}{1}; % time positions in kth audio file + if isa(option.strat,'mirdata') + dsk = ds{k}{1}; % segmentation times in kth audio file + else + dsk = {ds}; + end + fsk = []; % the structured array of segmentation times + % needs to be flatten + for j = 1:length(dsk) + if isa(option.strat,'mirdata') + dsj = dsk{j}; % segmentation times in jth segment + else + dsj = ds; + end + if not(iscell(dsj)) + dsj = {dsj}; + end + for m = 1:length(dsj) + % segmentation times in mth bank channel + if isa(option.strat,'mirscalar') + dsm = fp{k}{m}(1,dsj{m}); + elseif isa(option.strat,'mirdata') + dsm = xx{k}{m}(dsj{m}); + else + dsm = dsj{m}; + end + if iscell(dsm) + dsm = dsm{1}; + end + dsm(:,find(dsm(1,:) < dtk(1))) = []; + dsm(:,find(dsm(end,:) > dtk(end))) = []; + % It is presupposed here that the segmentations times + % for a given channel are not decomposed per frames, + % because the segmentation of the frame decomposition + % is something that does not seem very clear. + % Practically, the peak picking for instance is based + % therefore on a frame analysis (such as novelty), and + % segmentation are inferred between these frames... + if size(dsm,2) == 1 + dsm = dsm'; + end + fsk = [fsk dsm]; + end + end + + fsk = sort(fsk); % Here is the chronological ordering + + if isempty(fsk) + ffsk = {[0;dtk(end)]}; + sxk = {dxk}; + stk = {dtk}; + n = 1; + elseif size(fsk,1) == 1 + ffsk = cell(1,length(fsk)+1); + ffsk{1} = [dtk(1);fsk(1)]; + for h = 1:length(fsk)-1 + ffsk{h+1} = [fsk(h);fsk(h+1)]; + end + ffsk{end} = [fsk(end);dtk(end)]; + + n = length(ffsk); + + crd = zeros(1,n+1); % the sample positions of the + % segmentations in the channel + crd0 = 0; + for i = 1:n + crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1); + crd(i) = crd0; + end + crd(n+1) = size(dxk,1)+1; + + sxk = cell(1,n); % each cell contains a segment + stk = cell(1,n); % each cell contains + % the corresponding time positions + + for i = 1:n + sxk{i} = dxk(crd(i):crd(i+1)-1,1,:); + stk{i} = dtk(crd(i):crd(i+1)-1); + end + + elseif size(fsk,1) == 2 + ffsk = cell(1,size(fsk,2)); + for h = 1:length(fsk) + ffsk{h} = [fsk(1,h);fsk(2,h)]; + end + n = length(ffsk); + crd = zeros(2,n); % the sample positions of the + % segmentations in the channel + crd0 = 0; + for i = 1:n + crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1); + crd(i,1) = crd0; + crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1); + crd(i,2) = crd0; + end + sxk = cell(1,n); % each cell contains a segment + stk = cell(1,n); % each cell contains + % the corresponding time positions + for i = 1:n + sxk{i} = dxk(crd(i,1):crd(i,2),1,:); + stk{i} = dtk(crd(i,1):crd(i,2)); + end + end + sx{k} = sxk; + st{k} = stk; + fp{k} = ffsk; + cl{k} = 1:n; + end + f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl); + p = strat; + m = {}; + fe = {}; + end +else + [f p] = mirsegment(miraudio(x),varargin{:}); +end + + +function p = findsilence(d,fp,throff,thron) +d = [0 d 0]; +begseg = find(d(1:end-1)<thron & d(2:end)>=thron); +nseg = length(begseg); +endseg = zeros(1,nseg); +removed = []; +for i = 1:nseg + endseg(i) = begseg(i) + find(d(begseg(i)+1:end)<=throff, 1)-1; + if i>1 && endseg(i) == endseg(i-1) + removed = [removed i]; + end +end +begseg(removed) = []; +%endseg(removed) = []; +%endseg(end) = min(endseg(end),length(d)+1); +p = fp(1,begseg); %; fp(2,endseg-1)]; \ No newline at end of file