diff toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m	Tue Feb 10 15:05:51 2015 +0000
@@ -0,0 +1,402 @@
+function [f,p,m,fe] = mirsegment(x,varargin)
+%   f = mirsegment(a) segments an audio signal. It can also be the name of an
+%       audio file or 'Folder', for the analysis of the audio files in the
+%       current folder. The segmentation of audio signal already decomposed
+%       into frames is not available for the moment.
+%   f = mirsegment(...,'Novelty') segments using a self-similarity matrix
+%           (Foote & Cooper, 2003)     (by default)
+%       f = mirsegment(...,feature) bases the segmentation strategy on a
+%           specific feature.
+%           'Spectrum': from FFT spectrum (by default)
+%           'MFCC': from MFCCs
+%           'Keystrength': from the key strength profile
+%           'AutocorPitch': from the autocorrelation function computed as
+%               for pitch extraction ('Pitch' is accepted as a synonym).
+%           The options related to this feature extraction can be specified.
+%           Example: mirsegment(...,'Spectrum','Window','bartlett')
+%                    mirsegment(...,'MFCC','Rank',1:10)
+%                    mirsegment(...,'Keystrength','Weight',.5)
+%       These features need to be frame-based, in order to appreciate their
+%           temporal evolution. Therefore, the audio signal x is first
+%           decomposed into frames. This decomposition can be controlled
+%           using the 'Frame' keyword.  
+%       The options available for the chosen strategies can be specified
+%           directly as options of the segment function.
+%           Example: mirsegment(a,'Novelty','KernelSize',10)
+%   f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection  
+%           Function (Harte & Sandler, 2006)
+%   f = mirsegment(...,'RMS') segments at positions of long silences. A
+%       frame decomposed RMS is computed using mirrms (with default
+%       options), and segments are selected from temporal positions
+%       where the RMS rises to a given 'On' threshold, until temporal
+%       positions where the RMS drops back to a given 'Off' threshold.
+%       f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold.
+%           Default value: t1 = .01
+%       f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold.
+%           Default value: t2 = .02
+%
+%   f = mirsegment(a,s) segments a using the results of a segmentation
+%       analysis s. s can be the peaks detected on an analysis of the
+%       audio for instance.
+%
+%   f = mirsegment(a,v) where v is an array of numbers, segments a using
+%       the temporal positions specified in v (in s.)
+%
+%   Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity
+%       Decomposition,. In Proc. SPIE Storage and Retrieval for Multimedia
+%       Databases, Vol. 5021, pp. 167-75.
+%   Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in
+%       musical audio, in Proceedings of Audio and Music Computing for 
+%       Multimedia Workshop, Santa Barbara, CA.
+
+
+%   [f,p] = mirsegment(...) also displays the analysis produced by the chosen
+%       strategy.
+%           For 'Novelty', p is the novelty curve.
+%           For 'HCDF', p is the Harmonic Change Detection Function.
+%           For mirsegment(a,s) and mirsegment(a,v), p is the segmentation
+%           input (s or v) itself.
+%   [f,p,m] = mirsegment(...) also displays the preliminary analysis
+%       undertaken in the chosen strategy.
+%           For 'Novelty', m is the similarity matrix.
+%           For 'HCDF', m is the tonal centroid.
+%   [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the
+%       feature used for the analysis.
+ 
+%   f = mirsegment(...,'Novelty')
+
+        mfc.key = {'Rank','MFCC'};
+        mfc.type = 'Integers';
+        mfc.default = 0;
+        mfc.keydefault = 1:13;
+    option.mfc = mfc;
+
+        K.key = 'KernelSize';
+        K.type = 'Integer';
+        K.default = 128;
+    option.K = K;
+    
+        distance.key = 'Distance';
+        distance.type = 'String';
+        distance.default = 'cosine';
+    option.distance = distance;
+
+        measure.key = {'Measure','Similarity'};
+        measure.type = 'String';
+        measure.default = 'exponential';
+    option.measure = measure;
+
+        tot.key = 'Total';
+        tot.type = 'Integer';
+        tot.default = Inf;
+    option.tot = tot;
+
+        cthr.key = 'Contrast';
+        cthr.type = 'Integer';
+        cthr.default = .1;
+    option.cthr = cthr;
+
+        % A zero frame length means "not specified by the user": each
+        % strategy below then installs its own frame length and hop.
+        frame.key = 'Frame';
+        frame.type = 'Integer';
+        frame.number = 2;
+        frame.default = [0 0];
+        frame.keydefault = [3 .1];
+    option.frame = frame;
+
+        ana.type = 'String';
+        ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'};
+        ana.default = 0;
+    option.ana = ana;
+    
+%       f = mirsegment(...,'Spectrum')    
+    
+            band.choice = {'Mel','Bark','Freq'};
+            band.type = 'String';
+            band.default = 'Freq';
+        option.band = band;
+
+            mi.key = 'Min';
+            mi.type = 'Integer';
+            mi.default = 0;
+        option.mi = mi;
+
+            ma.key = 'Max';
+            ma.type = 'Integer';
+            ma.default = 0;
+        option.ma = ma;
+
+            norm.key = 'Normal';
+            norm.type = 'Boolean';
+            norm.default = 0;
+        option.norm = norm;
+
+            win.key = 'Window';
+            win.type = 'String';
+            win.default = 'hamming';
+        option.win = win;
+    
+%       f = mirsegment(...,'RMS')    (silence-based segmentation thresholds)
+    
+            throff.key = 'Off';
+            throff.type = 'Integer';
+            throff.default = .01;
+        option.throff = throff;
+
+            thron.key = 'On';
+            thron.type = 'Integer';
+            thron.default = .02;
+        option.thron = thron;
+
+        strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field
+        strat.default = 'Novelty';
+        strat.position = 2;
+    option.strat = strat;
+   
+specif.option = option;
+
+
+% Secondary outputs default to empty; only some branches below fill them.
+p = {};
+m = {};
+fe = {};
+
+if isa(x,'mirdesign')
+    if not(get(x,'Eval'))
+        % During bottom-up construction of the general design
+
+        [unused option] = miroptions(@mirframe,x,specif,varargin);
+        type = get(x,'Type');
+        f = mirdesign(@mirsegment,x,option,{},struct,type);
+        
+        % An existing 'Segment' setting of the input design takes
+        % precedence over the strategy requested here.
+        sg = get(x,'Segment');
+        if not(isempty(sg))
+            f = set(f,'Segment',sg);
+        else
+            f = set(f,'Segment',option.strat);
+        end
+        
+    else
+        % During top-down evaluation initiation
+        
+        f = evaleach(x);
+        if iscell(f)
+            f = f{1};
+        end
+        p = x;
+    end
+elseif isa(x,'mirdata')
+    [unused option] = miroptions(@mirframe,x,specif,varargin);
+    if ischar(option.strat)
+        % A strategy name was given: run the corresponding analysis to get
+        % segmentation points p, then segment x with them (recursive call).
+        dx = get(x,'Data');
+        if size(dx{1},2) > 1
+            error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.');
+        end
+        if strcmpi(option.strat,'Novelty')
+            if not(option.frame.length.val)
+                % No explicit 'Frame' option: feature-dependent defaults.
+                if strcmpi(option.ana,'Keystrength')
+                    option.frame.length.val = .5;
+                    option.frame.hop.val = .2;
+                elseif strcmpi(option.ana,'AutocorPitch') ...
+                        || strcmpi(option.ana,'Pitch')
+                    option.frame.length.val = .05;
+                    option.frame.hop.val = .01;
+                else
+                    option.frame.length.val = .05;
+                    option.frame.hop.val = 1;
+                end
+            end
+            fr = mirframenow(x,option);
+            if not(isequal(option.mfc,0))
+                fe = mirmfcc(fr,'Rank',option.mfc);
+            elseif strcmpi(option.ana,'Spectrum')
+                fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,...
+                                    'Normal',option.norm,option.band,...
+                                    'Window',option.win);
+            elseif strcmpi(option.ana,'Keystrength')
+                    fe = mirkeystrength(fr);
+            elseif strcmpi(option.ana,'AutocorPitch') ...
+                    || strcmpi(option.ana,'Pitch')
+                % NOTE(review): the pitch analysis is run on x with its
+                % own 'Frame' decomposition, not on fr -- confirm intended.
+                [unused,fe] = mirpitch(x,'Frame');
+            else
+                % No feature requested: the frames themselves are handed
+                % over to mirnovelty.
+                fe = fr;
+            end
+            [n m] = mirnovelty(fe,'Distance',option.distance,...
+                                  'Measure',option.measure,...
+                                  'KernelSize',option.K);
+            p = mirpeaks(n,'Total',option.tot,...
+                           'Contrast',option.cthr,...
+                           'Chrono','NoBegin','NoEnd');
+        elseif strcmpi(option.strat,'HCDF')
+            if not(option.frame.length.val)
+                option.frame.length.val = .743;
+                option.frame.hop.val = 1/8;
+            end
+            fr = mirframenow(x,option);
+            %[df m fe] = mirhcdf(fr);
+            df = mirhcdf(fr);
+            p = mirpeaks(df);
+        elseif strcmpi(option.strat,'RMS')
+            if not(option.frame.length.val)
+                option.frame.length.val = .05;
+                option.frame.hop.val = .5;
+            end
+            fr = mirframenow(x,option);
+            %[df m fe] = mirhcdf(fr);
+            df = mirrms(fr);
+            fp = get(df,'FramePos');
+            p = mircompute(@findsilence,df,fp,option.throff,option.thron);
+        end
+        f = mirsegment(x,p);
+    else
+        % option.strat is not a strategy name: it is either the result of
+        % a previous analysis (mirscalar / other mirdata) or a plain array
+        % of segmentation times.
+        dx = get(x,'Data');
+        dt = get(x,'Time');
+
+        if isa(option.strat,'mirscalar')
+            % Peak positions are used as frame indices into FramePos.
+            ds = get(option.strat,'PeakPos');
+            fp = get(option.strat,'FramePos');
+        elseif isa(option.strat,'mirdata')
+            % Attack positions are preferred; peaks are the fallback.
+            ds = get(option.strat,'AttackPos');
+            if isempty(ds) || isempty(ds{1})
+                ds = get(option.strat,'PeakPos');
+            end
+            xx = get(option.strat,'Pos');
+        else
+            ds = option.strat;
+            fp = cell(1,length(dx));
+        end
+        st = cell(1,length(dx));
+        sx = cell(1,length(dx));
+        cl = cell(1,length(dx));
+        for k = 1:length(dx)
+            dxk = dx{k}{1}; % values in kth audio file
+            dtk = dt{k}{1}; % time positions in kth audio file
+            if isa(option.strat,'mirdata')
+                dsk = ds{k}{1}; % segmentation times in kth audio file
+            else
+                dsk = {ds};
+            end
+            fsk = [];   % the structured array of segmentation times 
+                         % needs to be flattened
+            for j = 1:length(dsk)
+                if isa(option.strat,'mirdata')
+                    dsj = dsk{j}; % segmentation times in jth segment
+                else
+                    dsj = ds;
+                end
+                if not(iscell(dsj))
+                    dsj = {dsj};
+                end
+                % The loop index is deliberately not called m, so that the
+                % output argument m is not clobbered.
+                for ch = 1:length(dsj)
+                    % segmentation times in ch-th bank channel
+                    if isa(option.strat,'mirscalar')
+                        dsm = fp{k}{ch}(1,dsj{ch});
+                    elseif isa(option.strat,'mirdata')
+                        dsm = xx{k}{ch}(dsj{ch});
+                    else
+                        dsm = dsj{ch};
+                    end
+                    if iscell(dsm)
+                        dsm = dsm{1};
+                    end
+                    % Discard segmentation points outside the time span
+                    % of the signal.
+                    dsm(:,find(dsm(1,:) < dtk(1))) = [];
+                    dsm(:,find(dsm(end,:) > dtk(end))) = [];
+                    % It is presupposed here that the segmentations times
+                    % for a given channel are not decomposed per frames,
+                    % because the segmentation of the frame decomposition
+                    % is something that does not seem very clear.
+                    % Practically, the peak picking for instance is based 
+                    % therefore on a frame analysis (such as novelty), and
+                    % segmentation are inferred between these frames...
+                    if size(dsm,2) == 1
+                        dsm = dsm';
+                    end
+                    fsk = [fsk dsm];
+                end
+            end
+
+            % NOTE(review): for a 2-row fsk, sort operates within each
+            % column rather than ordering the columns in time -- confirm.
+            fsk = sort(fsk); % Here is the chronological ordering
+            
+            if isempty(fsk)
+                % No segmentation point: the whole signal is one segment.
+                ffsk = {[0;dtk(end)]};
+                sxk = {dxk};
+                stk = {dtk};
+                n = 1;
+            elseif size(fsk,1) == 1
+                % One row: fsk holds boundaries; consecutive boundaries
+                % delimit contiguous segments covering the whole signal.
+                ffsk = cell(1,length(fsk)+1);
+                ffsk{1} = [dtk(1);fsk(1)];
+                for h = 1:length(fsk)-1
+                    ffsk{h+1} = [fsk(h);fsk(h+1)];
+                end
+                ffsk{end} = [fsk(end);dtk(end)];
+                
+                n = length(ffsk);
+
+                crd = zeros(1,n+1); % the sample positions of the
+                                    % segmentations in the channel
+                crd0 = 0;
+                for i = 1:n
+                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
+                    crd(i) = crd0;
+                end
+                crd(n+1) = size(dxk,1)+1;
+
+                sxk = cell(1,n); % each cell contains a segment
+                stk = cell(1,n); % each cell contains
+                                 % the corresponding time positions
+
+                for i = 1:n
+                    sxk{i} = dxk(crd(i):crd(i+1)-1,1,:);
+                    stk{i} = dtk(crd(i):crd(i+1)-1);
+                end
+
+            elseif size(fsk,1) == 2
+                % Two rows: each column is an explicit [start;end] pair.
+                % Iterate over columns (size(fsk,2)), since length() of a
+                % 2-row matrix is never smaller than 2.
+                ffsk = cell(1,size(fsk,2));
+                for h = 1:size(fsk,2)
+                    ffsk{h} = [fsk(1,h);fsk(2,h)];
+                end
+                n = length(ffsk);
+                crd = zeros(n,2); % the sample positions of the
+                                  % segmentations in the channel:
+                                  % one row per segment, [start end]
+                crd0 = 0;
+                for i = 1:n
+                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
+                    crd(i,1) = crd0;
+                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1);
+                    crd(i,2) = crd0;                    
+                end
+                sxk = cell(1,n); % each cell contains a segment
+                stk = cell(1,n); % each cell contains
+                                 % the corresponding time positions
+                for i = 1:n
+                    sxk{i} = dxk(crd(i,1):crd(i,2),1,:);
+                    stk{i} = dtk(crd(i,1):crd(i,2));
+                end
+            end
+            sx{k} = sxk;
+            st{k} = stk;
+            fp{k} = ffsk;
+            cl{k} = 1:n;
+        end
+        f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl);
+        % Hand back the segmentation input that was used, not the local
+        % option declaration struct named strat.
+        p = option.strat;
+        m = {};
+        fe = {};
+    end
+else
+    % x is not a MIR object (e.g. a file name): load it as audio first and
+    % forward every output of the actual segmentation.
+    [f p m fe] = mirsegment(miraudio(x),varargin{:});
+end 
+
+
+function p = findsilence(d,fp,throff,thron)
+% Returns the start times (first row of fp) of the frames where the curve d
+% rises from below thron to at least thron. An onset is dropped when it
+% shares its "off" point (first later value <= throff) with the previous
+% onset.
+%   d       row vector of values, one per frame
+%   fp      frame positions; fp(1,i) is read for each onset frame i
+%   throff  'Off' threshold
+%   thron   'On' threshold
+padded = [0 d 0];   % zero guards so that curves starting or ending
+                    % above the thresholds still produce crossings
+rising = padded(1:end-1) < thron & padded(2:end) >= thron;
+onsets = find(rising);  % index i in padded is frame i in d
+count = length(onsets);
+offsets = zeros(1,count);
+keep = true(1,count);
+for s = 1:count
+    tail = padded(onsets(s)+1:end);
+    offsets(s) = onsets(s) + find(tail <= throff, 1) - 1;
+    if s > 1 && offsets(s) == offsets(s-1)
+        % Same "off" point as the previous onset: not a new segment.
+        keep(s) = false;
+    end
+end
+p = fp(1,onsets(keep));
\ No newline at end of file