view toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
line wrap: on
line source
function [f,p,m,fe] = mirsegment(x,varargin)
%   f = mirsegment(a) segments an audio signal. It can also be the name of an
%       audio file or 'Folder', for the analysis of the audio files in the
%       current folder. The segmentation of audio signal already decomposed
%       into frames is not available for the moment.
%   f = mirsegment(...,'Novelty') segments using a self-similarity matrix
%           (Foote & Cooper, 2003)     (by default)
%       f = mirsegment(...,feature) bases the segmentation strategy on a
%           specific feature.
%           'Spectrum': from FFT spectrum (by default)
%           'MFCC': from MFCCs
%           'Keystrength': from the key strength profile
%           'AutocorPitch': from the autocorrelation function computed as
%               for pitch extraction.
%           The option related to this feature extraction can be specified.
%           Example: mirsegment(...,'Spectrum','Window','bartlett')
%                    mirsegment(...,'MFCC','Rank',1:10)
%                    mirsegment(...,'Keystrength','Weight',.5)
%       These features need to be frame-based, in order to appreciate their
%           temporal evolution. Therefore, the audio signal x is first
%           decomposed into frames. This decomposition can be controlled
%           using the 'Frame' keyword.  
%       The options available for the chosen strategies can be specified
%           directly as options of the segment function.
%           Example: mirsegment(a,'Novelty','KernelSize',10)
%   f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection  
%           Function (Harte & Sandler, 2006)
%   f = mirsegment(...,'RMS') segments at positions of long silences. A
%       frame decomposed RMS is computed using mirrms (with default
%       options), and segments are selected from temporal positions
%       where the RMS rises to a given 'On' threshold, until temporal
%       positions where the RMS drops back to a given 'Off' threshold.
%       f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold.
%           Default value: t1 = .01
%       f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold.
%           Default value: t2 = .02
%
%   f = mirsegment(a,s) segments a using the results of a segmentation
%       analysis s. s can be the peaks detected on an analysis of the
%       audio for instance.
%
%   f = mirsegment(a,v) where v is an array of numbers, segments a using
%       the temporal positions specified in v (in s.)
%
%   Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity
%       Decomposition,. In Proc. SPIE Storage and Retrieval for Multimedia
%       Databases, Vol. 5021, pp. 167-75.
%   Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in
%       musical audio, in Proceedings of Audio and Music Computing for 
%       Multimedia Workshop, Santa Barbara, CA.


%   [f,p] = mirsegment(...) also displays the analysis produced by the chosen
%       strategy.
%           For 'Novelty', p is the novelty curve.
%           For 'HCDF', p is the Harmonic Change Detection Function.
%           For mirsegment(a,s) or mirsegment(a,v), p is s or v itself.
%   [f,p,m] = mirsegment(...) also displays the preliminary analysis
%       undertaken in the chosen strategy.
%           For 'Novelty', m is the similarity matrix.
%           For 'HCDF', m is the tonal centroid.
%   [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the
%       feature used for the analysis.
 
%   f = mirsegment(...,'Novelty')

        mfc.key = {'Rank','MFCC'};
        mfc.type = 'Integers';
        mfc.default = 0;
        mfc.keydefault = 1:13;
    option.mfc = mfc;

        K.key = 'KernelSize';
        K.type = 'Integer';
        K.default = 128;
    option.K = K;
    
        distance.key = 'Distance';
        distance.type = 'String';
        distance.default = 'cosine';
    option.distance = distance;

        measure.key = {'Measure','Similarity'};
        measure.type = 'String';
        measure.default = 'exponential';
    option.measure = measure;

        tot.key = 'Total';
        tot.type = 'Integer';
        tot.default = Inf;
    option.tot = tot;

        cthr.key = 'Contrast';
        cthr.type = 'Integer';
        cthr.default = .1;
    option.cthr = cthr;

        frame.key = 'Frame';
        frame.type = 'Integer';
        frame.number = 2;
        frame.default = [0 0];
        frame.keydefault = [3 .1];
    option.frame = frame;

        ana.type = 'String';
        ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'};
        ana.default = 0;
    option.ana = ana;
    
%       f = mirsegment(...,'Spectrum')    
    
            band.choice = {'Mel','Bark','Freq'};
            band.type = 'String';
            band.default = 'Freq';
        option.band = band;

            mi.key = 'Min';
            mi.type = 'Integer';
            mi.default = 0;
        option.mi = mi;

            ma.key = 'Max';
            ma.type = 'Integer';
            ma.default = 0;
        option.ma = ma;

            norm.key = 'Normal';
            norm.type = 'Boolean';
            norm.default = 0;
        option.norm = norm;

            win.key = 'Window';
            win.type = 'String';
            win.default = 'hamming';
        option.win = win;
    
%       f = mirsegment(...,'RMS')    
    
            throff.key = 'Off';
            throff.type = 'Integer';
            throff.default = .01;
        option.throff = throff;

            thron.key = 'On';
            thron.type = 'Integer';
            thron.default = .02;
        option.thron = thron;

        strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field
        strat.default = 'Novelty';
        strat.position = 2;
    option.strat = strat;
   
specif.option = option;


% Secondary outputs default to empty; each branch below fills in what it
% actually computes.
p = {};
m = {};
fe = {};

if isa(x,'mirdesign')
    if not(get(x,'Eval'))
        % During bottom-up construction of the general design

        [unused option] = miroptions(@mirframe,x,specif,varargin);
        type = get(x,'Type');
        f = mirdesign(@mirsegment,x,option,{},struct,type);
        
        % Keep a segmentation already attached to the input design,
        % otherwise record the strategy requested here.
        sg = get(x,'Segment');
        if not(isempty(sg))
            f = set(f,'Segment',sg);
        else
            f = set(f,'Segment',option.strat);
        end
        
    else
        % During top-down evaluation initiation
        
        f = evaleach(x);
        if iscell(f)
            f = f{1};
        end
        p = x;
    end
elseif isa(x,'mirdata')
    [unused option] = miroptions(@mirframe,x,specif,varargin);
    if ischar(option.strat)
        % A strategy name was given: compute the segmentation positions p,
        % then segment x with them through a second call (non-char branch).
        dx = get(x,'Data');
        if size(dx{1},2) > 1
            error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.');
        end
        if strcmpi(option.strat,'Novelty')
            if not(option.frame.length.val)
                % No 'Frame' option given: use feature-specific defaults.
                if strcmpi(option.ana,'Keystrength')
                    option.frame.length.val = .5;
                    option.frame.hop.val = .2;
                elseif strcmpi(option.ana,'AutocorPitch') ...
                        || strcmpi(option.ana,'Pitch')
                    option.frame.length.val = .05;
                    option.frame.hop.val = .01;
                else
                    option.frame.length.val = .05;
                    option.frame.hop.val = 1;
                end
            end
            fr = mirframenow(x,option);
            if not(isequal(option.mfc,0))
                fe = mirmfcc(fr,'Rank',option.mfc);
            elseif strcmpi(option.ana,'Spectrum')
                fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,...
                                    'Normal',option.norm,option.band,...
                                    'Window',option.win);
            elseif strcmpi(option.ana,'Keystrength')
                    fe = mirkeystrength(fr);
            elseif strcmpi(option.ana,'AutocorPitch') ...
                    || strcmpi(option.ana,'Pitch')
                [unused,fe] = mirpitch(x,'Frame');
            else
                fe = fr;
            end
            [n m] = mirnovelty(fe,'Distance',option.distance,...
                                  'Measure',option.measure,...
                                  'KernelSize',option.K);
            p = mirpeaks(n,'Total',option.tot,...
                           'Contrast',option.cthr,...
                           'Chrono','NoBegin','NoEnd');
        elseif strcmpi(option.strat,'HCDF')
            if not(option.frame.length.val)
                option.frame.length.val = .743;
                option.frame.hop.val = 1/8;
            end
            fr = mirframenow(x,option);
            %[df m fe] = mirhcdf(fr);
            df = mirhcdf(fr);
            p = mirpeaks(df);
        elseif strcmpi(option.strat,'RMS')
            if not(option.frame.length.val)
                option.frame.length.val = .05;
                option.frame.hop.val = .5;
            end
            fr = mirframenow(x,option);
            df = mirrms(fr);
            fp = get(df,'FramePos');
            p = mircompute(@findsilence,df,fp,option.throff,option.thron);
        end
        f = mirsegment(x,p);
    else
        % option.strat holds the segmentation itself: a mirscalar or other
        % mirdata analysis result, or a plain array of time positions.
        dx = get(x,'Data');
        dt = get(x,'Time');

        if isa(option.strat,'mirscalar')
            ds = get(option.strat,'PeakPos');
            fp = get(option.strat,'FramePos');
        elseif isa(option.strat,'mirdata')
            ds = get(option.strat,'AttackPos');
            if isempty(ds) || isempty(ds{1})
                ds = get(option.strat,'PeakPos');
            end
            xx = get(option.strat,'Pos');
        else
            ds = option.strat;
            fp = cell(1,length(dx));
        end
        st = cell(1,length(dx));
        sx = cell(1,length(dx));
        cl = cell(1,length(dx));
        for k = 1:length(dx)
            dxk = dx{k}{1}; % values in kth audio file
            dtk = dt{k}{1}; % time positions in kth audio file
            if isa(option.strat,'mirdata')
                dsk = ds{k}{1}; % segmentation times in kth audio file
            else
                dsk = {ds};
            end
            fsk = [];   % the structured array of segmentation times 
                         % needs to be flattened
            for j = 1:length(dsk)
                if isa(option.strat,'mirdata')
                    dsj = dsk{j}; % segmentation times in jth segment
                else
                    dsj = ds;
                end
                if not(iscell(dsj))
                    dsj = {dsj};
                end
                % The loop index is deliberately not called m, so that the
                % output argument m is not clobbered.
                for ch = 1:length(dsj)
                    % segmentation times in ch-th bank channel
                    if isa(option.strat,'mirscalar')
                        dsm = fp{k}{ch}(1,dsj{ch});
                    elseif isa(option.strat,'mirdata')
                        dsm = xx{k}{ch}(dsj{ch});
                    else
                        dsm = dsj{ch};
                    end
                    if iscell(dsm)
                        dsm = dsm{1};
                    end
                    % Discard segmentation times falling outside the time
                    % span of the signal.
                    dsm(:,find(dsm(1,:) < dtk(1))) = [];
                    dsm(:,find(dsm(end,:) > dtk(end))) = [];
                    % It is presupposed here that the segmentations times
                    % for a given channel are not decomposed per frames,
                    % because the segmentation of the frame decomposition
                    % is something that does not seem very clear.
                    % Practically, the peak picking for instance is based 
                    % therefore on a frame analysis (such as novelty), and
                    % segmentation are inferred between these frames...
                    if size(dsm,2) == 1
                        dsm = dsm';
                    end
                    fsk = [fsk dsm];
                end
            end

            % Here is the chronological ordering.
            % NOTE(review): for a two-row fsk, sort works down each column
            % (dimension 1), so the columns themselves are not reordered
            % chronologically -- confirm that callers supply them in order.
            fsk = sort(fsk);
            
            if isempty(fsk)
                % No segmentation point: the whole signal is one segment.
                % NOTE(review): the start time is 0 here whereas the other
                % branches use dtk(1) -- confirm this is intended.
                ffsk = {[0;dtk(end)]};
                sxk = {dxk};
                stk = {dtk};
                n = 1;
            elseif size(fsk,1) == 1
                % One row: each value is a boundary between two
                % consecutive segments.
                ffsk = cell(1,length(fsk)+1);
                ffsk{1} = [dtk(1);fsk(1)];
                for h = 1:length(fsk)-1
                    ffsk{h+1} = [fsk(h);fsk(h+1)];
                end
                ffsk{end} = [fsk(end);dtk(end)];
                
                n = length(ffsk);

                crd = zeros(1,n+1); % the sample positions of the
                                    % segmentations in the channel
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i) = crd0;
                end
                crd(n+1) = size(dxk,1)+1;

                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions

                for i = 1:n
                    sxk{i} = dxk(crd(i):crd(i+1)-1,1,:);
                    stk{i} = dtk(crd(i):crd(i+1)-1);
                end

            elseif size(fsk,1) == 2
                % Two rows: each column is the [start;end] pair of one
                % segment. Iterate over columns explicitly, since
                % length(fsk) is the largest dimension, not the column
                % count.
                ffsk = cell(1,size(fsk,2));
                for h = 1:size(fsk,2)
                    ffsk{h} = [fsk(1,h);fsk(2,h)];
                end
                n = length(ffsk);
                crd = zeros(n,2); % the sample positions of the
                                  % segmentations in the channel:
                                  % one row per segment, [start end]
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i,1) = crd0;
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1);
                    crd(i,2) = crd0;                    
                end
                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions
                for i = 1:n
                    sxk{i} = dxk(crd(i,1):crd(i,2),1,:);
                    stk{i} = dtk(crd(i,1):crd(i,2));
                end
            end
            sx{k} = sxk;
            st{k} = stk;
            fp{k} = ffsk;
            cl{k} = 1:n;
        end
        f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl);
        % Return the segmentation that was actually applied, not the local
        % option-specification struct strat.
        p = option.strat;
        m = {};
        fe = {};
    end
else
    % Any other input (e.g. a file name) is first turned into a miraudio
    % object; all four outputs are forwarded to the caller.
    [f,p,m,fe] = mirsegment(miraudio(x),varargin{:});
end 


function p = findsilence(d,fp,throff,thron)
% FINDSILENCE returns the start positions of the segments delimited by
%   the 'On' and 'Off' thresholds.
%   d      : row vector of values, one per frame.
%   fp     : frame positions; only the first row is read.
%   throff : 'Off' threshold, a segment ends at the first value <= throff.
%   thron  : 'On' threshold, a segment begins where the value rises from
%            below thron to thron or above.
%   p      : first-row entries of fp at the retained segment onsets.

% Zero padding on both sides lets a segment begin on the very first frame
% and end after the very last one.
padded = [0 d 0];

% Upward crossings of the 'On' threshold.
rising = padded(1:end-1) < thron & padded(2:end) >= thron;
onsets = find(rising);
count = numel(onsets);

offsets = zeros(1,count);
keep = true(1,count);
for s = 1:count
    % First position after the onset where the value is back at or below
    % the 'Off' threshold.
    tail = padded(onsets(s)+1:end);
    offsets(s) = onsets(s) + find(tail <= throff, 1) - 1;
    % An onset sharing its ending with the previous one belongs to the
    % same segment: only the earlier onset is kept.
    if s > 1 && offsets(s) == offsets(s-1)
        keep(s) = false;
    end
end

p = fp(1,onsets(keep));