annotate toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
rev   line source
function [f,p,m,fe] = mirsegment(x,varargin)
%   f = mirsegment(a) segments an audio signal. It can also be the name of an
%       audio file or 'Folder', for the analysis of the audio files in the
%       current folder. The segmentation of audio signal already decomposed
%       into frames is not available for the moment.
%   f = mirsegment(...,'Novelty') segments using a self-similarity matrix
%           (Foote & Cooper, 2003) (by default)
%       f = mirsegment(...,feature) bases the segmentation strategy on a
%           specific feature.
%           'Spectrum': from FFT spectrum (by default)
%           'MFCC': from MFCCs
%           'Keystrength': from the key strength profile
%           'AutocorPitch': from the autocorrelation function computed as
%               for pitch extraction.
%           The option related to this feature extraction can be specified.
%           Example: mirsegment(...,'Spectrum','Window','bartlett')
%                    mirsegment(...,'MFCC','Rank',1:10)
%                    mirsegment(...,'Keystrength','Weight',.5)
%       These features need to be frame-based, in order to appreciate their
%           temporal evolution. Therefore, the audio signal x is first
%           decomposed into frames. This decomposition can be controlled
%           using the 'Frame' keyword.
%       The options available for the chosen strategies can be specified
%           directly as options of the segment function.
%           Example: mirsegment(a,'Novelty','KernelSize',10)
%   f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection
%           Function (Harte & Sandler, 2006)
%   f = mirsegment(...,'RMS') segments at positions of long silences. A
%           frame decomposed RMS is computed using mirrms (with default
%           options), and segments are selected from temporal positions
%           where the RMS rises to a given 'On' threshold, until temporal
%           positions where the RMS drops back to a given 'Off' threshold.
%       f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold.
%           Default value: t1 = .01
%       f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold.
%           Default value: t2 = .02
%
%   f = mirsegment(a,s) segments a using the results of a segmentation
%       analysis s. s can be the peaks detected on an analysis of the
%       audio for instance.
%
%   f = mirsegment(a,v) where v is an array of numbers, segments a using
%       the temporal positions specified in v (in s.)
%
%   Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity
%       Decomposition,. In Proc. SPIE Storage and Retrieval for Multimedia
%       Databases, Vol. 5021, pp. 167-75.
%   Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in
%       musical audio, in Proceedings of Audio and Music Computing for
%       Multimedia Workshop, Santa Barbara, CA.


%   [f,p] = mirsegment(...) also displays the analysis produced by the chosen
%       strategy.
%           For 'Novelty', p is the novelty curve.
%           For 'HCDF', p is the Harmonic Change Detection Function.
%   [f,p,m] = mirsegment(...) also displays the preliminary analysis
%       undertaken in the chosen strategy.
%           For 'Novelty', m is the similarity matrix.
%           For 'HCDF', m is the tonal centroid.
%   [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the
%       feature used for the analysis.

%       f = mirsegment(...,'Novelty')

        mfc.key = {'Rank','MFCC'};
        mfc.type = 'Integers';
        mfc.default = 0;
        mfc.keydefault = 1:13;
    option.mfc = mfc;

        K.key = 'KernelSize';
        K.type = 'Integer';
        K.default = 128;
    option.K = K;

        distance.key = 'Distance';
        distance.type = 'String';
        distance.default = 'cosine';
    option.distance = distance;

        measure.key = {'Measure','Similarity'};
        measure.type = 'String';
        measure.default = 'exponential';
    option.measure = measure;

        tot.key = 'Total';
        tot.type = 'Integer';
        tot.default = Inf;
    option.tot = tot;

        cthr.key = 'Contrast';
        cthr.type = 'Integer';
        cthr.default = .1;
    option.cthr = cthr;

        frame.key = 'Frame';
        frame.type = 'Integer';
        frame.number = 2;
        frame.default = [0 0];      % [0 0] means "not specified by the user"
        frame.keydefault = [3 .1];
    option.frame = frame;

        ana.type = 'String';
        ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'};
        ana.default = 0;
    option.ana = ana;

%           f = mirsegment(...,'Spectrum')

        band.choice = {'Mel','Bark','Freq'};
        band.type = 'String';
        band.default = 'Freq';
    option.band = band;

        mi.key = 'Min';
        mi.type = 'Integer';
        mi.default = 0;
    option.mi = mi;

        ma.key = 'Max';
        ma.type = 'Integer';
        ma.default = 0;
    option.ma = ma;

        norm.key = 'Normal';
        norm.type = 'Boolean';
        norm.default = 0;
    option.norm = norm;

        win.key = 'Window';
        win.type = 'String';
        win.default = 'hamming';
    option.win = win;

%       f = mirsegment(...,'RMS')   (silence-based segmentation)

        throff.key = 'Off';
        throff.type = 'Integer';
        throff.default = .01;
    option.throff = throff;

        thron.key = 'On';
        thron.type = 'Integer';
        thron.default = .02;
    option.thron = thron;

        strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field
        strat.default = 'Novelty';
        strat.position = 2;
    option.strat = strat;

specif.option = option;


% Secondary outputs default to empty; only some branches fill them in.
p = {};
m = {};
fe = {};

if isa(x,'mirdesign')
    if not(get(x,'Eval'))
        % During bottom-up construction of the general design

        [unused,option] = miroptions(@mirframe,x,specif,varargin);
        type = get(x,'Type');
        f = mirdesign(@mirsegment,x,option,{},struct,type);

        % Keep a segmentation already attached to the input design,
        % otherwise record the strategy requested here.
        sg = get(x,'Segment');
        if not(isempty(sg))
            f = set(f,'Segment',sg);
        else
            f = set(f,'Segment',option.strat);
        end

    else
        % During top-down evaluation initiation

        f = evaleach(x);
        if iscell(f)
            f = f{1};
        end
        p = x;
    end
elseif isa(x,'mirdata')
    [unused,option] = miroptions(@mirframe,x,specif,varargin);
    if ischar(option.strat)
        % A strategy name was given: compute the segmentation positions p,
        % then call mirsegment again with these positions.
        dx = get(x,'Data');
        if size(dx{1},2) > 1
            error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.');
        end
        if strcmpi(option.strat,'Novelty')
            if not(option.frame.length.val)
                % No frame size given: pick one depending on the feature.
                if strcmpi(option.ana,'Keystrength')
                    option.frame.length.val = .5;
                    option.frame.hop.val = .2;
                elseif strcmpi(option.ana,'AutocorPitch') ...
                        || strcmpi(option.ana,'Pitch')
                    option.frame.length.val = .05;
                    option.frame.hop.val = .01;
                else
                    option.frame.length.val = .05;
                    option.frame.hop.val = 1;
                end
            end
            fr = mirframenow(x,option);
            if not(isequal(option.mfc,0))
                fe = mirmfcc(fr,'Rank',option.mfc);
            elseif strcmpi(option.ana,'Spectrum')
                fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,...
                                    'Normal',option.norm,option.band,...
                                    'Window',option.win);
            elseif strcmpi(option.ana,'Keystrength')
                fe = mirkeystrength(fr);
            elseif strcmpi(option.ana,'AutocorPitch') ...
                    || strcmpi(option.ana,'Pitch')
                % NOTE(review): this branch calls mirpitch on x with its
                % own 'Frame' default instead of using fr -- confirm intent.
                [unused,fe] = mirpitch(x,'Frame');
            else
                fe = fr;
            end
            [n m] = mirnovelty(fe,'Distance',option.distance,...
                                  'Measure',option.measure,...
                                  'KernelSize',option.K);
            p = mirpeaks(n,'Total',option.tot,...
                           'Contrast',option.cthr,...
                           'Chrono','NoBegin','NoEnd');
        elseif strcmpi(option.strat,'HCDF')
            if not(option.frame.length.val)
                option.frame.length.val = .743;
                option.frame.hop.val = 1/8;
            end
            fr = mirframenow(x,option);
            %[df m fe] = mirhcdf(fr);
            df = mirhcdf(fr);
            p = mirpeaks(df);
        elseif strcmpi(option.strat,'RMS')
            if not(option.frame.length.val)
                option.frame.length.val = .05;
                option.frame.hop.val = .5;
            end
            fr = mirframenow(x,option);
            df = mirrms(fr);
            fp = get(df,'FramePos');
            p = mircompute(@findsilence,df,fp,option.throff,option.thron);
        end
        f = mirsegment(x,p);
    else
        % Segmentation positions were given, either as the result of an
        % analysis (mirscalar / mirdata) or as a numeric array of times.
        dx = get(x,'Data');
        dt = get(x,'Time');

        if isa(option.strat,'mirscalar')
            ds = get(option.strat,'PeakPos');
            fp = get(option.strat,'FramePos');
        elseif isa(option.strat,'mirdata')
            ds = get(option.strat,'AttackPos');
            if isempty(ds) || isempty(ds{1})
                ds = get(option.strat,'PeakPos');
            end
            xx = get(option.strat,'Pos');
        else
            ds = option.strat;
            fp = cell(1,length(dx));
        end
        st = cell(1,length(dx));
        sx = cell(1,length(dx));
        cl = cell(1,length(dx));
        for k = 1:length(dx)
            dxk = dx{k}{1}; % values in kth audio file
            dtk = dt{k}{1}; % time positions in kth audio file
            if isa(option.strat,'mirdata')
                dsk = ds{k}{1}; % segmentation times in kth audio file
            else
                dsk = {ds};
            end
            fsk = []; % the structured array of segmentation times
                      % needs to be flattened
            for j = 1:length(dsk)
                if isa(option.strat,'mirdata')
                    dsj = dsk{j}; % segmentation times in jth segment
                else
                    dsj = ds;
                end
                if not(iscell(dsj))
                    dsj = {dsj};
                end
                % NOTE: the loop variable m temporarily reuses the name of
                % the output m, which is reset after the loop over k.
                for m = 1:length(dsj)
                    % segmentation times in mth bank channel
                    if isa(option.strat,'mirscalar')
                        dsm = fp{k}{m}(1,dsj{m});
                    elseif isa(option.strat,'mirdata')
                        dsm = xx{k}{m}(dsj{m});
                    else
                        dsm = dsj{m};
                    end
                    if iscell(dsm)
                        dsm = dsm{1};
                    end
                    % A column vector is a plain list of times: turn it
                    % into a row BEFORE filtering. Otherwise the range
                    % tests below would only inspect its first / last
                    % element and could discard every boundary at once.
                    if size(dsm,2) == 1
                        dsm = dsm';
                    end
                    % Discard boundaries lying outside the signal.
                    dsm(:,dsm(1,:) < dtk(1)) = [];
                    dsm(:,dsm(end,:) > dtk(end)) = [];
                    % It is presupposed here that the segmentations times
                    % for a given channel are not decomposed per frames,
                    % because the segmentation of the frame decomposition
                    % is something that does not seem very clear.
                    % Practically, the peak picking for instance is based
                    % therefore on a frame analysis (such as novelty), and
                    % segmentation are inferred between these frames...
                    if size(dsm,2) == 1
                        dsm = dsm';
                    end
                    fsk = [fsk dsm];
                end
            end

            % Here is the chronological ordering.
            % NOTE(review): for a two-row fsk, sort operates on each
            % column separately, not across columns -- confirm intent.
            fsk = sort(fsk);

            if isempty(fsk)
                % No boundary: the whole signal is a single segment.
                ffsk = {[0;dtk(end)]};
                sxk = {dxk};
                stk = {dtk};
                n = 1;
            elseif size(fsk,1) == 1
                % One row: each value is a boundary between two segments.
                ffsk = cell(1,length(fsk)+1);
                ffsk{1} = [dtk(1);fsk(1)];
                for h = 1:length(fsk)-1
                    ffsk{h+1} = [fsk(h);fsk(h+1)];
                end
                ffsk{end} = [fsk(end);dtk(end)];

                n = length(ffsk);

                crd = zeros(1,n+1); % the sample positions of the
                                    % segmentations in the channel
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i) = crd0;
                end
                crd(n+1) = size(dxk,1)+1;

                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions

                for i = 1:n
                    sxk{i} = dxk(crd(i):crd(i+1)-1,1,:);
                    stk{i} = dtk(crd(i):crd(i+1)-1);
                end

            elseif size(fsk,1) == 2
                % Two rows: each column is a [start;end] pair.
                nseg = size(fsk,2);
                ffsk = cell(1,nseg);
                for h = 1:nseg
                    ffsk{h} = [fsk(1,h);fsk(2,h)];
                end
                n = length(ffsk);
                crd = zeros(n,2); % the sample positions of the
                                  % segmentations in the channel
                                  % (one row per segment: [start end])
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i,1) = crd0;
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1);
                    crd(i,2) = crd0;
                end
                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions
                for i = 1:n
                    sxk{i} = dxk(crd(i,1):crd(i,2),1,:);
                    stk{i} = dtk(crd(i,1):crd(i,2));
                end
            else
                % Without this branch sxk/stk/ffsk/n would be undefined
                % (or silently reused from the previous audio file).
                error('ERROR IN MIRSEGMENT: The segmentation positions should be given as a vector of times or as a two-row matrix of start and end times.');
            end
            sx{k} = sxk;
            st{k} = stk;
            fp{k} = ffsk;
            cl{k} = 1:n;
        end
        f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl);
        % NOTE(review): strat is the option specification struct defined
        % above, not option.strat -- confirm that this is the intended
        % second output.
        p = strat;
        m = {};
        fe = {};
    end
else
    % Any other input (e.g. a file name) is first turned into a miraudio
    % object. All four outputs are forwarded so that m and fe are not lost.
    [f,p,m,fe] = mirsegment(miraudio(x),varargin{:});
end
wolffd@0 385
wolffd@0 386
function p = findsilence(d,fp,throff,thron)
% FINDSILENCE returns the starting positions of the non-silent segments.
%   d      : row vector of values, one per frame
%   fp     : frame positions; p is read from its first row
%   throff : a segment ends at the first value that is <= throff
%   thron  : a segment starts where the value rises from below thron
%            to thron or above
%   p      : first row of fp at the frames where the retained segments start

% Pad with zeros on both sides so that a signal starting above the 'On'
% threshold still yields an onset, and every onset is followed by a value
% of 0 to be compared with the 'Off' threshold.
padded = [0 d 0];

rising = padded(1:end-1) < thron & padded(2:end) >= thron;
onsets = find(rising);
count = length(onsets);

offsets = zeros(1,count);
keep = true(1,count);
for s = 1:count
    tail = padded(onsets(s)+1:end);
    offsets(s) = onsets(s) + find(tail <= throff, 1) - 1;
    % Two successive onsets sharing the same offset belong to the same
    % segment: only the first one is kept.
    keep(s) = (s == 1) || (offsets(s) ~= offsets(s-1));
end

p = fp(1,onsets(keep));
wolffd@0 395 if i>1 && endseg(i) == endseg(i-1)
wolffd@0 396 removed = [removed i];
wolffd@0 397 end
wolffd@0 398 end
wolffd@0 399 begseg(removed) = [];
wolffd@0 400 %endseg(removed) = [];
wolffd@0 401 %endseg(end) = min(endseg(end),length(d)+1);
wolffd@0 402 p = fp(1,begseg); %; fp(2,endseg-1)];