comparison toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip

first hg version after svn
author wolffd
date Tue, 10 Feb 2015 15:05:51 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e9a9cd732c1e
function [f,p,m,fe] = mirsegment(x,varargin)
%   f = mirsegment(a) segments an audio signal. It can also be the name of an
%       audio file or 'Folder', for the analysis of the audio files in the
%       current folder. The segmentation of audio signal already decomposed
%       into frames is not available for the moment.
%   f = mirsegment(...,'Novelty') segments using a self-similarity matrix
%           (Foote & Cooper, 2003) (by default)
%       f = mirsegment(...,feature) bases the segmentation strategy on a
%           specific feature.
%           'Spectrum': from FFT spectrum (by default)
%           'MFCC': from MFCCs
%           'Keystrength': from the key strength profile
%           'AutocorPitch': from the autocorrelation function computed as
%               for pitch extraction.
%           The options related to this feature extraction can be specified.
%           Example: mirsegment(...,'Spectrum','Window','bartlett')
%                    mirsegment(...,'MFCC','Rank',1:10)
%                    mirsegment(...,'Keystrength','Weight',.5)
%       These features need to be frame-based, in order to appreciate their
%           temporal evolution. Therefore, the audio signal x is first
%           decomposed into frames. This decomposition can be controlled
%           using the 'Frame' keyword.
%       The options available for the chosen strategies can be specified
%           directly as options of the segment function.
%           Example: mirsegment(a,'Novelty','KernelSize',10)
%   f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection
%           Function (Harte & Sandler, 2006)
%   f = mirsegment(...,'RMS') segments at positions of long silences. A
%           frame decomposed RMS is computed using mirrms (with default
%           options), and segments are selected from temporal positions
%           where the RMS rises to a given 'On' threshold, until temporal
%           positions where the RMS drops back to a given 'Off' threshold.
%       f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold.
%           Default value: t1 = .01
%       f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold.
%           Default value: t2 = .02
%
%   f = mirsegment(a,s) segments a using the results of a segmentation
%       analysis s. s can be the peaks detected on an analysis of the
%       audio for instance.
%
%   f = mirsegment(a,v) where v is an array of numbers, segments a using
%       the temporal positions specified in v (in s.)
%
%   Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity
%       Decomposition. In Proc. SPIE Storage and Retrieval for Multimedia
%       Databases, Vol. 5021, pp. 167-75.
%   Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in
%       musical audio, in Proceedings of Audio and Music Computing for
%       Multimedia Workshop, Santa Barbara, CA.


%   [f,p] = mirsegment(...) also displays the analysis produced by the chosen
%       strategy.
%           For 'Novelty', p is the novelty curve.
%           For 'HCDF', p is the Harmonic Change Detection Function.
%           For mirsegment(a,s) and mirsegment(a,v), p is the segmentation
%               input (s or v) itself.
%   [f,p,m] = mirsegment(...) also displays the preliminary analysis
%       undertaken in the chosen strategy.
%           For 'Novelty', m is the similarity matrix.
%           For 'HCDF', m is the tonal centroid.
%   [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the
%       feature used for the analysis.

%       f = mirsegment(...,'Novelty')

        mfc.key = {'Rank','MFCC'};
        mfc.type = 'Integers';
        mfc.default = 0;
        mfc.keydefault = 1:13;
    option.mfc = mfc;

        K.key = 'KernelSize';
        K.type = 'Integer';
        K.default = 128;
    option.K = K;

        distance.key = 'Distance';
        distance.type = 'String';
        distance.default = 'cosine';
    option.distance = distance;

        measure.key = {'Measure','Similarity'};
        measure.type = 'String';
        measure.default = 'exponential';
    option.measure = measure;

        tot.key = 'Total';
        tot.type = 'Integer';
        tot.default = Inf;
    option.tot = tot;

        cthr.key = 'Contrast';
        cthr.type = 'Integer';
        cthr.default = .1;
    option.cthr = cthr;

        frame.key = 'Frame';
        frame.type = 'Integer';
        frame.number = 2;
        frame.default = [0 0];
        frame.keydefault = [3 .1];
    option.frame = frame;

        ana.type = 'String';
        ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'};
        ana.default = 0;
    option.ana = ana;

%           f = mirsegment(...,'Spectrum')

            band.choice = {'Mel','Bark','Freq'};
            band.type = 'String';
            band.default = 'Freq';
        option.band = band;

            mi.key = 'Min';
            mi.type = 'Integer';
            mi.default = 0;
        option.mi = mi;

            ma.key = 'Max';
            ma.type = 'Integer';
            ma.default = 0;
        option.ma = ma;

            norm.key = 'Normal';
            norm.type = 'Boolean';
            norm.default = 0;
        option.norm = norm;

            win.key = 'Window';
            win.type = 'String';
            win.default = 'hamming';
        option.win = win;

%       f = mirsegment(...,'RMS')

        throff.key = 'Off';
        throff.type = 'Integer';
        throff.default = .01;
    option.throff = throff;

        thron.key = 'On';
        thron.type = 'Integer';
        thron.default = .02;
    option.thron = thron;

        strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field
        strat.default = 'Novelty';
        strat.position = 2;
    option.strat = strat;

specif.option = option;


% Secondary outputs default to empty; each branch below overrides the ones
% it can actually provide.
p = {};
m = {};
fe = {};

if isa(x,'mirdesign')
    if not(get(x,'Eval'))
        % During bottom-up construction of the general design

        [unused option] = miroptions(@mirframe,x,specif,varargin);
        type = get(x,'Type');
        f = mirdesign(@mirsegment,x,option,{},struct,type);

        % Keep a segmentation already attached to the incoming design;
        % otherwise record the strategy requested here.
        sg = get(x,'Segment');
        if not(isempty(sg))
            f = set(f,'Segment',sg);
        else
            f = set(f,'Segment',option.strat);
        end

    else
        % During top-down evaluation initiation

        f = evaleach(x);
        if iscell(f)
            f = f{1};
        end
        p = x;
    end
elseif isa(x,'mirdata')
    [unused option] = miroptions(@mirframe,x,specif,varargin);
    if ischar(option.strat)
        % A strategy name was given: compute the segmentation positions p
        % with that strategy, then recurse with p as explicit segmentation.
        dx = get(x,'Data');
        if size(dx{1},2) > 1
            error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.');
        end
        if strcmpi(option.strat,'Novelty')
            if not(option.frame.length.val)
                % No frame size specified: use feature-dependent defaults.
                if strcmpi(option.ana,'Keystrength')
                    option.frame.length.val = .5;
                    option.frame.hop.val = .2;
                elseif strcmpi(option.ana,'AutocorPitch') ...
                        || strcmpi(option.ana,'Pitch')
                    option.frame.length.val = .05;
                    option.frame.hop.val = .01;
                else
                    option.frame.length.val = .05;
                    option.frame.hop.val = 1;
                end
            end
            fr = mirframenow(x,option);
            if not(isequal(option.mfc,0))
                fe = mirmfcc(fr,'Rank',option.mfc);
            elseif strcmpi(option.ana,'Spectrum')
                fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,...
                                    'Normal',option.norm,option.band,...
                                    'Window',option.win);
            elseif strcmpi(option.ana,'Keystrength')
                fe = mirkeystrength(fr);
            elseif strcmpi(option.ana,'AutocorPitch') ...
                    || strcmpi(option.ana,'Pitch')
                [unused,fe] = mirpitch(x,'Frame');
            else
                fe = fr;
            end
            [n m] = mirnovelty(fe,'Distance',option.distance,...
                                  'Measure',option.measure,...
                                  'KernelSize',option.K);
            p = mirpeaks(n,'Total',option.tot,...
                           'Contrast',option.cthr,...
                           'Chrono','NoBegin','NoEnd');
        elseif strcmpi(option.strat,'HCDF')
            if not(option.frame.length.val)
                option.frame.length.val = .743;
                option.frame.hop.val = 1/8;
            end
            fr = mirframenow(x,option);
            %[df m fe] = mirhcdf(fr);
            df = mirhcdf(fr);
            p = mirpeaks(df);
        elseif strcmpi(option.strat,'RMS')
            if not(option.frame.length.val)
                option.frame.length.val = .05;
                option.frame.hop.val = .5;
            end
            fr = mirframenow(x,option);
            df = mirrms(fr);
            fp = get(df,'FramePos');
            p = mircompute(@findsilence,df,fp,option.throff,option.thron);
        end
        f = mirsegment(x,p);
    else
        % An explicit segmentation (MIR object or numeric array) was given.
        dx = get(x,'Data');
        dt = get(x,'Time');

        if isa(option.strat,'mirscalar')
            ds = get(option.strat,'PeakPos');
            fp = get(option.strat,'FramePos');
        elseif isa(option.strat,'mirdata')
            ds = get(option.strat,'AttackPos');
            if isempty(ds) || isempty(ds{1})
                ds = get(option.strat,'PeakPos');
            end
            xx = get(option.strat,'Pos');
        else
            ds = option.strat;
            fp = cell(1,length(dx));
        end
        st = cell(1,length(dx));
        sx = cell(1,length(dx));
        cl = cell(1,length(dx));
        for k = 1:length(dx)
            dxk = dx{k}{1}; % values in kth audio file
            dtk = dt{k}{1}; % time positions in kth audio file
            if isa(option.strat,'mirdata')
                dsk = ds{k}{1}; % segmentation times in kth audio file
            else
                dsk = {ds};
            end
            fsk = []; % the structured array of segmentation times
                      % needs to be flatten
            for j = 1:length(dsk)
                if isa(option.strat,'mirdata')
                    dsj = dsk{j}; % segmentation times in jth segment
                else
                    dsj = ds;
                end
                if not(iscell(dsj))
                    dsj = {dsj};
                end
                for m = 1:length(dsj)
                    % segmentation times in mth bank channel
                    if isa(option.strat,'mirscalar')
                        dsm = fp{k}{m}(1,dsj{m});
                    elseif isa(option.strat,'mirdata')
                        dsm = xx{k}{m}(dsj{m});
                    else
                        dsm = dsj{m};
                    end
                    if iscell(dsm)
                        dsm = dsm{1};
                    end
                    % Discard positions falling outside the signal extent.
                    dsm(:,find(dsm(1,:) < dtk(1))) = [];
                    dsm(:,find(dsm(end,:) > dtk(end))) = [];
                    % It is presupposed here that the segmentations times
                    % for a given channel are not decomposed per frames,
                    % because the segmentation of the frame decomposition
                    % is something that does not seem very clear.
                    % Practically, the peak picking for instance is based
                    % therefore on a frame analysis (such as novelty), and
                    % segmentation are inferred between these frames...
                    if size(dsm,2) == 1
                        dsm = dsm';
                    end
                    fsk = [fsk dsm];
                end
            end

            % Here is the chronological ordering (for a row vector; for a
            % 2-row matrix, sort orders each [start;end] column instead).
            fsk = sort(fsk);

            if isempty(fsk)
                % No segmentation point: the whole signal is one segment.
                ffsk = {[0;dtk(end)]};
                sxk = {dxk};
                stk = {dtk};
                n = 1;
            elseif size(fsk,1) == 1
                % One row: boundaries between contiguous segments.
                ffsk = cell(1,length(fsk)+1);
                ffsk{1} = [dtk(1);fsk(1)];
                for h = 1:length(fsk)-1
                    ffsk{h+1} = [fsk(h);fsk(h+1)];
                end
                ffsk{end} = [fsk(end);dtk(end)];

                n = length(ffsk);

                crd = zeros(1,n+1); % the sample positions of the
                                    % segmentations in the channel
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i) = crd0;
                end
                crd(n+1) = size(dxk,1)+1;

                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions

                for i = 1:n
                    sxk{i} = dxk(crd(i):crd(i+1)-1,1,:);
                    stk{i} = dtk(crd(i):crd(i+1)-1);
                end

            elseif size(fsk,1) == 2
                % Two rows: each column is an explicit [start;end] pair.
                nseg = size(fsk,2); % column count (length() would return
                                    % max(size), not the number of pairs)
                ffsk = cell(1,nseg);
                for h = 1:nseg
                    ffsk{h} = [fsk(1,h);fsk(2,h)];
                end
                n = length(ffsk);
                crd = zeros(n,2); % the sample positions of the
                                  % segmentations in the channel
                                  % (row i = [start end] of segment i)
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i,1) = crd0;
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1);
                    crd(i,2) = crd0;
                end
                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions
                for i = 1:n
                    sxk{i} = dxk(crd(i,1):crd(i,2),1,:);
                    stk{i} = dtk(crd(i,1):crd(i,2));
                end
            end
            sx{k} = sxk;
            st{k} = stk;
            fp{k} = ffsk;
            cl{k} = 1:n;
        end
        f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl);
        % Return the segmentation actually used, not the option
        % specification struct 'strat' defined at the top of the function.
        p = option.strat;
        m = {};
        fe = {};
    end
else
    % Any other input (e.g. a file name) is first converted by miraudio.
    % All four outputs are forwarded so that m and fe are not silently lost.
    [f,p,m,fe] = mirsegment(miraudio(x),varargin{:});
end
385
386
function p = findsilence(d,fp,throff,thron)
% FINDSILENCE returns the start positions of the non-silent segments of d.
%   d      : row vector of values (called above with the frame-based RMS)
%   fp     : frame positions; only its first row is read here
%   throff : 'Off' threshold, at or below which a segment ends
%   thron  : 'On' threshold, at or above which a segment starts
%   p      : fp(1,:) taken at the frames where d rises to thron, keeping only
%            the first onset among those sharing the same end position.

% Zero-pad both ends so that a signal starting above 'On' yields an onset
% and every onset is followed by a value at or below 'Off'.
padded = [0 d 0];

% Index i is an onset when padded(i) is below 'On' and padded(i+1) reaches it.
rising = padded(1:end-1) < thron & padded(2:end) >= thron;
onsets = find(rising);
count = length(onsets);

offsets = zeros(1,count);
keep = true(1,count);
for s = 1:count
    tail = padded(onsets(s)+1:end);
    % Position (in padded coordinates) where this segment falls back to 'Off'.
    offsets(s) = onsets(s) + find(tail <= throff, 1) - 1;
    % An onset ending at the same place as the previous one belongs to the
    % same segment (the signal never dropped to 'Off' in between): drop it.
    keep(s) = s == 1 || offsets(s) ~= offsets(s-1);
end

p = fp(1,onsets(keep));