Mercurial > hg > camir-aes2014
comparison toolboxes/MIRtoolbox1.3.2/MIRToolbox/mirsegment.m @ 0:e9a9cd732c1e tip
first hg version after svn
author | wolffd |
---|---|
date | Tue, 10 Feb 2015 15:05:51 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e9a9cd732c1e |
---|---|
function [f,p,m,fe] = mirsegment(x,varargin)
%   f = mirsegment(a) segments an audio signal. It can also be the name of an
%       audio file or 'Folder', for the analysis of the audio files in the
%       current folder. The segmentation of audio signal already decomposed
%       into frames is not available for the moment.
%   f = mirsegment(...,'Novelty') segments using a self-similarity matrix
%           (Foote & Cooper, 2003) (by default)
%       f = mirsegment(...,feature) bases the segmentation strategy on a
%           specific feature.
%           'Spectrum': from FFT spectrum (by default)
%           'MFCC': from MFCCs
%           'Keystrength': from the key strength profile
%           'AutocorPitch': from the autocorrelation function computed as
%               for pitch extraction.
%           The option related to this feature extraction can be specified.
%           Example: mirsegment(...,'Spectrum','Window','bartlett')
%                    mirsegment(...,'MFCC','Rank',1:10)
%                    mirsegment(...,'Keystrength','Weight',.5)
%           These features need to be frame-based, in order to appreciate
%           their temporal evolution. Therefore, the audio signal x is first
%           decomposed into frames. This decomposition can be controlled
%           using the 'Frame' keyword.
%       The options available for the chosen strategies can be specified
%       directly as options of the segment function.
%           Example: mirsegment(a,'Novelty','KernelSize',10)
%   f = mirsegment(...,'HCDF') segments using the Harmonic Change Detection
%           Function (Harte & Sandler, 2006)
%   f = mirsegment(...,'RMS') segments at positions of long silences. A
%           frame decomposed RMS is computed using mirrms (with default
%           options), and segments are selected from temporal positions
%           where the RMS rises to a given 'On' threshold, until temporal
%           positions where the RMS drops back to a given 'Off' threshold.
%       f = mirsegment(...,'Off',t1) specifies the RMS 'Off' threshold.
%           Default value: t1 = .01
%       f = mirsegment(...,'On',t2) specifies the RMS 'On' threshold.
%           Default value: t2 = .02
%
%   f = mirsegment(a,s) segments a using the results of a segmentation
%       analysis s. s can be the peaks detected on an analysis of the
%       audio for instance.
%
%   f = mirsegment(a,v) where v is an array of numbers, segments a using
%       the temporal positions specified in v (in s.)
%
%   Foote, J. & Cooper, M. (2003). Media Segmentation using Self-Similarity
%       Decomposition. In Proc. SPIE Storage and Retrieval for Multimedia
%       Databases, Vol. 5021, pp. 167-75.
%   Harte, C. A. & Sandler, M. B. (2006). Detecting harmonic change in
%       musical audio, in Proceedings of Audio and Music Computing for
%       Multimedia Workshop, Santa Barbara, CA.


%   [f,p] = mirsegment(...) also displays the analysis produced by the chosen
%       strategy.
%           For 'Novelty', p is the novelty curve.
%           For 'HCDF', p is the Harmonic Change Detection Function.
%   [f,p,m] = mirsegment(...) also displays the preliminary analysis
%       undertaken in the chosen strategy.
%           For 'Novelty', m is the similarity matrix.
%           For 'HCDF', m is the tonal centroid.
%   [f,p,m,fe] = mirsegment(...) also displays the temporal evolution of the
%       feature used for the analysis.

%% Option specifications (parsed by miroptions below)

%       f = mirsegment(...,'Novelty')

        mfc.key = {'Rank','MFCC'};
        mfc.type = 'Integers';
        mfc.default = 0;
        mfc.keydefault = 1:13;
    option.mfc = mfc;

        K.key = 'KernelSize';
        K.type = 'Integer';
        K.default = 128;
    option.K = K;

        distance.key = 'Distance';
        distance.type = 'String';
        distance.default = 'cosine';
    option.distance = distance;

        measure.key = {'Measure','Similarity'};
        measure.type = 'String';
        measure.default = 'exponential';
    option.measure = measure;

        tot.key = 'Total';
        tot.type = 'Integer';
        tot.default = Inf;
    option.tot = tot;

        cthr.key = 'Contrast';
        cthr.type = 'Integer';
        cthr.default = .1;
    option.cthr = cthr;

        frame.key = 'Frame';
        frame.type = 'Integer';
        frame.number = 2;
        frame.default = [0 0];
        frame.keydefault = [3 .1];
    option.frame = frame;

        ana.type = 'String';
        ana.choice = {'Spectrum','Keystrength','AutocorPitch','Pitch'};
        ana.default = 0;
    option.ana = ana;

%           f = mirsegment(...,'Spectrum')

            band.choice = {'Mel','Bark','Freq'};
            band.type = 'String';
            band.default = 'Freq';
        option.band = band;

            mi.key = 'Min';
            mi.type = 'Integer';
            mi.default = 0;
        option.mi = mi;

            ma.key = 'Max';
            ma.type = 'Integer';
            ma.default = 0;
        option.ma = ma;

            norm.key = 'Normal';
            norm.type = 'Boolean';
            norm.default = 0;
        option.norm = norm;

            win.key = 'Window';
            win.type = 'String';
            win.default = 'hamming';
        option.win = win;

%       f = mirsegment(...,'RMS')  (silence-based segmentation thresholds)

        throff.key = 'Off';
        throff.type = 'Integer';
        throff.default = .01;
    option.throff = throff;

        thron.key = 'On';
        thron.type = 'Integer';
        thron.default = .02;
    option.thron = thron;

        strat.choice = {'Novelty','HCDF','RMS'}; % should remain as last field
        strat.default = 'Novelty';
        strat.position = 2;
    option.strat = strat;

specif.option = option;


% Secondary outputs default to empty; only some strategies fill them in.
p = {};
m = {};
fe = {};

if isa(x,'mirdesign')
    if not(get(x,'Eval'))
        % During bottom-up construction of the general design

        [unused option] = miroptions(@mirframe,x,specif,varargin);
        type = get(x,'Type');
        f = mirdesign(@mirsegment,x,option,{},struct,type);

        % Keep a segmentation already attached to the input design,
        % otherwise record the strategy requested here.
        sg = get(x,'Segment');
        if not(isempty(sg))
            f = set(f,'Segment',sg);
        else
            f = set(f,'Segment',option.strat);
        end

    else
        % During top-down evaluation initiation

        f = evaleach(x);
        if iscell(f)
            f = f{1};
        end
        p = x;
    end
elseif isa(x,'mirdata')
    [unused option] = miroptions(@mirframe,x,specif,varargin);
    if ischar(option.strat)
        % The strategy is given by name: compute the segmentation points
        % (p), then recurse with them to perform the actual cut.
        dx = get(x,'Data');
        if size(dx{1},2) > 1
            error('ERROR IN MIRSEGMENT: The segmentation of audio signal already decomposed into frames is not available for the moment.');
        end
        if strcmpi(option.strat,'Novelty')
            % A zero frame length means the user did not specify 'Frame':
            % pick defaults suited to the analysed feature.
            if not(option.frame.length.val)
                if strcmpi(option.ana,'Keystrength')
                    option.frame.length.val = .5;
                    option.frame.hop.val = .2;
                elseif strcmpi(option.ana,'AutocorPitch') ...
                        || strcmpi(option.ana,'Pitch')
                    option.frame.length.val = .05;
                    option.frame.hop.val = .01;
                else
                    option.frame.length.val = .05;
                    option.frame.hop.val = 1;
                end
            end
            fr = mirframenow(x,option);
            if not(isequal(option.mfc,0))
                fe = mirmfcc(fr,'Rank',option.mfc);
            elseif strcmpi(option.ana,'Spectrum')
                fe = mirspectrum(fr,'Min',option.mi,'Max',option.ma,...
                                    'Normal',option.norm,option.band,...
                                    'Window',option.win);
            elseif strcmpi(option.ana,'Keystrength')
                fe = mirkeystrength(fr);
            elseif strcmpi(option.ana,'AutocorPitch') ...
                    || strcmpi(option.ana,'Pitch')
                % NOTE(review): this call receives x, not the framed
                % signal fr, so the frame settings chosen above are not
                % passed on here -- confirm this is intended.
                [unused,fe] = mirpitch(x,'Frame');
            else
                fe = fr;
            end
            [n m] = mirnovelty(fe,'Distance',option.distance,...
                                  'Measure',option.measure,...
                                  'KernelSize',option.K);
            p = mirpeaks(n,'Total',option.tot,...
                           'Contrast',option.cthr,...
                           'Chrono','NoBegin','NoEnd');
        elseif strcmpi(option.strat,'HCDF')
            if not(option.frame.length.val)
                option.frame.length.val = .743;
                option.frame.hop.val = 1/8;
            end
            fr = mirframenow(x,option);
            %[df m fe] = mirhcdf(fr);
            df = mirhcdf(fr);
            p = mirpeaks(df);
        elseif strcmpi(option.strat,'RMS')
            if not(option.frame.length.val)
                option.frame.length.val = .05;
                option.frame.hop.val = .5;
            end
            fr = mirframenow(x,option);
            df = mirrms(fr);
            fp = get(df,'FramePos');
            p = mircompute(@findsilence,df,fp,option.throff,option.thron);
        end
        f = mirsegment(x,p);
    else
        % The strategy is a segmentation result (mirscalar / mirdata with
        % peaks or attacks) or a plain array of time positions: cut the
        % signal accordingly.
        dx = get(x,'Data');
        dt = get(x,'Time');

        if isa(option.strat,'mirscalar')
            ds = get(option.strat,'PeakPos');
            fp = get(option.strat,'FramePos');
        elseif isa(option.strat,'mirdata')
            ds = get(option.strat,'AttackPos');
            if isempty(ds) || isempty(ds{1})
                ds = get(option.strat,'PeakPos');
            end
            xx = get(option.strat,'Pos');
        else
            ds = option.strat;
            fp = cell(1,length(dx));
        end
        st = cell(1,length(dx));
        sx = cell(1,length(dx));
        cl = cell(1,length(dx));
        for k = 1:length(dx)
            dxk = dx{k}{1}; % values in kth audio file
            dtk = dt{k}{1}; % time positions in kth audio file
            if isa(option.strat,'mirdata')
                dsk = ds{k}{1}; % segmentation times in kth audio file
            else
                dsk = {ds};
            end
            fsk = [];   % the structured array of segmentation times
                        % needs to be flatten
            for j = 1:length(dsk)
                if isa(option.strat,'mirdata')
                    dsj = dsk{j}; % segmentation times in jth segment
                else
                    dsj = ds;
                end
                if not(iscell(dsj))
                    dsj = {dsj};
                end
                % The channel index is named ch (not m) so that it does
                % not clobber the output argument m.
                for ch = 1:length(dsj)
                    % segmentation times in ch-th bank channel
                    if isa(option.strat,'mirscalar')
                        dsm = fp{k}{ch}(1,dsj{ch});
                    elseif isa(option.strat,'mirdata')
                        dsm = xx{k}{ch}(dsj{ch});
                    else
                        dsm = dsj{ch};
                    end
                    if iscell(dsm)
                        dsm = dsm{1};
                    end
                    % Discard positions outside the signal time span.
                    dsm(:,find(dsm(1,:) < dtk(1))) = [];
                    dsm(:,find(dsm(end,:) > dtk(end))) = [];
                    % It is presupposed here that the segmentations times
                    % for a given channel are not decomposed per frames,
                    % because the segmentation of the frame decomposition
                    % is something that does not seem very clear.
                    % Practically, the peak picking for instance is based
                    % therefore on a frame analysis (such as novelty), and
                    % segmentation are inferred between these frames...
                    if size(dsm,2) == 1
                        dsm = dsm';
                    end
                    fsk = [fsk dsm];
                end
            end

            % Here is the chronological ordering.
            % NOTE(review): for a two-row (start;end) fsk, sort operates
            % within each column and does not reorder the columns -- confirm
            % that callers always provide such intervals in time order.
            fsk = sort(fsk);

            if isempty(fsk)
                % No segmentation point: the whole signal is one segment.
                % NOTE(review): the segment start is 0 here whereas the
                % other branches use dtk(1) -- confirm.
                ffsk = {[0;dtk(end)]};
                sxk = {dxk};
                stk = {dtk};
                n = 1;
            elseif size(fsk,1) == 1
                % Row of boundary times: consecutive boundaries delimit
                % contiguous segments covering the whole signal.
                ffsk = cell(1,length(fsk)+1);
                ffsk{1} = [dtk(1);fsk(1)];
                for h = 1:length(fsk)-1
                    ffsk{h+1} = [fsk(h);fsk(h+1)];
                end
                ffsk{end} = [fsk(end);dtk(end)];

                n = length(ffsk);

                crd = zeros(1,n+1); % the sample positions of the
                                    % segmentations in the channel
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i) = crd0;
                end
                crd(n+1) = size(dxk,1)+1;

                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions

                for i = 1:n
                    sxk{i} = dxk(crd(i):crd(i+1)-1,1,:);
                    stk{i} = dtk(crd(i):crd(i+1)-1);
                end

            elseif size(fsk,1) == 2
                % Two rows: each column is an explicit [start;end] pair.
                ffsk = cell(1,size(fsk,2));
                % Iterate over columns; length(fsk) would return the
                % largest dimension, which is wrong for a single column.
                for h = 1:size(fsk,2)
                    ffsk{h} = [fsk(1,h);fsk(2,h)];
                end
                n = length(ffsk);
                crd = zeros(n,2); % the sample positions of the
                                  % segmentations in the channel
                                  % (one row per segment: [first last])
                crd0 = 0;
                for i = 1:n
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(1),1);
                    crd(i,1) = crd0;
                    crd0 = crd0 + find(dtk(crd0+1:end)>=ffsk{i}(2),1);
                    crd(i,2) = crd0;
                end
                sxk = cell(1,n); % each cell contains a segment
                stk = cell(1,n); % each cell contains
                                 % the corresponding time positions
                for i = 1:n
                    sxk{i} = dxk(crd(i,1):crd(i,2),1,:);
                    stk{i} = dtk(crd(i,1):crd(i,2));
                end
            end
            sx{k} = sxk;
            st{k} = stk;
            fp{k} = ffsk;
            cl{k} = 1:n;
        end
        f = set(x,'Data',sx,'Time',st,'FramePos',fp,'Clusters',cl);
        % NOTE(review): strat is the option *specification* struct defined
        % above, not the parsed value option.strat -- confirm which one is
        % meant to be returned here.
        p = strat;
        m = {};
        fe = {};
    end
else
    % Any other input (e.g. a file name) is first turned into a miraudio
    % object. All four outputs are forwarded so that m and fe are not
    % silently dropped.
    [f,p,m,fe] = mirsegment(miraudio(x),varargin{:});
end
385 | |
386 | |
function p = findsilence(d,fp,throff,thron)
% FINDSILENCE  Start positions of the segments found in an RMS curve.
%   d       row vector of values, one per frame
%   fp      frame positions; row 1 is read at the selected frame indices
%   throff  'Off' threshold: a segment ends at the first value <= throff
%   thron   'On' threshold: a segment starts where the curve rises from
%           below thron to at least thron
%   p       fp(1,:) taken at the frames where a segment starts
%
% A start that shares its end with the previous start (the curve never
% dropped to throff in between) is discarded.

% Zero-pad on both sides so that a curve starting above thron yields a
% start, and so that an end is always found for throff >= 0.
padded = [0 d 0];

rising = padded(1:end-1) < thron & padded(2:end) >= thron;
onsets = find(rising);
count = numel(onsets);

offsets = zeros(1,count);
duplicate = false(1,count);
for s = 1:count
    tail = padded(onsets(s)+1:end);
    offsets(s) = onsets(s) + find(tail <= throff, 1) - 1;
    duplicate(s) = s > 1 && offsets(s) == offsets(s-1);
end

onsets(duplicate) = [];
p = fp(1,onsets);