Mercurial > hg > emotion-detection-top-level
comparison Code/Descriptors/Matlab/MPEG7/FromWeb/VoiceSauce/shrp.m @ 4:92ca03a8fa99 tip
Update to ICASSP 2013 benchmark
author | Dawn Black |
---|---|
date | Wed, 13 Feb 2013 11:02:39 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:e1cfa7765647 | 4:92ca03a8fa99 |
---|---|
1 function [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs,F0MinMax,frame_length,timestep,SHR_Threshold,ceiling,med_smooth,CHECK_VOICING) | |
2 % SHRP - a pitch determination algorithm based on Subharmonic-to-Harmonic Ratio (SHR) | |
3 % [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs[,F0MinMax,frame_length,TimeStep,SHR_Threshold,Ceiling,med_smooth,CHECK_VOICING]) | |
4 % | |
5 % Input parameters (There are 9): | |
6 % | |
7 % Y: Input data | |
8 % Fs: Sampling frequency (e.g., 16000 Hz) | |
9 % F0MinMax: 2-d array specifies the F0 range. [minf0 maxf0], default: [50 550] | |
10 % Quick solutions: | |
11 % For male speech: [50 250] | |
12 % For female speech: [120 400] | |
13 % frame_length: length of each frame in millisecond (default: 40 ms) | |
14 % TimeStep: Interval for updating short-term analysis in millisecond (default: 10 ms) | |
15 % SHR_Threshold: Subharmonic-to-harmonic ratio threshold in the range of [0,1] (default: 0.4). | |
16 % If the estimated SHR is greater than the threshold, the subharmonic is regarded as F0 candidate, | |
17 % Otherwise, the harmonic is favored. | |
18 % Ceiling: Upper bound of the frequencies that are used for estimating pitch. (default: 1250 Hz) | |
19 % med_smooth: the order of the median smoothing (default: 0 - no smoothing); | |
20 % CHECK_VOICING: check voicing. Current voicing determination algorithm is kind of crude. | |
21 % 0: no voicing checking (default) | |
22 % 1: voicing checking | |
23 % Output parameters: | |
24 % | |
25 % f0_time: an array stores the times for the F0 points | |
26 % f0_value: an array stores F0 values | |
27 % SHR: an array stores subharmonic-to-harmonic ratio for each frame | |
28 % f0_candidates: a matrix stores the f0 candidates for each frames, currently two f0 values generated for each frame. | |
29 % Each row (a frame) contains two values in increasing order, i.e., [low_f0 higher_f0]. | |
30 % For SHR=0, the first f0 is 0. The purpose of this is that when you want to test different SHR | |
31 % thresholds, you don't need to re-run the whole algorithm. You can choose to select the lower or higher | |
32 % value based on the shr value of this frame. | |
33 % | |
34 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
35 % Permission to use, copy, modify, and distribute this software without fee is hereby granted | |
36 % FOR RESEARCH PURPOSES only, provided that this copyright notice appears in all copies | |
37 % and in all supporting documentation. | |
38 % | |
39 % This program is distributed in the hope that it will be useful, | |
40 % but WITHOUT ANY WARRANTY; without even the implied warranty of | |
41 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
42 % | |
43 % For details of the algorithm, please see | |
44 % Sun, X.,"Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio" To appear in the Proc. of ICASSP2002, Orlando, Florida, May 13 -17, 2002. | |
45 % For update information, please check http://mel.speech.nwu.edu/sunxj/pda.htm. | |
46 % | |
47 % Copyright (c) 2001 Xuejing Sun | |
48 % Department of Communication Sciences and Disorders | |
49 % Northwestern University, USA | |
50 % sunxj@northwestern.edu | |
51 % | |
52 % Update history: | |
53 % Added "f0_candidates" as a return value, Dec. 21, 2001 | |
54 % Changed default median smoothing order from 5 to 0, Jan. 9, 2002 | |
55 % Modified the GetLogSpectrum function, bug fixed due to Herbert Griebel. Jan. 15, 2002 | |
56 % Several minor changes. Jan. 15,2002. | |
57 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
58 | |
59 %t0 = clock; | |
60 %------------------ Get input arguments values and set default values ------------------------- | |
61 if nargin<9 | |
62 CHECK_VOICING=0; | |
63 end | |
64 if nargin<8 | |
65 med_smooth=0; | |
66 end | |
67 if nargin<7 | |
68 ceiling=1250; | |
69 end | |
70 if nargin<6 | |
71 SHR_Threshold=0.4; % subharmonic to harmonic ratio threshold | |
72 end | |
73 if nargin<5 | |
74 timestep=10; | |
75 %timestep=6.4; | |
76 end | |
77 if nargin<4 | |
78 frame_length=40; % default 40 ms | |
79 end | |
80 if nargin<3 | |
81 minf0=50; | |
82 maxf0=500; | |
83 else | |
84 minf0=F0MinMax(1); | |
85 maxf0=F0MinMax(2); | |
86 end | |
87 if nargin<2 | |
88 error('Sampling rate must be supplied!') | |
89 end | |
90 segmentduration=frame_length; | |
91 | |
92 %------------------- pre-processing input signal ------------------------- | |
93 Y=Y-mean(Y); % remove DC component | |
94 Y=Y/max(abs(Y)); %normalization | |
95 total_len=length(Y); | |
96 %------------------ specify some algorithm-specific thresholds ------------------------- | |
97 interpolation_depth=0.5; % for FFT length | |
98 %--------------- derived thresholds specific to the algorithm ------------------------------- | |
99 maxlogf=log2(maxf0/2); | |
100 minlogf=log2(minf0/2); % the search region to compute SHR is as low as 0.5 minf0. | |
101 N=floor(ceiling/minf0); % maximum number harmonics | |
102 m=mod(N,2); | |
103 N=N-m; | |
104 N=N*4; %In fact, in most cases we don't need to multiply N by 4 and get equally good results yet much faster. | |
105 % derive how many frames we have based on segment length and timestep. | |
106 segmentlen=round(segmentduration*(Fs/1000)); | |
107 inc=round(timestep*(Fs/1000)); | |
108 nf = fix((total_len-segmentlen+inc)/inc); | |
109 n=(1:nf); | |
110 f0_time=((n-1)*timestep+segmentduration/2)'; % anchor time for each frame, the middle point | |
111 %f0_time=((n-1)*timestep)'; % anchor time for each frame, starting from zero | |
112 %------------------ determine FFT length --------------------- | |
113 fftlen=1; | |
114 while (fftlen < segmentlen * (1 +interpolation_depth)) | |
115 fftlen =fftlen* 2; | |
116 end | |
117 %----------------- derive linear and log frequency scale ---------------- | |
118 frequency=Fs*(1:fftlen/2)/fftlen; % we ignore frequency 0 here since we need to do log transformation later and won't use it anyway. | |
119 limit=find(frequency>=ceiling); | |
120 limit=limit(1); % only the first is useful | |
121 frequency=frequency(1:limit); | |
122 logf=log2(frequency); | |
123 %% clear some variables to save memory | |
124 clear frequency; | |
125 min_bin=logf(end)-logf(end-1); % the minimum distance between two points after interpolation | |
126 shift=log2(N); % shift distance | |
127 shift_units=round(shift/min_bin); %the number of unit on the log x-axis | |
128 i=(2:N); | |
129 % ------------- the followings are universal for all the frames ---------------%% | |
130 startpos=shift_units+1-round(log2(i)/min_bin); % find out all the start position of each shift | |
131 index=find(startpos<1); % find out those positions that are less than 1 | |
132 startpos(index)=1; % set them to 1 since the array index starts from 1 in matlab | |
133 interp_logf=logf(1):min_bin:logf(end); | |
134 interp_len=length(interp_logf);% new length of the amplitude spectrum after interpolation | |
135 totallen=shift_units+interp_len; | |
136 endpos=startpos+interp_len-1; %% note that : totallen=shift_units+interp_len; | |
137 index=find(endpos>totallen); | |
138 endpos(index)=totallen; % make sure all the end positions not greater than the totoal length of the shift spectrum | |
139 | |
140 newfre=2.^(interp_logf); % the linear Hz scale derived from the interpolated log scale | |
141 upperbound=find(interp_logf>=maxlogf); % find out the index of upper bound of search region on the log frequency scale. | |
142 upperbound=upperbound(1);% only the first element is useful | |
143 lowerbound=find(interp_logf>=minlogf); % find out the index of lower bound of search region on the log frequency scale. | |
144 lowerbound=lowerbound(1); | |
145 | |
146 %----------------- segmentation of speech ------------------------------ | |
147 curpos=round(f0_time/1000*Fs); % position for each frame in terms of index, not time | |
148 frames=toframes(Y,curpos,segmentlen,'hamm'); | |
149 [nf framelen]=size(frames); | |
150 clear Y; | |
151 %----------------- initialize vectors for f0 time, f0 values, and SHR | |
152 f0_value=zeros(nf,1); | |
153 SHR=zeros(nf,1); | |
154 f0_time=f0_time(1:nf); | |
155 f0_candidates=zeros(nf,2); | |
156 %----------------- voicing determination ---------------------------- | |
157 if (CHECK_VOICING) | |
158 NoiseFloor=sum(frames(1,:).^2); | |
159 voicing=vda(frames,segmentduration/1000,NoiseFloor); | |
160 else | |
161 voicing=ones(nf,1); | |
162 end | |
163 %------------------- the main loop ----------------------- | |
164 curf0=0; | |
165 cur_SHR=0; | |
166 cur_cand1=0; | |
167 cur_cand2=0; | |
168 for n=1:nf | |
169 segment=frames(n,:); | |
170 curtime=f0_time(n); | |
171 if voicing(n)==0 | |
172 curf0=0; | |
173 cur_SHR=0; | |
174 else | |
175 [log_spectrum]=GetLogSpectrum(segment,fftlen,limit,logf,interp_logf); | |
176 [peak_index,cur_SHR,shshift,all_peak_indices]=ComputeSHR(log_spectrum,min_bin,startpos,endpos,lowerbound,upperbound,N,shift_units,SHR_Threshold); | |
177 if (peak_index==-1) % -1 indicates a possibly unvoiced frame, if CHECK_VOICING, set f0 to 0, otherwise uses previous value | |
178 if (CHECK_VOICING) | |
179 curf0=0; | |
180 cur_cand1=0; | |
181 cur_cand2=0; | |
182 end | |
183 | |
184 else | |
185 curf0=newfre(peak_index)*2; | |
186 if (curf0>maxf0) | |
187 curf0=curf0/2; | |
188 end | |
189 if (length(all_peak_indices)==1) | |
190 cur_cand1=0; | |
191 cur_cand2=newfre(all_peak_indices(1))*2; | |
192 else | |
193 cur_cand1=newfre(all_peak_indices(1))*2; | |
194 cur_cand2=newfre(all_peak_indices(2))*2; | |
195 end | |
196 if (cur_cand1>maxf0) | |
197 cur_cand1=cur_cand1/2; | |
198 end | |
199 if (cur_cand2>maxf0) | |
200 cur_cand2=cur_cand2/2; | |
201 end | |
202 if (CHECK_VOICING) | |
203 voicing(n)=postvda(segment,curf0,Fs); | |
204 if (voicing(n)==0) | |
205 curf0=0; | |
206 end | |
207 end | |
208 end | |
209 end | |
210 f0_value(n)=curf0; | |
211 SHR(n)=cur_SHR; | |
212 f0_candidates(n,1)=cur_cand1; | |
213 f0_candidates(n,2)=cur_cand2; | |
214 DEBUG=0; | |
215 if DEBUG | |
216 figure(9) | |
217 %subplot(5,1,1),plot(segment,'*') | |
218 %title('windowed waveform segment') | |
219 subplot(2,2,1),plot(interp_logf,log_spectrum,'k*') | |
220 title('(a)') | |
221 grid | |
222 %('spectrum on log frequency scale') | |
223 %grid | |
224 shsodd=sum(shshift(1:2:N-1,:),1); | |
225 shseven=sum(shshift(2:2:N,:),1); | |
226 difference=shseven-shsodd; | |
227 subplot(2,2,2),plot(interp_logf,shseven,'k*') | |
228 title('(b)') | |
229 %title('even') | |
230 grid | |
231 subplot(2,2,3),plot(interp_logf,shsodd,'k*') | |
232 title('(c)') | |
233 %title('odd') | |
234 grid | |
235 subplot(2,2,4), plot(interp_logf,difference,'k*') | |
236 title('(d)') | |
237 %title('difference (even-odd)') | |
238 grid | |
239 curtime | |
240 curf0 | |
241 cur_SHR | |
242 pause | |
243 end | |
244 end | |
245 %-------------- post-processing ------------------------------- | |
246 if (med_smooth > 0) | |
247 f0_value=medsmooth(f0_value,med_smooth); | |
248 end | |
249 %f0=linsmooth(f0,5); % this is really optional. | |
250 | |
251 %***************************************************************************************** | |
252 %-------------- do FFT and get log spectrum --------------------------------- | |
253 %***************************************************************************************** | |
254 function [interp_amplitude]=GetLogSpectrum(segment,fftlen,limit,logf,interp_logf) | |
255 Spectra=fft(segment,fftlen); | |
256 amplitude = abs(Spectra(1:fftlen/2+1)); % fftlen is always even here. Note: change fftlen/2 to fftlen/2+1. bug fixed due to Herbert Griebel | |
257 amplitude=amplitude(2:limit+1); % ignore the zero frequency component | |
258 %amplitude=log10(amplitude+1); | |
259 interp_amplitude=interp1(logf,amplitude,interp_logf,'linear'); | |
260 interp_amplitude=interp_amplitude-min(interp_amplitude); | |
261 %***************************************************************************************** | |
262 %-------------- compute subharmonic-to-harmonic ratio --------------------------------- | |
263 %***************************************************************************************** | |
264 function [peak_index,SHR,shshift,index]=ComputeSHR(log_spectrum,min_bin,startpos,endpos,lowerbound,upperbound,N,shift_units,SHR_Threshold) | |
265 % computeshr: compute subharmonic-to-harmonic ratio for a short-term signal | |
266 len_spectrum=length(log_spectrum); | |
267 totallen=shift_units+len_spectrum; | |
268 shshift=zeros(N,totallen); %initialize the subharmonic shift matrix; each row corresponds to a shift version | |
269 shshift(1,(totallen-len_spectrum+1):totallen)=log_spectrum; % place the spectrum at the right end of the first row | |
270 % note that here startpos and endpos has N-1 rows, so we start from 2 | |
271 % the first row in shshift is the original log spectrum | |
272 for i=2:N | |
273 shshift(i,startpos(i-1):endpos(i-1))=log_spectrum(1:endpos(i-1)-startpos(i-1)+1); % store each shifted sequence | |
274 end | |
275 shshift=shshift(:,shift_units+1:totallen); % we don't need the stuff smaller than shift_units | |
276 shsodd=sum(shshift(1:2:N-1,:),1); | |
277 shseven=sum(shshift(2:2:N,:),1); | |
278 difference=shseven-shsodd; | |
279 % peak picking process | |
280 SHR=0; | |
281 [mag,index]=twomax(difference,lowerbound,upperbound,min_bin); % only find two maxima | |
282 % first mag is always the maximum, the second, if there is, is the second max | |
283 NumPitchCandidates=length(mag); | |
284 if (NumPitchCandidates == 1) % this is possible, mainly due to we put a constraint on search region, i.e., f0 range | |
285 if (mag <=0) % this must be an unvoiced frame | |
286 peak_index=-1; | |
287 return | |
288 end | |
289 peak_index=index; | |
290 SHR=0; | |
291 else | |
292 SHR=(mag(1)-mag(2))/(mag(1)+mag(2)); | |
293 if (SHR<=SHR_Threshold) | |
294 peak_index=index(2); % subharmonic is weak, so favor the harmonic | |
295 else | |
296 peak_index=index(1); % subharmonic is strong, so favor the subharmonic as F0 | |
297 end | |
298 end | |
299 %%***************************************************************************************** | |
300 %****************** this function only finds two maximum peaks ************************ | |
301 function [mag,index]=twomax(x,lowerbound,upperbound,unitlen) | |
302 %In descending order, the magnitude and index are returned in [mag,index], respectively | |
303 lenx=length(x); | |
304 halfoct=round(1/unitlen/2); % compute the number of units of half octave. log2(2)=1; 1/unitlen | |
305 [mag,index]=max(x(lowerbound:upperbound));%find the maximum value | |
306 if (mag<=0) | |
307 % error('max is smaller than zero!') % return it! | |
308 return | |
309 end | |
310 index=index+lowerbound-1; | |
311 harmonics=2; | |
312 LIMIT=0.0625; % 1/8 octave | |
313 startpos=index+round(log2(harmonics-LIMIT)/unitlen); | |
314 if (startpos<=min(lenx,upperbound)) | |
315 endpos=index+round(log2(harmonics+LIMIT)/unitlen); % for example, 100hz-200hz is one octave, 200hz-250hz is 1/4octave | |
316 if (endpos> min(lenx,upperbound)) | |
317 endpos=min(lenx,upperbound); | |
318 end | |
319 [mag1,index1]=max(x(startpos:endpos));%find the maximum value at right side of last maximum | |
320 if (mag1>0) | |
321 index1=index1+startpos-1; | |
322 mag=[mag;mag1]; | |
323 index=[index;index1]; | |
324 end | |
325 end | |
326 %***************************************************************************************** | |
327 %%---------------------------------------------------------------------------------------- | |
328 %%-----------------------------------voicing determination ------------------------------- | |
329 function voice=vda(x,segmentdur,noisefloor,minzcr) | |
330 %voice=vda(x) determine whether the segment is voiced, unvoiced or silence | |
331 %this VDA is independent from the PDA process, and does not take advantage of the info derived from PDA | |
332 %thus, it requires more computation load. | |
333 if nargin<4 | |
334 %minzcr=2500; %unit: hertz | |
335 minzcr=3000; | |
336 end | |
337 if nargin<3 | |
338 noisefloor=0.01; | |
339 end | |
340 [nf, len]=size(x); | |
341 voice=ones(nf,1); | |
342 engergy=sum(x.^2,2); | |
343 index=find(engergy<=noisefloor*3); | |
344 voice(index)=0; | |
345 | |
346 %***************************************************************************************** | |
347 %% --------------------------------- determine the energy threshold for silence------------------------- | |
348 function thr=ethreshold(frames) | |
349 %%%%% use Rabiner and Sambur (1975) method | |
350 [nf,len]=size(frames); | |
351 lastpoint=1; | |
352 emax=0; | |
353 emin=0; | |
354 e=sum(frames.^2,2); | |
355 emax=max(e); | |
356 emin=min(e); | |
357 I1=0.03*(emax-emin)+emin; | |
358 I2=4*emin; | |
359 thr=25*min(I1,I2); | |
360 | |
361 %***************************************************************************************** | |
362 %% ------------------- split signal into frames --------------- | |
363 function frames=toframes(input,curpos,segmentlen,wintype) | |
364 len=length(input); | |
365 numFrames=length(curpos); | |
366 frames=zeros(numFrames,segmentlen); | |
367 start=curpos-round(segmentlen/2); | |
368 offset=(0:segmentlen-1); | |
369 index_start=find(start<1); % find out those frames beyond the first point | |
370 start(index_start)=1; % for those, just use the first frame | |
371 endpos=start+segmentlen-1; | |
372 index=find(endpos>len); | |
373 endpos(index)=len; % duplicate the last several frames if window is over the limit. | |
374 start(index)=len+1-segmentlen; | |
375 frames(:)=input(start(:,ones(1,segmentlen))+offset(ones(numFrames,1),:)); | |
376 [nf, len]=size(frames); | |
377 win=window(segmentlen,wintype); | |
378 frames = frames .* win(ones(nf,1),:); | |
379 %***************************************************************************************** | |
380 %-------------- post voicing checking --------------------------------------------- | |
381 function voicing=postvda(segment, curf0,Fs,r_threshold) | |
382 %%% check voicing again using estimated F0, which follows Hermes, SHS algorithm, JASA, 1988 | |
383 if nargin<4 | |
384 r_threshold=0.2; | |
385 end | |
386 estimated_period=1/curf0; | |
387 mid_point=round(length(segment)/2); | |
388 num_points=round(estimated_period*Fs); % number of points in each period | |
389 start_point=mid_point-num_points; | |
390 end_point=mid_point+num_points; | |
391 if (start_point <1) | |
392 start_point=1; | |
393 mid_point=start_point+num_points; | |
394 if (mid_point>length(segment)) % this is unreasonable, set f0 to zero | |
395 voicing=0; | |
396 return; | |
397 end | |
398 end | |
399 segment1=segment(start_point:mid_point); | |
400 if (end_point>length(segment)) | |
401 end_point=length(segment); | |
402 mid_point=end_point-num_points; | |
403 if (mid_point<1) % this is unreasonable, set f0 to zero | |
404 voicing=0; | |
405 return; | |
406 end | |
407 end | |
408 segment2=segment(mid_point:end_point); | |
409 len=min(length(segment1),length(segment2)); | |
410 r=corrcoef(segment1(1:len),segment2(1:len)); | |
411 r1=r(1,2); | |
412 if (r1<r_threshold) % correlation threshold | |
413 voicing=0; | |
414 else | |
415 voicing=1; | |
416 end | |
417 USE_ZCR=1; | |
418 if(USE_ZCR & voicing) | |
419 zcr1=zcr(segment1,estimated_period); | |
420 zcr2=zcr(segment2,estimated_period); | |
421 %minzcr=2500; | |
422 minzcr=3500; | |
423 if (zcr1<minzcr | zcr2<minzcr) | |
424 voicing=1; | |
425 else | |
426 voicing=0; | |
427 end | |
428 end | |
429 %%***************************************************************************************** | |
430 %--------------------- Compute zero-crossing rate ------------------------------------------- | |
431 function zcr=zcr(x,dur) | |
432 % function zcr=zcr(x,dur) : compute zero-crossing rate | |
433 % x: input data | |
434 % x: duration of the input data | |
435 [nf,len]=size(x); | |
436 zcr=sum(0.5*abs(sign(x(:,2:len))-sign(x(:,1:len-1))))/dur; | |
437 %%************************************************************************************* | |
438 %--------------------- Window function ------------------------------------------- | |
439 function w = window(N,wt,beta) | |
440 % | |
441 % w = window(N,wt) | |
442 % | |
443 % generate a window function | |
444 % | |
445 % N = length of desired window | |
446 % wt = window type desired | |
447 % 'rect' = rectangular 'tria' = triangular (Bartlett) | |
448 % 'hann' = Hanning 'hamm' = Hamming | |
449 % 'blac' = Blackman | |
450 % 'kais' = Kaiser | |
451 % | |
452 % w = row vector containing samples of the desired window | |
453 % beta : used in Kaiser window | |
454 | |
455 nn = N-1; | |
456 n=0:nn; | |
457 pn = 2*pi*(0:nn)/nn; | |
458 if wt(1,1:4) == 'rect', | |
459 w = ones(1,N); | |
460 elseif wt(1,1:4) == 'tria', | |
461 m = nn/2; | |
462 w = (0:m)/m; | |
463 w = [w w(ceil(m):-1:1)]; | |
464 elseif wt(1,1:4) == 'hann', | |
465 w = 0.5*(1 - cos(pn)); | |
466 elseif wt(1,1:4) == 'hamm', | |
467 w = .54 - .46*cos(pn); | |
468 elseif wt(1,1:4) == 'blac', | |
469 w = .42 -.5*cos(pn) + .08*cos(2*pn); | |
470 elseif wt(1,1:4) == 'kais', | |
471 if nargin<3 | |
472 error('you need provide beta!') | |
473 end | |
474 w =bessel1(beta*sqrt(1-((n-N/2)/(N/2)).^2))./bessel1(beta); | |
475 else | |
476 disp('Incorrect Window type requested') | |
477 end |