Dawn@4
|
1 function [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs,F0MinMax,frame_length,timestep,SHR_Threshold,ceiling,med_smooth,CHECK_VOICING)
|
Dawn@4
|
2 % SHRP - a pitch determination algorithm based on Subharmonic-to-Harmonic Ratio (SHR)
|
Dawn@4
|
3 % [f0_time,f0_value,SHR,f0_candidates]=shrp(Y,Fs[,F0MinMax,frame_length,TimeStep,SHR_Threshold,Ceiling,med_smooth,CHECK_VOICING])
|
Dawn@4
|
4 %
|
Dawn@4
|
5 % Input parameters (There are 9):
|
Dawn@4
|
6 %
|
Dawn@4
|
7 % Y: Input data
|
Dawn@4
|
8 % Fs: Sampling frequency (e.g., 16000 Hz)
|
Dawn@4
|
9 % F0MinMax: 2-d array specifies the F0 range. [minf0 maxf0], default: [50 550]
|
Dawn@4
|
10 % Quick solutions:
|
Dawn@4
|
11 % For male speech: [50 250]
|
Dawn@4
|
12 % For female speech: [120 400]
|
Dawn@4
|
13 % frame_length: length of each frame in millisecond (default: 40 ms)
|
Dawn@4
|
14 % TimeStep: Interval for updating short-term analysis in millisecond (default: 10 ms)
|
Dawn@4
|
15 % SHR_Threshold: Subharmonic-to-harmonic ratio threshold in the range of [0,1] (default: 0.4).
|
Dawn@4
|
16 % If the estimated SHR is greater than the threshold, the subharmonic is regarded as F0 candidate,
|
Dawn@4
|
17 % Otherwise, the harmonic is favored.
|
Dawn@4
|
18 % Ceiling: Upper bound of the frequencies that are used for estimating pitch. (default: 1250 Hz)
|
Dawn@4
|
19 % med_smooth: the order of the median smoothing (default: 0 - no smoothing);
|
Dawn@4
|
20 % CHECK_VOICING: check voicing. Current voicing determination algorithm is kind of crude.
|
Dawn@4
|
21 % 0: no voicing checking (default)
|
Dawn@4
|
22 % 1: voicing checking
|
Dawn@4
|
23 % Output parameters:
|
Dawn@4
|
24 %
|
Dawn@4
|
25 % f0_time: an array stores the times for the F0 points
|
Dawn@4
|
26 % f0_value: an array stores F0 values
|
Dawn@4
|
27 % SHR: an array stores subharmonic-to-harmonic ratio for each frame
|
Dawn@4
|
28 % f0_candidates: a matrix stores the f0 candidates for each frames, currently two f0 values generated for each frame.
|
Dawn@4
|
29 % Each row (a frame) contains two values in increasing order, i.e., [low_f0 higher_f0].
|
Dawn@4
|
30 % For SHR=0, the first f0 is 0. The purpose of this is that when you want to test different SHR
|
Dawn@4
|
31 % thresholds, you don't need to re-run the whole algorithm. You can choose to select the lower or higher
|
Dawn@4
|
32 % value based on the shr value of this frame.
|
Dawn@4
|
33 %
|
Dawn@4
|
34 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
Dawn@4
|
35 % Permission to use, copy, modify, and distribute this software without fee is hereby granted
|
Dawn@4
|
36 % FOR RESEARCH PURPOSES only, provided that this copyright notice appears in all copies
|
Dawn@4
|
37 % and in all supporting documentation.
|
Dawn@4
|
38 %
|
Dawn@4
|
39 % This program is distributed in the hope that it will be useful,
|
Dawn@4
|
40 % but WITHOUT ANY WARRANTY; without even the implied warranty of
|
Dawn@4
|
41 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
Dawn@4
|
42 %
|
Dawn@4
|
43 % For details of the algorithm, please see
|
Dawn@4
|
44 % Sun, X.,"Pitch determination and voice quality analysis using subharmonic-to-harmonic ratio" To appear in the Proc. of ICASSP2002, Orlando, Florida, May 13 -17, 2002.
|
Dawn@4
|
45 % For update information, please check http://mel.speech.nwu.edu/sunxj/pda.htm.
|
Dawn@4
|
46 %
|
Dawn@4
|
47 % Copyright (c) 2001 Xuejing Sun
|
Dawn@4
|
48 % Department of Communication Sciences and Disorders
|
Dawn@4
|
49 % Northwestern University, USA
|
Dawn@4
|
50 % sunxj@northwestern.edu
|
Dawn@4
|
51 %
|
Dawn@4
|
52 % Update history:
|
Dawn@4
|
53 % Added "f0_candidates" as a return value, Dec. 21, 2001
|
Dawn@4
|
54 % Changed default median smoothing order from 5 to 0, Jan. 9, 2002
|
Dawn@4
|
55 % Modified the GetLogSpectrum function, bug fixed due to Herbert Griebel. Jan. 15, 2002
|
Dawn@4
|
56 % Several minor changes. Jan. 15,2002.
|
Dawn@4
|
57 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
Dawn@4
|
58
|
Dawn@4
|
59 %t0 = clock;
|
Dawn@4
|
60 %------------------ Get input arguments values and set default values -------------------------
|
Dawn@4
|
61 if nargin<9
|
Dawn@4
|
62 CHECK_VOICING=0;
|
Dawn@4
|
63 end
|
Dawn@4
|
64 if nargin<8
|
Dawn@4
|
65 med_smooth=0;
|
Dawn@4
|
66 end
|
Dawn@4
|
67 if nargin<7
|
Dawn@4
|
68 ceiling=1250;
|
Dawn@4
|
69 end
|
Dawn@4
|
70 if nargin<6
|
Dawn@4
|
71 SHR_Threshold=0.4; % subharmonic to harmonic ratio threshold
|
Dawn@4
|
72 end
|
Dawn@4
|
73 if nargin<5
|
Dawn@4
|
74 timestep=10;
|
Dawn@4
|
75 %timestep=6.4;
|
Dawn@4
|
76 end
|
Dawn@4
|
77 if nargin<4
|
Dawn@4
|
78 frame_length=40; % default 40 ms
|
Dawn@4
|
79 end
|
Dawn@4
|
80 if nargin<3
|
Dawn@4
|
81 minf0=50;
|
Dawn@4
|
82 maxf0=500;
|
Dawn@4
|
83 else
|
Dawn@4
|
84 minf0=F0MinMax(1);
|
Dawn@4
|
85 maxf0=F0MinMax(2);
|
Dawn@4
|
86 end
|
Dawn@4
|
87 if nargin<2
|
Dawn@4
|
88 error('Sampling rate must be supplied!')
|
Dawn@4
|
89 end
|
Dawn@4
|
90 segmentduration=frame_length;
|
Dawn@4
|
91
|
Dawn@4
|
92 %------------------- pre-processing input signal -------------------------
|
Dawn@4
|
93 Y=Y-mean(Y); % remove DC component
|
Dawn@4
|
94 Y=Y/max(abs(Y)); %normalization
|
Dawn@4
|
95 total_len=length(Y);
|
Dawn@4
|
96 %------------------ specify some algorithm-specific thresholds -------------------------
|
Dawn@4
|
97 interpolation_depth=0.5; % for FFT length
|
Dawn@4
|
98 %--------------- derived thresholds specific to the algorithm -------------------------------
|
Dawn@4
|
99 maxlogf=log2(maxf0/2);
|
Dawn@4
|
100 minlogf=log2(minf0/2); % the search region to compute SHR is as low as 0.5 minf0.
|
Dawn@4
|
101 N=floor(ceiling/minf0); % maximum number harmonics
|
Dawn@4
|
102 m=mod(N,2);
|
Dawn@4
|
103 N=N-m;
|
Dawn@4
|
104 N=N*4; %In fact, in most cases we don't need to multiply N by 4 and get equally good results yet much faster.
|
Dawn@4
|
105 % derive how many frames we have based on segment length and timestep.
|
Dawn@4
|
106 segmentlen=round(segmentduration*(Fs/1000));
|
Dawn@4
|
107 inc=round(timestep*(Fs/1000));
|
Dawn@4
|
108 nf = fix((total_len-segmentlen+inc)/inc);
|
Dawn@4
|
109 n=(1:nf);
|
Dawn@4
|
110 f0_time=((n-1)*timestep+segmentduration/2)'; % anchor time for each frame, the middle point
|
Dawn@4
|
111 %f0_time=((n-1)*timestep)'; % anchor time for each frame, starting from zero
|
Dawn@4
|
112 %------------------ determine FFT length ---------------------
|
Dawn@4
|
113 fftlen=1;
|
Dawn@4
|
114 while (fftlen < segmentlen * (1 +interpolation_depth))
|
Dawn@4
|
115 fftlen =fftlen* 2;
|
Dawn@4
|
116 end
|
Dawn@4
|
117 %----------------- derive linear and log frequency scale ----------------
|
Dawn@4
|
118 frequency=Fs*(1:fftlen/2)/fftlen; % we ignore frequency 0 here since we need to do log transformation later and won't use it anyway.
|
Dawn@4
|
119 limit=find(frequency>=ceiling);
|
Dawn@4
|
120 limit=limit(1); % only the first is useful
|
Dawn@4
|
121 frequency=frequency(1:limit);
|
Dawn@4
|
122 logf=log2(frequency);
|
Dawn@4
|
123 %% clear some variables to save memory
|
Dawn@4
|
124 clear frequency;
|
Dawn@4
|
125 min_bin=logf(end)-logf(end-1); % the minimum distance between two points after interpolation
|
Dawn@4
|
126 shift=log2(N); % shift distance
|
Dawn@4
|
127 shift_units=round(shift/min_bin); %the number of unit on the log x-axis
|
Dawn@4
|
128 i=(2:N);
|
Dawn@4
|
129 % ------------- the followings are universal for all the frames ---------------%%
|
Dawn@4
|
130 startpos=shift_units+1-round(log2(i)/min_bin); % find out all the start position of each shift
|
Dawn@4
|
131 index=find(startpos<1); % find out those positions that are less than 1
|
Dawn@4
|
132 startpos(index)=1; % set them to 1 since the array index starts from 1 in matlab
|
Dawn@4
|
133 interp_logf=logf(1):min_bin:logf(end);
|
Dawn@4
|
134 interp_len=length(interp_logf);% new length of the amplitude spectrum after interpolation
|
Dawn@4
|
135 totallen=shift_units+interp_len;
|
Dawn@4
|
136 endpos=startpos+interp_len-1; %% note that : totallen=shift_units+interp_len;
|
Dawn@4
|
137 index=find(endpos>totallen);
|
Dawn@4
|
138 endpos(index)=totallen; % make sure all the end positions not greater than the totoal length of the shift spectrum
|
Dawn@4
|
139
|
Dawn@4
|
140 newfre=2.^(interp_logf); % the linear Hz scale derived from the interpolated log scale
|
Dawn@4
|
141 upperbound=find(interp_logf>=maxlogf); % find out the index of upper bound of search region on the log frequency scale.
|
Dawn@4
|
142 upperbound=upperbound(1);% only the first element is useful
|
Dawn@4
|
143 lowerbound=find(interp_logf>=minlogf); % find out the index of lower bound of search region on the log frequency scale.
|
Dawn@4
|
144 lowerbound=lowerbound(1);
|
Dawn@4
|
145
|
Dawn@4
|
146 %----------------- segmentation of speech ------------------------------
|
Dawn@4
|
147 curpos=round(f0_time/1000*Fs); % position for each frame in terms of index, not time
|
Dawn@4
|
148 frames=toframes(Y,curpos,segmentlen,'hamm');
|
Dawn@4
|
149 [nf framelen]=size(frames);
|
Dawn@4
|
150 clear Y;
|
Dawn@4
|
151 %----------------- initialize vectors for f0 time, f0 values, and SHR
|
Dawn@4
|
152 f0_value=zeros(nf,1);
|
Dawn@4
|
153 SHR=zeros(nf,1);
|
Dawn@4
|
154 f0_time=f0_time(1:nf);
|
Dawn@4
|
155 f0_candidates=zeros(nf,2);
|
Dawn@4
|
156 %----------------- voicing determination ----------------------------
|
Dawn@4
|
157 if (CHECK_VOICING)
|
Dawn@4
|
158 NoiseFloor=sum(frames(1,:).^2);
|
Dawn@4
|
159 voicing=vda(frames,segmentduration/1000,NoiseFloor);
|
Dawn@4
|
160 else
|
Dawn@4
|
161 voicing=ones(nf,1);
|
Dawn@4
|
162 end
|
Dawn@4
|
163 %------------------- the main loop -----------------------
|
Dawn@4
|
164 curf0=0;
|
Dawn@4
|
165 cur_SHR=0;
|
Dawn@4
|
166 cur_cand1=0;
|
Dawn@4
|
167 cur_cand2=0;
|
Dawn@4
|
168 for n=1:nf
|
Dawn@4
|
169 segment=frames(n,:);
|
Dawn@4
|
170 curtime=f0_time(n);
|
Dawn@4
|
171 if voicing(n)==0
|
Dawn@4
|
172 curf0=0;
|
Dawn@4
|
173 cur_SHR=0;
|
Dawn@4
|
174 else
|
Dawn@4
|
175 [log_spectrum]=GetLogSpectrum(segment,fftlen,limit,logf,interp_logf);
|
Dawn@4
|
176 [peak_index,cur_SHR,shshift,all_peak_indices]=ComputeSHR(log_spectrum,min_bin,startpos,endpos,lowerbound,upperbound,N,shift_units,SHR_Threshold);
|
Dawn@4
|
177 if (peak_index==-1) % -1 indicates a possibly unvoiced frame, if CHECK_VOICING, set f0 to 0, otherwise uses previous value
|
Dawn@4
|
178 if (CHECK_VOICING)
|
Dawn@4
|
179 curf0=0;
|
Dawn@4
|
180 cur_cand1=0;
|
Dawn@4
|
181 cur_cand2=0;
|
Dawn@4
|
182 end
|
Dawn@4
|
183
|
Dawn@4
|
184 else
|
Dawn@4
|
185 curf0=newfre(peak_index)*2;
|
Dawn@4
|
186 if (curf0>maxf0)
|
Dawn@4
|
187 curf0=curf0/2;
|
Dawn@4
|
188 end
|
Dawn@4
|
189 if (length(all_peak_indices)==1)
|
Dawn@4
|
190 cur_cand1=0;
|
Dawn@4
|
191 cur_cand2=newfre(all_peak_indices(1))*2;
|
Dawn@4
|
192 else
|
Dawn@4
|
193 cur_cand1=newfre(all_peak_indices(1))*2;
|
Dawn@4
|
194 cur_cand2=newfre(all_peak_indices(2))*2;
|
Dawn@4
|
195 end
|
Dawn@4
|
196 if (cur_cand1>maxf0)
|
Dawn@4
|
197 cur_cand1=cur_cand1/2;
|
Dawn@4
|
198 end
|
Dawn@4
|
199 if (cur_cand2>maxf0)
|
Dawn@4
|
200 cur_cand2=cur_cand2/2;
|
Dawn@4
|
201 end
|
Dawn@4
|
202 if (CHECK_VOICING)
|
Dawn@4
|
203 voicing(n)=postvda(segment,curf0,Fs);
|
Dawn@4
|
204 if (voicing(n)==0)
|
Dawn@4
|
205 curf0=0;
|
Dawn@4
|
206 end
|
Dawn@4
|
207 end
|
Dawn@4
|
208 end
|
Dawn@4
|
209 end
|
Dawn@4
|
210 f0_value(n)=curf0;
|
Dawn@4
|
211 SHR(n)=cur_SHR;
|
Dawn@4
|
212 f0_candidates(n,1)=cur_cand1;
|
Dawn@4
|
213 f0_candidates(n,2)=cur_cand2;
|
Dawn@4
|
214 DEBUG=0;
|
Dawn@4
|
215 if DEBUG
|
Dawn@4
|
216 figure(9)
|
Dawn@4
|
217 %subplot(5,1,1),plot(segment,'*')
|
Dawn@4
|
218 %title('windowed waveform segment')
|
Dawn@4
|
219 subplot(2,2,1),plot(interp_logf,log_spectrum,'k*')
|
Dawn@4
|
220 title('(a)')
|
Dawn@4
|
221 grid
|
Dawn@4
|
222 %('spectrum on log frequency scale')
|
Dawn@4
|
223 %grid
|
Dawn@4
|
224 shsodd=sum(shshift(1:2:N-1,:),1);
|
Dawn@4
|
225 shseven=sum(shshift(2:2:N,:),1);
|
Dawn@4
|
226 difference=shseven-shsodd;
|
Dawn@4
|
227 subplot(2,2,2),plot(interp_logf,shseven,'k*')
|
Dawn@4
|
228 title('(b)')
|
Dawn@4
|
229 %title('even')
|
Dawn@4
|
230 grid
|
Dawn@4
|
231 subplot(2,2,3),plot(interp_logf,shsodd,'k*')
|
Dawn@4
|
232 title('(c)')
|
Dawn@4
|
233 %title('odd')
|
Dawn@4
|
234 grid
|
Dawn@4
|
235 subplot(2,2,4), plot(interp_logf,difference,'k*')
|
Dawn@4
|
236 title('(d)')
|
Dawn@4
|
237 %title('difference (even-odd)')
|
Dawn@4
|
238 grid
|
Dawn@4
|
239 curtime
|
Dawn@4
|
240 curf0
|
Dawn@4
|
241 cur_SHR
|
Dawn@4
|
242 pause
|
Dawn@4
|
243 end
|
Dawn@4
|
244 end
|
Dawn@4
|
245 %-------------- post-processing -------------------------------
|
Dawn@4
|
246 if (med_smooth > 0)
|
Dawn@4
|
247 f0_value=medsmooth(f0_value,med_smooth);
|
Dawn@4
|
248 end
|
Dawn@4
|
249 %f0=linsmooth(f0,5); % this is really optional.
|
Dawn@4
|
250
|
Dawn@4
|
251 %*****************************************************************************************
|
Dawn@4
|
252 %-------------- do FFT and get log spectrum ---------------------------------
|
Dawn@4
|
253 %*****************************************************************************************
|
Dawn@4
|
254 function [interp_amplitude]=GetLogSpectrum(segment,fftlen,limit,logf,interp_logf)
|
Dawn@4
|
255 Spectra=fft(segment,fftlen);
|
Dawn@4
|
256 amplitude = abs(Spectra(1:fftlen/2+1)); % fftlen is always even here. Note: change fftlen/2 to fftlen/2+1. bug fixed due to Herbert Griebel
|
Dawn@4
|
257 amplitude=amplitude(2:limit+1); % ignore the zero frequency component
|
Dawn@4
|
258 %amplitude=log10(amplitude+1);
|
Dawn@4
|
259 interp_amplitude=interp1(logf,amplitude,interp_logf,'linear');
|
Dawn@4
|
260 interp_amplitude=interp_amplitude-min(interp_amplitude);
|
Dawn@4
|
261 %*****************************************************************************************
|
Dawn@4
|
262 %-------------- compute subharmonic-to-harmonic ratio ---------------------------------
|
Dawn@4
|
263 %*****************************************************************************************
|
Dawn@4
|
264 function [peak_index,SHR,shshift,index]=ComputeSHR(log_spectrum,min_bin,startpos,endpos,lowerbound,upperbound,N,shift_units,SHR_Threshold)
|
Dawn@4
|
265 % computeshr: compute subharmonic-to-harmonic ratio for a short-term signal
|
Dawn@4
|
266 len_spectrum=length(log_spectrum);
|
Dawn@4
|
267 totallen=shift_units+len_spectrum;
|
Dawn@4
|
268 shshift=zeros(N,totallen); %initialize the subharmonic shift matrix; each row corresponds to a shift version
|
Dawn@4
|
269 shshift(1,(totallen-len_spectrum+1):totallen)=log_spectrum; % place the spectrum at the right end of the first row
|
Dawn@4
|
270 % note that here startpos and endpos has N-1 rows, so we start from 2
|
Dawn@4
|
271 % the first row in shshift is the original log spectrum
|
Dawn@4
|
272 for i=2:N
|
Dawn@4
|
273 shshift(i,startpos(i-1):endpos(i-1))=log_spectrum(1:endpos(i-1)-startpos(i-1)+1); % store each shifted sequence
|
Dawn@4
|
274 end
|
Dawn@4
|
275 shshift=shshift(:,shift_units+1:totallen); % we don't need the stuff smaller than shift_units
|
Dawn@4
|
276 shsodd=sum(shshift(1:2:N-1,:),1);
|
Dawn@4
|
277 shseven=sum(shshift(2:2:N,:),1);
|
Dawn@4
|
278 difference=shseven-shsodd;
|
Dawn@4
|
279 % peak picking process
|
Dawn@4
|
280 SHR=0;
|
Dawn@4
|
281 [mag,index]=twomax(difference,lowerbound,upperbound,min_bin); % only find two maxima
|
Dawn@4
|
282 % first mag is always the maximum, the second, if there is, is the second max
|
Dawn@4
|
283 NumPitchCandidates=length(mag);
|
Dawn@4
|
284 if (NumPitchCandidates == 1) % this is possible, mainly due to we put a constraint on search region, i.e., f0 range
|
Dawn@4
|
285 if (mag <=0) % this must be an unvoiced frame
|
Dawn@4
|
286 peak_index=-1;
|
Dawn@4
|
287 return
|
Dawn@4
|
288 end
|
Dawn@4
|
289 peak_index=index;
|
Dawn@4
|
290 SHR=0;
|
Dawn@4
|
291 else
|
Dawn@4
|
292 SHR=(mag(1)-mag(2))/(mag(1)+mag(2));
|
Dawn@4
|
293 if (SHR<=SHR_Threshold)
|
Dawn@4
|
294 peak_index=index(2); % subharmonic is weak, so favor the harmonic
|
Dawn@4
|
295 else
|
Dawn@4
|
296 peak_index=index(1); % subharmonic is strong, so favor the subharmonic as F0
|
Dawn@4
|
297 end
|
Dawn@4
|
298 end
|
Dawn@4
|
299 %%*****************************************************************************************
|
Dawn@4
|
300 %****************** this function only finds two maximum peaks ************************
|
Dawn@4
|
301 function [mag,index]=twomax(x,lowerbound,upperbound,unitlen)
|
Dawn@4
|
302 %In descending order, the magnitude and index are returned in [mag,index], respectively
|
Dawn@4
|
303 lenx=length(x);
|
Dawn@4
|
304 halfoct=round(1/unitlen/2); % compute the number of units of half octave. log2(2)=1; 1/unitlen
|
Dawn@4
|
305 [mag,index]=max(x(lowerbound:upperbound));%find the maximum value
|
Dawn@4
|
306 if (mag<=0)
|
Dawn@4
|
307 % error('max is smaller than zero!') % return it!
|
Dawn@4
|
308 return
|
Dawn@4
|
309 end
|
Dawn@4
|
310 index=index+lowerbound-1;
|
Dawn@4
|
311 harmonics=2;
|
Dawn@4
|
312 LIMIT=0.0625; % 1/8 octave
|
Dawn@4
|
313 startpos=index+round(log2(harmonics-LIMIT)/unitlen);
|
Dawn@4
|
314 if (startpos<=min(lenx,upperbound))
|
Dawn@4
|
315 endpos=index+round(log2(harmonics+LIMIT)/unitlen); % for example, 100hz-200hz is one octave, 200hz-250hz is 1/4octave
|
Dawn@4
|
316 if (endpos> min(lenx,upperbound))
|
Dawn@4
|
317 endpos=min(lenx,upperbound);
|
Dawn@4
|
318 end
|
Dawn@4
|
319 [mag1,index1]=max(x(startpos:endpos));%find the maximum value at right side of last maximum
|
Dawn@4
|
320 if (mag1>0)
|
Dawn@4
|
321 index1=index1+startpos-1;
|
Dawn@4
|
322 mag=[mag;mag1];
|
Dawn@4
|
323 index=[index;index1];
|
Dawn@4
|
324 end
|
Dawn@4
|
325 end
|
Dawn@4
|
326 %*****************************************************************************************
|
Dawn@4
|
327 %%----------------------------------------------------------------------------------------
|
Dawn@4
|
328 %%-----------------------------------voicing determination -------------------------------
|
Dawn@4
|
329 function voice=vda(x,segmentdur,noisefloor,minzcr)
|
Dawn@4
|
330 %voice=vda(x) determine whether the segment is voiced, unvoiced or silence
|
Dawn@4
|
331 %this VDA is independent from the PDA process, and does not take advantage of the info derived from PDA
|
Dawn@4
|
332 %thus, it requires more computation load.
|
Dawn@4
|
333 if nargin<4
|
Dawn@4
|
334 %minzcr=2500; %unit: hertz
|
Dawn@4
|
335 minzcr=3000;
|
Dawn@4
|
336 end
|
Dawn@4
|
337 if nargin<3
|
Dawn@4
|
338 noisefloor=0.01;
|
Dawn@4
|
339 end
|
Dawn@4
|
340 [nf, len]=size(x);
|
Dawn@4
|
341 voice=ones(nf,1);
|
Dawn@4
|
342 engergy=sum(x.^2,2);
|
Dawn@4
|
343 index=find(engergy<=noisefloor*3);
|
Dawn@4
|
344 voice(index)=0;
|
Dawn@4
|
345
|
Dawn@4
|
346 %*****************************************************************************************
|
Dawn@4
|
347 %% --------------------------------- determine the energy threshold for silence-------------------------
|
Dawn@4
|
348 function thr=ethreshold(frames)
|
Dawn@4
|
349 %%%%% use Rabiner and Sambur (1975) method
|
Dawn@4
|
350 [nf,len]=size(frames);
|
Dawn@4
|
351 lastpoint=1;
|
Dawn@4
|
352 emax=0;
|
Dawn@4
|
353 emin=0;
|
Dawn@4
|
354 e=sum(frames.^2,2);
|
Dawn@4
|
355 emax=max(e);
|
Dawn@4
|
356 emin=min(e);
|
Dawn@4
|
357 I1=0.03*(emax-emin)+emin;
|
Dawn@4
|
358 I2=4*emin;
|
Dawn@4
|
359 thr=25*min(I1,I2);
|
Dawn@4
|
360
|
Dawn@4
|
361 %*****************************************************************************************
|
Dawn@4
|
362 %% ------------------- split signal into frames ---------------
|
Dawn@4
|
363 function frames=toframes(input,curpos,segmentlen,wintype)
|
Dawn@4
|
364 len=length(input);
|
Dawn@4
|
365 numFrames=length(curpos);
|
Dawn@4
|
366 frames=zeros(numFrames,segmentlen);
|
Dawn@4
|
367 start=curpos-round(segmentlen/2);
|
Dawn@4
|
368 offset=(0:segmentlen-1);
|
Dawn@4
|
369 index_start=find(start<1); % find out those frames beyond the first point
|
Dawn@4
|
370 start(index_start)=1; % for those, just use the first frame
|
Dawn@4
|
371 endpos=start+segmentlen-1;
|
Dawn@4
|
372 index=find(endpos>len);
|
Dawn@4
|
373 endpos(index)=len; % duplicate the last several frames if window is over the limit.
|
Dawn@4
|
374 start(index)=len+1-segmentlen;
|
Dawn@4
|
375 frames(:)=input(start(:,ones(1,segmentlen))+offset(ones(numFrames,1),:));
|
Dawn@4
|
376 [nf, len]=size(frames);
|
Dawn@4
|
377 win=window(segmentlen,wintype);
|
Dawn@4
|
378 frames = frames .* win(ones(nf,1),:);
|
Dawn@4
|
379 %*****************************************************************************************
|
Dawn@4
|
380 %-------------- post voicing checking ---------------------------------------------
|
Dawn@4
|
381 function voicing=postvda(segment, curf0,Fs,r_threshold)
|
Dawn@4
|
382 %%% check voicing again using estimated F0, which follows Hermes, SHS algorithm, JASA, 1988
|
Dawn@4
|
383 if nargin<4
|
Dawn@4
|
384 r_threshold=0.2;
|
Dawn@4
|
385 end
|
Dawn@4
|
386 estimated_period=1/curf0;
|
Dawn@4
|
387 mid_point=round(length(segment)/2);
|
Dawn@4
|
388 num_points=round(estimated_period*Fs); % number of points in each period
|
Dawn@4
|
389 start_point=mid_point-num_points;
|
Dawn@4
|
390 end_point=mid_point+num_points;
|
Dawn@4
|
391 if (start_point <1)
|
Dawn@4
|
392 start_point=1;
|
Dawn@4
|
393 mid_point=start_point+num_points;
|
Dawn@4
|
394 if (mid_point>length(segment)) % this is unreasonable, set f0 to zero
|
Dawn@4
|
395 voicing=0;
|
Dawn@4
|
396 return;
|
Dawn@4
|
397 end
|
Dawn@4
|
398 end
|
Dawn@4
|
399 segment1=segment(start_point:mid_point);
|
Dawn@4
|
400 if (end_point>length(segment))
|
Dawn@4
|
401 end_point=length(segment);
|
Dawn@4
|
402 mid_point=end_point-num_points;
|
Dawn@4
|
403 if (mid_point<1) % this is unreasonable, set f0 to zero
|
Dawn@4
|
404 voicing=0;
|
Dawn@4
|
405 return;
|
Dawn@4
|
406 end
|
Dawn@4
|
407 end
|
Dawn@4
|
408 segment2=segment(mid_point:end_point);
|
Dawn@4
|
409 len=min(length(segment1),length(segment2));
|
Dawn@4
|
410 r=corrcoef(segment1(1:len),segment2(1:len));
|
Dawn@4
|
411 r1=r(1,2);
|
Dawn@4
|
412 if (r1<r_threshold) % correlation threshold
|
Dawn@4
|
413 voicing=0;
|
Dawn@4
|
414 else
|
Dawn@4
|
415 voicing=1;
|
Dawn@4
|
416 end
|
Dawn@4
|
417 USE_ZCR=1;
|
Dawn@4
|
418 if(USE_ZCR & voicing)
|
Dawn@4
|
419 zcr1=zcr(segment1,estimated_period);
|
Dawn@4
|
420 zcr2=zcr(segment2,estimated_period);
|
Dawn@4
|
421 %minzcr=2500;
|
Dawn@4
|
422 minzcr=3500;
|
Dawn@4
|
423 if (zcr1<minzcr | zcr2<minzcr)
|
Dawn@4
|
424 voicing=1;
|
Dawn@4
|
425 else
|
Dawn@4
|
426 voicing=0;
|
Dawn@4
|
427 end
|
Dawn@4
|
428 end
|
Dawn@4
|
429 %%*****************************************************************************************
|
Dawn@4
|
430 %--------------------- Compute zero-crossing rate -------------------------------------------
|
Dawn@4
|
431 function zcr=zcr(x,dur)
|
Dawn@4
|
432 % function zcr=zcr(x,dur) : compute zero-crossing rate
|
Dawn@4
|
433 % x: input data
|
Dawn@4
|
434 % x: duration of the input data
|
Dawn@4
|
435 [nf,len]=size(x);
|
Dawn@4
|
436 zcr=sum(0.5*abs(sign(x(:,2:len))-sign(x(:,1:len-1))))/dur;
|
Dawn@4
|
437 %%*************************************************************************************
|
Dawn@4
|
438 %--------------------- Window function -------------------------------------------
|
Dawn@4
|
439 function w = window(N,wt,beta)
|
Dawn@4
|
440 %
|
Dawn@4
|
441 % w = window(N,wt)
|
Dawn@4
|
442 %
|
Dawn@4
|
443 % generate a window function
|
Dawn@4
|
444 %
|
Dawn@4
|
445 % N = length of desired window
|
Dawn@4
|
446 % wt = window type desired
|
Dawn@4
|
447 % 'rect' = rectangular 'tria' = triangular (Bartlett)
|
Dawn@4
|
448 % 'hann' = Hanning 'hamm' = Hamming
|
Dawn@4
|
449 % 'blac' = Blackman
|
Dawn@4
|
450 % 'kais' = Kaiser
|
Dawn@4
|
451 %
|
Dawn@4
|
452 % w = row vector containing samples of the desired window
|
Dawn@4
|
453 % beta : used in Kaiser window
|
Dawn@4
|
454
|
Dawn@4
|
455 nn = N-1;
|
Dawn@4
|
456 n=0:nn;
|
Dawn@4
|
457 pn = 2*pi*(0:nn)/nn;
|
Dawn@4
|
458 if wt(1,1:4) == 'rect',
|
Dawn@4
|
459 w = ones(1,N);
|
Dawn@4
|
460 elseif wt(1,1:4) == 'tria',
|
Dawn@4
|
461 m = nn/2;
|
Dawn@4
|
462 w = (0:m)/m;
|
Dawn@4
|
463 w = [w w(ceil(m):-1:1)];
|
Dawn@4
|
464 elseif wt(1,1:4) == 'hann',
|
Dawn@4
|
465 w = 0.5*(1 - cos(pn));
|
Dawn@4
|
466 elseif wt(1,1:4) == 'hamm',
|
Dawn@4
|
467 w = .54 - .46*cos(pn);
|
Dawn@4
|
468 elseif wt(1,1:4) == 'blac',
|
Dawn@4
|
469 w = .42 -.5*cos(pn) + .08*cos(2*pn);
|
Dawn@4
|
470 elseif wt(1,1:4) == 'kais',
|
Dawn@4
|
471 if nargin<3
|
Dawn@4
|
472 error('you need provide beta!')
|
Dawn@4
|
473 end
|
Dawn@4
|
474 w =bessel1(beta*sqrt(1-((n-N/2)/(N/2)).^2))./bessel1(beta);
|
Dawn@4
|
475 else
|
Dawn@4
|
476 disp('Incorrect Window type requested')
|
Dawn@4
|
477 end |