Chris@5
|
1 function [w,h,z,u,xa] = cplcaMT( x, K, T, R, w, h, z, u, iter, sw, sh, sz, su, lw, lh, lz, lu, pa)
|
Chris@5
|
2 % function [w,h,xa2] = cplcaMT( x, K, T, R, w, h, z, u, iter, sw, sh, sz, su, lw, lh, lz, lu)
|
Chris@5
|
3 %
|
Chris@5
|
4 % Perform multiple-source, multiple-template SIPLCA for transcription
|
Chris@5
|
5 %
|
Chris@5
|
6 % Inputs:
|
Chris@5
|
7 % x input distribution
|
Chris@5
|
8 % K number of components
|
Chris@5
|
9 % T size of components
|
Chris@5
|
10 % R size of sources
|
Chris@5
|
11 % w initial value of p(w) [default = random]
|
Chris@5
|
12 % h initial value of p(h) [default = random]
|
Chris@5
|
13 % z initial value of p(z) [default = random]
|
Chris@5
|
14 % iter number of EM iterations [default = 10]
|
Chris@5
|
15 % sw sparsity parameter for w [default = 1]
|
Chris@5
|
16 % sh sparsity parameter for h [default = 1]
|
Chris@5
|
17 % sz sparsity parameter for z [default = 1]
|
Chris@5
|
18 % lw flag to update w [default = 1]
|
Chris@5
|
19 % lh flag to update h [default = 1]
|
Chris@5
|
20 % lh flag to update h [default = 1]
|
Chris@5
|
21 % pa source-component activity range [Rx2]
|
Chris@5
|
22 %
|
Chris@5
|
23 % Outputs:
|
Chris@5
|
24 % w p(w) - spectral bases
|
Chris@5
|
25 % h p(h) - pitch impulse
|
Chris@5
|
26 % z p(z) - mixing matrix for p(h)
|
Chris@5
|
27 % xa approximation of input
|
Chris@5
|
28
|
Chris@5
|
29 % Emmanouil Benetos 2011, based on cplca code by Paris Smaragdis
|
Chris@5
|
30
|
Chris@5
|
31
|
Chris@5
|
32 %% for the transcription application,
|
Chris@5
|
33 %% x -> noise-reduced constant Q. In the application this is a 2-sec,
|
Chris@5
|
34 %% 100-col segment with 2 zeros at top and bottom, so 549x100
|
Chris@5
|
35 %% K -> 88, number of notes
|
Chris@5
|
36 %% T -> [545 1], a two-element array: 545 is the length of each
|
Chris@5
|
37 %% template, but why 1?
|
Chris@5
|
38 %% R -> 10, number of instruments
|
Chris@5
|
39 %% w -> a 10x88 cell array, in which w{instrument,note} is a 545x1
|
Chris@5
|
40 %% array containing the template for the given instrument and note
|
Chris@5
|
41 %% number
|
Chris@5
|
42 %% h -> empty
|
Chris@5
|
43 %% z -> empty
|
Chris@5
|
44 %% u -> empty
|
Chris@5
|
45 %% iter -> a parameter for the program, 12 in the mirex submission
|
Chris@5
|
46 %% sw -> 1
|
Chris@5
|
47 %% sh -> 1
|
Chris@5
|
48 %% sz -> 1.1
|
Chris@5
|
49 %% su -> 1.3, not documented above, presumably sparsity for u
|
Chris@5
|
50 %% lw -> 0, don't update w
|
Chris@5
|
51 %% lh -> 1, do update h
|
Chris@5
|
52 %% lz -> 1, do update z
|
Chris@5
|
53 %% lu -> 1, not documented above, presumably do update u
|
Chris@5
|
54 %% pa -> a 10x2 array in which pa(instrument,1) is the lowest note
|
Chris@5
|
55 %% expected for that instrument and pa(instrument,2) is the highest
|
Chris@5
|
56
|
Chris@5
|
57
|
Chris@5
|
58 % Sort out the sizes
|
Chris@5
|
59
|
Chris@5
|
60 wc = 2*size(x)-T; %% works out to 553x199
|
Chris@5
|
61 hc = size(x)+T-1; %% works out to 1093x100
|
Chris@5
|
62
|
Chris@5
|
63 % Default training iterations
|
Chris@5
|
64 if ~exist( 'iter')
|
Chris@5
|
65 iter = 10;
|
Chris@5
|
66 end
|
Chris@5
|
67
|
Chris@5
|
68
|
Chris@5
|
69 % Initialize
|
Chris@5
|
70 sumx = sum(x); %% for later normalisation
|
Chris@5
|
71
|
Chris@5
|
72 if ~exist( 'w') || isempty( w)
|
Chris@5
|
73 %% doesn't happen, w was provided (it's the template data)
|
Chris@5
|
74 w = cell(R, K);
|
Chris@5
|
75 for k = 1:K
|
Chris@5
|
76 for r=1:R
|
Chris@5
|
77 w{r,k} = rand( T);
|
Chris@5
|
78 w{r,k} = w{r,k} / sum( w{r,k}(:));
|
Chris@5
|
79 end
|
Chris@5
|
80 end
|
Chris@5
|
81 end
|
Chris@5
|
82 if ~exist( 'h') || isempty( h)
|
Chris@5
|
83 %% does happen, h was not provided
|
Chris@5
|
84 h = cell(1, K);
|
Chris@5
|
85 for k = 1:K
|
Chris@5
|
86 h{k} = rand( size(x)-T+1);
|
Chris@5
|
87 h{k} = h{k};
|
Chris@5
|
88 end
|
Chris@5
|
89 %% h is now a 1x88 cell, h{note} is a 5x100 array of random values.
|
Chris@5
|
90 %% The 5 comes from the height of the CQ array minus the length of
|
Chris@5
|
91 %% a template, plus 1. I guess this is space to allow for the
|
Chris@5
|
92 %% 5-bins-per-semitone pitch shift.
|
Chris@5
|
93 end
|
Chris@5
|
94 if ~exist( 'z') || isempty( z)
|
Chris@5
|
95 %% does happen, z was not provided
|
Chris@5
|
96 z = cell(1, K);
|
Chris@5
|
97 for k = 1:K
|
Chris@5
|
98 z{k} = rand( size(x,2),1);
|
Chris@5
|
99 z{k} = z{k};
|
Chris@5
|
100 end
|
Chris@5
|
101 %% z is a 1x88 cell, z{note} is a 100x1 array of random values.
|
Chris@5
|
102 end
|
Chris@5
|
103 if ~exist( 'u') || isempty( u)
|
Chris@5
|
104 %% does happen, u was not provided
|
Chris@5
|
105 u = cell(R, K);
|
Chris@5
|
106 for k = 1:K
|
Chris@5
|
107 for r=1:R
|
Chris@5
|
108 if( (pa(r,1) <= k && k <= pa(r,2)) )
|
Chris@5
|
109 u{r,k} = ones( size(x,2),1);
|
Chris@5
|
110 else
|
Chris@5
|
111 u{r,k} = zeros( size(x,2),1);
|
Chris@5
|
112 end
|
Chris@5
|
113 end;
|
Chris@5
|
114 end
|
Chris@5
|
115 %% u is a 10x88 cell, u{instrument,note} is a 100x1 double containing
|
Chris@5
|
116 %% all 1s if note is in-range for instrument and all 0s otherwise
|
Chris@5
|
117 end
|
Chris@5
|
118
|
Chris@6
|
119 fh = cell(1, K); %% 1x88
|
Chris@6
|
120 fw = cell(R, K); %% 10x88
|
Chris@5
|
121 for k = 1:K
|
Chris@5
|
122 fh{k} = ones(wc) + 1i*ones(wc);
|
Chris@5
|
123 for r=1:R
|
Chris@5
|
124 fw{r,k} = ones(wc) + 1i*ones(wc);
|
Chris@5
|
125 end;
|
Chris@5
|
126 end;
|
Chris@5
|
127
|
Chris@6
|
128 %% now fh is a 1x88 cell, and fh{note} is a 553x199 array initialised
|
Chris@6
|
129 %% with all complex values 1 + 1i
|
Chris@6
|
130
|
Chris@6
|
131 %% fw is a 10x88 cell, and fw{instrument,note} is a 553x199 array
|
Chris@6
|
132 %% likewise
|
Chris@5
|
133
|
Chris@5
|
134
|
Chris@8
|
135 %% The MIREX paper describes the model as
|
Chris@8
|
136 %%
|
Chris@8
|
137 %% P(w,t) = P(t) sum[p,f,s]( P(w-f|s,p) P(f|p,t) P(s|p,t) P(p|t) )
|
Chris@8
|
138 %%
|
Chris@8
|
139 %% where
|
Chris@8
|
140 %% w = log-frequency bin index
|
Chris@8
|
141 %% t = time
|
Chris@8
|
142 %% s = instrument number
|
Chris@8
|
143 %% p = MIDI pitch number
|
Chris@8
|
144 %% f = frequency offset (pitch-adjustment convolution)
|
Chris@8
|
145 %% so
|
Chris@8
|
146 %% P(w,t) = the input distribution (constant q spectrum)
|
Chris@8
|
147 %% P(t) = overall energy by time
|
Chris@8
|
148 %% P(w-f|s,p) = spectral template for instrument s, pitch p with offset f
|
Chris@8
|
149 %% P(f|p,t) = frequency offset for pitch p, time t?
|
Chris@8
|
150 %% P(s|p,t) = instrument contribution for pitch p, time t
|
Chris@8
|
151 %% P(p|t) = pitch probability for time t
|
Chris@8
|
152 %% the outputs we want to produce are P(p|t) (the transcription matrix)
|
Chris@8
|
153 %% and P(s|p,t) (the instrument classification).
|
Chris@8
|
154 %%
|
Chris@8
|
155 %% In this program,
|
Chris@8
|
156 %% x -> P(w,t), the input distribution
|
Chris@8
|
157 %% w -> P(w|s,p), the templates
|
Chris@8
|
158 %% h -> P(f|p,t), the pitch shift component
|
Chris@8
|
159 %% z -> P(p|t), the pitch probabilities, the main return value
|
Chris@8
|
160 %% u -> P(s|p,t), the source contribution, the secondary return value
|
Chris@8
|
161 %%
|
Chris@8
|
162 %% The paper gives the update rule for the expectation step as
|
Chris@8
|
163 %%
|
Chris@8
|
164 %% P(p,f,s|w,t) = P(w-f|s,p) P(f|p,t) P(s|p,t) P(p|t)
|
Chris@8
|
165 %% --------------------------------------------------
|
Chris@8
|
166 %% sum[p,f,s] ( P(w-f|s,p) P(f|p,t) P(s|p,t) P(p|t) )
|
Chris@8
|
167 %%
|
Chris@8
|
168 %% and the update equations for the maximization step as
|
Chris@8
|
169 %%
|
Chris@8
|
170 %% P(f|p,t) = sum[w,s] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
171 %% or h ----------------------------------
|
Chris@8
|
172 %% sum[f,w,s] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
173 %%
|
Chris@8
|
174 %% P(s|p,t) = sum[w,f] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
175 %% or u ----------------------------------
|
Chris@8
|
176 %% sum[s,w,f] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
177 %%
|
Chris@8
|
178 %% P(p|t) = sum[w,f,s] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
179 %% or z ------------------------------------
|
Chris@8
|
180 %% sum[p,w,f,s] ( P(p,f,s|w,t) P(w,t) )
|
Chris@8
|
181 %%
|
Chris@8
|
182 %% (there is also an update equation for x, or P(w|s,p) but we
|
Chris@15
|
183 %% don't want that as it's the input -- one paper proposes an 89th
|
Chris@15
|
184 %% template to learn the noise component but... not yet)
|
Chris@8
|
185
|
Chris@8
|
186
|
Chris@8
|
187
|
Chris@5
|
188 % Make commands for subsequent multidim operations and initialize fw
|
Chris@6
|
189
|
Chris@5
|
190 fnh = 'c(hc(1)-(T(1)+(1:size(h{k},1))-2),hc(2)-(T(2)+(1:size(h{k},2))-2))';
|
Chris@5
|
191 xai = 'xa(1:size(x,1),1:size(x,2))';
|
Chris@5
|
192 flz = 'xbar(end:-1:1,end:-1:1)';
|
Chris@5
|
193
|
Chris@5
|
194 for k = 1:K
|
Chris@5
|
195 for r=1:R
|
Chris@5
|
196 if( (pa(r,1) <= k && k <= pa(r,2)) )
|
Chris@6
|
197
|
Chris@6
|
198 %% fftn(X,siz) takes an N-dimensional FFT (same number of
|
Chris@6
|
199 %% dimensions as siz) of X, padding or truncating X
|
Chris@6
|
200 %% beforehand so that it is of size siz. Here w{r,k} is the
|
Chris@6
|
201 %% 545x1 template for instrument r and note k, and wc is
|
Chris@6
|
202 %% 553x199.
|
Chris@6
|
203
|
Chris@6
|
204 %% I believe this is equivalent to performing a 553-point
|
Chris@6
|
205 %% FFT of each column of the input (with w{r,k} in the first
|
Chris@15
|
206 %% 545 elements of the first column of that input).
|
Chris@6
|
207
|
Chris@6
|
208 %% The output is of course complex.
|
Chris@6
|
209
|
Chris@8
|
210 %% The purpose of this is to support convolution for pitch
|
Chris@8
|
211 %% shifting. w{r,k} are the templates, and fw{r,k} are ffts
|
Chris@8
|
212 %% of the templates which will be multiplied by fh, the
|
Chris@8
|
213 %% equivalent ffts of the pitch shift contributions, later
|
Chris@8
|
214
|
Chris@5
|
215 fw{r,k} = fftn( w{r,k}, wc);
|
Chris@5
|
216 end;
|
Chris@5
|
217 end;
|
Chris@5
|
218 end;
|
Chris@5
|
219
|
Chris@5
|
220 % Iterate
|
Chris@5
|
221 for it = 1:iter
|
Chris@5
|
222
|
Chris@5
|
223 %disp(['Iteration: ' num2str(it)]);
|
Chris@5
|
224
|
Chris@5
|
225 % E-step
|
Chris@13
|
226 xa = eps; %% tiny non-zero initialiser as we'll be dividing by this later
|
Chris@6
|
227 for k = 16:73 %% overall note range found in instrument set
|
Chris@8
|
228 fh{k} = fftn( h{k}, wc); %% this and the subsequent ifftn are for the pitch-shift convolution step I think
|
Chris@12
|
229 for r=1:R %% instruments
|
Chris@5
|
230 if( (pa(r,1) <= k && k <= pa(r,2)) )
|
Chris@5
|
231 xa1 = abs( real( ifftn( fw{r,k} .* fh{k})));
|
Chris@5
|
232 xa = xa + xa1(1:size(x,1),1:size(x,2)) .*repmat(z{k},1,size(x,1))'.*repmat(u{r,k},1,size(x,1))';
|
Chris@8
|
233
|
Chris@8
|
234 %% so xa is the accumulation of the element-by-element
|
Chris@8
|
235 %% product of: the pitch-shifted templates (xa1); the
|
Chris@8
|
236 %% pitch probabilities (z); and the source
|
Chris@8
|
237 %% contributions (u); across all instruments and notes
|
Chris@8
|
238
|
Chris@8
|
239 %% note that xa1 is resized to match x, the input,
|
Chris@8
|
240 %% which is possible because fw and fh were
|
Chris@8
|
241 %% constructed at (just over) the same width and
|
Chris@8
|
242 %% (almost) twice the height (553x199 if x is
|
Chris@8
|
243 %% 549x100).
|
Chris@8
|
244
|
Chris@8
|
245 %% the other components are 100x1, i.e. one value per
|
Chris@8
|
246 %% time step, so they are tiled up to 100x549 and then
|
Chris@8
|
247 %% transposed for the multiplication
|
Chris@8
|
248
|
Chris@5
|
249 clear xa1;
|
Chris@5
|
250 end
|
Chris@5
|
251 end
|
Chris@5
|
252 end
|
Chris@5
|
253
|
Chris@5
|
254 xbar = x ./ xa;
|
Chris@5
|
255 xbar = eval( flz);
|
Chris@15
|
256
|
Chris@29
|
257 %% xbar now contains the summation of Eqn 8 in the CMJ paper, i.e.
|
Chris@29
|
258 %% it is a time-frequency distribution consisting of the
|
Chris@29
|
259 %% five-dimensional distribution Pt(p,f,s|w) summed across p, f,
|
Chris@29
|
260 %% and s. In the M step, for each parameter we want to update, we
|
Chris@29
|
261 %% take the same five-dimensional distribution and marginalise it
|
Chris@29
|
262 %% across the other parameters that that one does not depend on.
|
Chris@29
|
263 %% The M-step is a bit tricky for me to follow here, with the
|
Chris@29
|
264 %% convolution and use of MATLAB eval macros.
|
Chris@15
|
265
|
Chris@5
|
266 fx = fftn( xbar, wc);
|
Chris@5
|
267
|
Chris@5
|
268
|
Chris@5
|
269 % M-step
|
Chris@5
|
270 for k = 16:73
|
Chris@5
|
271
|
Chris@5
|
272
|
Chris@8
|
273 %% Throughout here, r is an instrument number (1..R) and k is a
|
Chris@8
|
274 %% note number (1..K)
|
Chris@8
|
275
|
Chris@5
|
276 % Update h, z, u
|
Chris@5
|
277 nh=eps;
|
Chris@5
|
278 for r=1:R
|
Chris@5
|
279 if( (pa(r,1) <= k && k <= pa(r,2)) )
|
Chris@14
|
280
|
Chris@14
|
281 %% so we're accumulating to nh (which is per-note but
|
Chris@14
|
282 %% across all instruments) here
|
Chris@14
|
283
|
Chris@14
|
284 %% this is a convolution of the error (xbar) with w,
|
Chris@14
|
285 %% carried out as a frequency-domain multiplication.
|
Chris@14
|
286 %% so it's like xbar-for-all-w
|
Chris@5
|
287 c = abs( real( ifftn( fx .* fw{r,k} )));
|
Chris@14
|
288
|
Chris@14
|
289 nh1 = eval( fnh); %% this one is highly mysterious
|
Chris@14
|
290
|
Chris@14
|
291 %% take the 100x1 note range matrix, repeat to 100x5
|
Chris@14
|
292 %% (as h{k} is 5x100), transpose, multiply nh1 by that
|
Chris@14
|
293 nh1 = nh1 .* repmat(u{r,k},1,size(h{k},1))';
|
Chris@14
|
294 nh = nh + nh1; %% so nh will presumably be 100x5 too
|
Chris@5
|
295
|
Chris@15
|
296 nhu = eval( fnh); %% more magic
|
Chris@14
|
297
|
Chris@5
|
298 nhu = nhu .* h{k};
|
Chris@14
|
299
|
Chris@5
|
300 nu = sum(nhu)';
|
Chris@5
|
301 nu = u{r,k} .* nu + eps;
|
Chris@5
|
302 if lu
|
Chris@5
|
303 u{r,k} = nu;
|
Chris@5
|
304 end;
|
Chris@5
|
305
|
Chris@5
|
306 end;
|
Chris@5
|
307 end
|
Chris@5
|
308 nh = h{k} .* (nh.^sh);
|
Chris@5
|
309 nz = sum(nh)';
|
Chris@5
|
310 nz = z{k} .* nz + eps;
|
Chris@5
|
311
|
Chris@5
|
312
|
Chris@5
|
313 % Assign and normalize
|
Chris@5
|
314 if lh
|
Chris@5
|
315 h{k} = nh;
|
Chris@5
|
316 end
|
Chris@5
|
317 if lz
|
Chris@5
|
318 z{k} = nz;
|
Chris@5
|
319 end
|
Chris@5
|
320
|
Chris@5
|
321
|
Chris@5
|
322 end
|
Chris@5
|
323
|
Chris@5
|
324 % Normalize z over t
|
Chris@5
|
325 if lz
|
Chris@5
|
326 Z=[]; for i=1:K Z=[Z z{i}]; end;
|
Chris@5
|
327 Z = Z.^sz;
|
Chris@5
|
328 Z(1:end,1:15)=0;
|
Chris@5
|
329 Z(1:end,74:88)=0;
|
Chris@5
|
330 Z = Z./repmat(sum(Z,2),1,K); z = num2cell(Z,1); %figure; imagesc(imrotate(Z,90));
|
Chris@5
|
331 end
|
Chris@5
|
332
|
Chris@5
|
333 % Normalize u over z,t
|
Chris@5
|
334 if lu
|
Chris@5
|
335 U=[]; for r=1:R U(r,:,:) = cell2mat(u(r,:)); end;
|
Chris@5
|
336 for i=1:size(U,2) for j=1:size(U,3) U(:,i,j) = U(:,i,j).^su; U(:,i,j) = U(:,i,j) ./ sum(U(:,i,j)); end; end;
|
Chris@5
|
337 U0 = permute(U,[2 1 3]); u = squeeze(num2cell(U0,1));
|
Chris@5
|
338 end
|
Chris@5
|
339
|
Chris@5
|
340 % Normalize h over z,t
|
Chris@5
|
341 H=[]; for k=1:K H(k,:,:) = cell2mat(h(k)); end; H0 = permute(H,[2 1 3]);
|
Chris@5
|
342 for i=1:size(H0,2) for j=1:size(H0,3) H0(:,i,j) = sumx(j)* (H0(:,i,j) ./ sum(H0(:,i,j))); end; end;
|
Chris@5
|
343 h = squeeze(num2cell(squeeze(H0),[1 3])); for k=1:K h{k} = squeeze(h{k}); end;
|
Chris@5
|
344
|
Chris@5
|
345 %figure; imagesc(imrotate(xa',90));
|
Chris@5
|
346
|
Chris@5
|
347 end
|
Chris@5
|
348
|
Chris@5
|
349 %figure; imagesc(imrotate(xa',90));
|