# HG changeset patch # User Chris Cannam # Date 1395232846 0 # Node ID ed9e20b6b165454d379c37c63349eaefd15557ad # Parent 155a50cf91f5815234ba656c73ce475fe03e6c03 Begin annotating the m files diff -r 155a50cf91f5 -r ed9e20b6b165 notes/cplcaMT-annotated.m --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/notes/cplcaMT-annotated.m Wed Mar 19 12:40:46 2014 +0000 @@ -0,0 +1,230 @@ +function [w,h,z,u,xa] = cplcaMT( x, K, T, R, w, h, z, u, iter, sw, sh, sz, su, lw, lh, lz, lu, pa) +% function [w,h,xa2] = cplcaMT( x, K, T, R, w, h, z, u, iter, sw, sh, sz, su, lw, lh, lz, lu) +% +% Perform multiple-source, multiple-template SIPLCA for transcription +% +% Inputs: +% x input distribution +% K number of components +% T size of components +% R size of sources +% w initial value of p(w) [default = random] +% h initial value of p(h) [default = random] +% z initial value of p(z) [default = random] +% iter number of EM iterations [default = 10] +% sw sparsity parameter for w [default = 1] +% sh sparsity parameter for h [default = 1] +% sz sparsity parameter for z [default = 1] +% lw flag to update w [default = 1] +% lh flag to update h [default = 1] +% lh flag to update h [default = 1] +% pa source-component activity range [Rx2] +% +% Outputs: +% w p(w) - spectral bases +% h p(h) - pitch impulse +% z p(z) - mixing matrix for p(h) +% xa approximation of input + +% Emmanouil Benetos 2011, based on cplca code by Paris Smaragdis + + +%% for the transcription application, +%% x -> noise-reduced constant Q. In the application this is a 2-sec, +%% 100-col segment with 2 zeros at top and bottom, so 549x100 +%% K -> 88, number of notes +%% T -> [545 1], a two-element array: 545 is the length of each +%% template, but why 1? +%% R -> 10, number of instruments +%% w -> a 10x88 cell array, in which w{instrument,note} is a 545x1 +%% array containing the template for the given instrument and note +%% number +%% h -> empty +%% z -> empty +%% u -> empty +%% iter -> a parameter for the program, 12 in the mirex submission +%% sw -> 1 +%% sh -> 1 +%% sz -> 1.1 +%% su -> 1.3, not documented above, presumably sparsity for u +%% lw -> 0, don't update w +%% lh -> 1, do update h +%% lz -> 1, do update z +%% lu -> 1, not documented above, presumably do update u +%% pa -> a 10x2 array in which pa(instrument,1) is the lowest note +%% expected for that instrument and pa(instrument,2) is the highest + + +% Sort out the sizes + +wc = 2*size(x)-T; %% works out to 553x199 +hc = size(x)+T-1; %% works out to 1093x100 + +% Default training iterations +if ~exist( 'iter') + iter = 10; +end + + +% Initialize +sumx = sum(x); %% for later normalisation + +if ~exist( 'w') || isempty( w) + %% doesn't happen, w was provided (it's the template data) + w = cell(R, K); + for k = 1:K + for r=1:R + w{r,k} = rand( T); + w{r,k} = w{r,k} / sum( w{r,k}(:)); + end + end +end +if ~exist( 'h') || isempty( h) + %% does happen, h was not provided + h = cell(1, K); + for k = 1:K + h{k} = rand( size(x)-T+1); + h{k} = h{k}; + end + %% h is now a 1x88 cell, h{note} is a 5x100 array of random values. + %% The 5 comes from the height of the CQ array minus the length of + %% a template, plus 1. I guess this is space to allow for the + %% 5-bins-per-semitone pitch shift. +end +if ~exist( 'z') || isempty( z) + %% does happen, z was not provided + z = cell(1, K); + for k = 1:K + z{k} = rand( size(x,2),1); + z{k} = z{k}; + end + %% z is a 1x88 cell, z{note} is a 100x1 array of random values. +end +if ~exist( 'u') || isempty( u) + %% does happen, u was not provided + u = cell(R, K); + for k = 1:K + for r=1:R + if( (pa(r,1) <= k && k <= pa(r,2)) ) + u{r,k} = ones( size(x,2),1); + else + u{r,k} = zeros( size(x,2),1); + end + end; + end + %% u is a 10x88 cell, u{instrument,note} is a 100x1 double containing + %% all 1s if note is in-range for instrument and all 0s otherwise +end + +fh = cell(1, K); +fw = cell(R, K); +for k = 1:K + fh{k} = ones(wc) + 1i*ones(wc); + for r=1:R + fw{r,k} = ones(wc) + 1i*ones(wc); + end; +end; + + + +% Make commands for subsequent multidim operations and initialize fw +fnh = 'c(hc(1)-(T(1)+(1:size(h{k},1))-2),hc(2)-(T(2)+(1:size(h{k},2))-2))'; +xai = 'xa(1:size(x,1),1:size(x,2))'; +flz = 'xbar(end:-1:1,end:-1:1)'; + +for k = 1:K + for r=1:R + if( (pa(r,1) <= k && k <= pa(r,2)) ) + fw{r,k} = fftn( w{r,k}, wc); + end; + end; +end; + +% Iterate +for it = 1:iter + + %disp(['Iteration: ' num2str(it)]); + + % E-step + xa = eps; + for k = 16:73 + fh{k} = fftn( h{k}, wc); + for r=1:R + if( (pa(r,1) <= k && k <= pa(r,2)) ) + xa1 = abs( real( ifftn( fw{r,k} .* fh{k}))); + xa = xa + xa1(1:size(x,1),1:size(x,2)) .*repmat(z{k},1,size(x,1))'.*repmat(u{r,k},1,size(x,1))'; + clear xa1; + end + end + end + + xbar = x ./ xa; + xbar = eval( flz); + fx = fftn( xbar, wc); + + + % M-step + for k = 16:73 + + + % Update h, z, u + nh=eps; + for r=1:R + if( (pa(r,1) <= k && k <= pa(r,2)) ) + c = abs( real( ifftn( fx .* fw{r,k} ))); + nh1 = eval( fnh); + nh1 = nh1 .*repmat(u{r,k},1,size(h{k},1))'; + nh = nh + nh1; + + nhu = eval( fnh); + nhu = nhu .* h{k}; + nu = sum(nhu)'; + nu = u{r,k} .* nu + eps; + if lu + u{r,k} = nu; + end; + + end; + end + nh = h{k} .* (nh.^sh); + nz = sum(nh)'; + nz = z{k} .* nz + eps; + + + % Assign and normalize + if lh + h{k} = nh; + end + if lz + z{k} = nz; + end + + + end + + % Normalize z over t + if lz + Z=[]; for i=1:K Z=[Z z{i}]; end; + Z = Z.^sz; + Z(1:end,1:15)=0; + Z(1:end,74:88)=0; + Z = Z./repmat(sum(Z,2),1,K); z = num2cell(Z,1); %figure; imagesc(imrotate(Z,90)); + end + + % Normalize u over z,t + if lu + U=[]; for r=1:R U(r,:,:) = cell2mat(u(r,:)); end; + for i=1:size(U,2) for j=1:size(U,3) U(:,i,j) = U(:,i,j).^su; U(:,i,j) = U(:,i,j) ./ sum(U(:,i,j)); end; end; + U0 = permute(U,[2 1 3]); u = squeeze(num2cell(U0,1)); + end + + % Normalize h over z,t + H=[]; for k=1:K H(k,:,:) = cell2mat(h(k)); end; H0 = permute(H,[2 1 3]); + for i=1:size(H0,2) for j=1:size(H0,3) H0(:,i,j) = sumx(j)* (H0(:,i,j) ./ sum(H0(:,i,j))); end; end; + h = squeeze(num2cell(squeeze(H0),[1 3])); for k=1:K h{k} = squeeze(h{k}); end; + + %figure; imagesc(imrotate(xa',90)); + +end + +%figure; imagesc(imrotate(xa',90)); diff -r 155a50cf91f5 -r ed9e20b6b165 notes/transcriptionMultipleTemplates-annotated.m --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/notes/transcriptionMultipleTemplates-annotated.m Wed Mar 19 12:40:46 2014 +0000 @@ -0,0 +1,100 @@ +function [ph pz sumY] = transcriptionMultipleTemplates(filename,iter,sz,su) + + +% Load note templates +load('noteTemplatesBassoon'); W(:,:,1) = noteTemplatesBassoon; +load('noteTemplatesCello'); W(:,:,2) = noteTemplatesCello; +load('noteTemplatesClarinet'); W(:,:,3) = noteTemplatesClarinet; +load('noteTemplatesFlute'); W(:,:,4) = noteTemplatesFlute; +load('noteTemplatesGuitar'); W(:,:,5) = noteTemplatesGuitar; +load('noteTemplatesHorn'); W(:,:,6) = noteTemplatesHorn; +load('noteTemplatesOboe'); W(:,:,7) = noteTemplatesOboe; +load('noteTemplatesTenorSax'); W(:,:,8) = noteTemplatesTenorSax; +load('noteTemplatesViolin'); W(:,:,9) = noteTemplatesViolin; +load('noteTemplatesSptkBGCl'); W(:,:,10) = noteTemplatesSptkBGCl; + + +%pitchActivity = [14 16 30 40 20 21 38 24 35 1; 52 61 69 76 56 57 71 55 80 88]'; +pitchActivity = [16 16 30 40 20 21 38 24 35 16; 52 61 69 73 56 57 71 55 73 73]'; + + +%% this turns W0 into a 10x88 cell array in which W0{instrument}{note} +%% is the 545x1 template for the given instrument and note number. +W = permute(W,[2 1 3]); +W0 = squeeze(num2cell(W,1))'; + +clear('noteTemplatesBassoon','noteTemplatesCello','noteTemplatesClarinet','noteTemplatesFlute','noteTemplatesGuitar',... + 'noteTemplatesHorn','noteTemplatesOboe','noteTemplatesTenorSax','noteTemplatesViolin','noteTemplatesSptkBGCl','W'); + + +% Compute CQT + +%% The CQT parameters are hardcoded in computeCQT. It has frequency +%% range 27.5 -> samplerate/3, 60 bins per octave, a 'q' of 0.8 (lower +%% than the maximum, and default, value of 1), 'atomHopFactor' 0.3 +%% rather than the default 0.25 (why?), Hann window, default sparsity +%% threshold. + +%% for a 43.5 second 44.1 KHz audio file, intCQT will be a 545x30941 +%% array, one column every 0.0014 seconds. +[intCQT] = computeCQT(filename); + +%% X is sampled from intCQT at 7.1128-column intervals, giving +%% 4350x545 in this case, so clearly 100 columns per second; then +%% transposed +X = intCQT(:,round(1:7.1128:size(intCQT,2)))'; + +%% median filter to reduce noise -- I think this is essentially the +%% same as Xue's method for devuvuzelation +noiseLevel1 = medfilt1(X',40); +noiseLevel2 = medfilt1(min(X',noiseLevel1),40); +X = max(X-noiseLevel2',0); + +%% take every 4th row. We had 100 per second (10ms) so this is 40ms as +%% the comment says. I am guessing we denoised at a higher resolution +%% for better denoising, though still not at the original resolution, +%% for speed. Y is now 1088x545 in our example and looks pretty clean +%% as a contour plot. +Y = X(1:4:size(X,1),:); % 40ms step + +%% a 1x1088 array containing the sum of each column. Doesn't appear to +%% be used. +sumY = sum(Y'); + +clear('intCQT','X','noiseLevel1','noiseLevel2'); + +fprintf('%s','done'); +fprintf('\n'); +fprintf('%s',['Estimating F0s...........']); + +% For each 2sec segment, perform SIPLCA with fixed W0 +ph = zeros(440,size(Y,1)); +pz = zeros(88,size(Y,1)); + +for j=1:floor(size(Y,1)/100) + + x=[zeros(2,100); Y(1+(j-1)*100:j*100,:)'; zeros(2,100)]; + [w,h,z,u,xa] = cplcaMT( x, 88, [545 1], 10, W0, [], [], [], iter, 1, 1, sz, su, 0, 1, 1, 1, pitchActivity); + + H=[]; for i=1:88 H=[H; h{i}]; end; + ph(:,1+(j-1)*100:j*100) = H; + Z=[]; for i=1:88 Z=[Z z{i}]; end; + pz(:,1+(j-1)*100:j*100) = Z'; + perc = 100*(j/(floor(size(Y,1)/100)+1)); + fprintf('\n'); + fprintf('%.2f%% complete',perc); +end; + +len=size(Y,1)-j*100; % Final segment + +if (len >0) +x=[zeros(2,len); Y(1+j*100:end,:)'; zeros(2,len)]; +[w,h,z,u,xa] = cplcaMT( x, 88, [545 1], 10, W0, [], [], [], iter, 1, 1, sz, su, 0, 1, 1, 1, pitchActivity); +fprintf('\n'); +fprintf('100%% complete'); + +H=[]; for i=1:88 H=[H; h{i}]; end; +ph(:,1+j*100:end) = H; +Z=[]; for i=1:88 Z=[Z z{i}]; end; +pz(:,1+j*100:end) = Z'; +end;