# HG changeset patch # User Chris Cannam # Date 1395859752 0 # Node ID f1f8c84339d07de5c91ee9c0c74ff6e80966afb3 # Parent bbc8185ba511e87d7c2d5b70b0d376bdf6de422a Starting to revisit this EM logic and test in a single-column world diff -r bbc8185ba511 -r f1f8c84339d0 notes/em.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/notes/em.txt Wed Mar 26 18:49:12 2014 +0000 @@ -0,0 +1,63 @@ + +I agree with you - having a look at a model that does not support +convolution would help. You'll find attached 'hnmf.m', which is +essentially the same model without convolution. So the simplified model +is: P(w,t) = P(t) \sum_{s,p} P(w|p,s)P(p|t)P(s|p,t) + +Also, a more recent (and much more efficient) version of the CMJ system +converts the model from a convolutive to a linear one, but still keeping +the shift-invariance support. That is achieved by having a pre-extracted +4-D dictionary that also supports templates that are pre-shifted across +log-frequency (so that the system would not need to compute the +convolutions during the EM step). I have uploaded the source code on +SoundSoftware [i], and you can find the related paper in [ii]. This +system has the exact same performance with the CMJ one, but is much +easier to understand/implement, and is over 50 times faster. + +[i] https://code.soundsoftware.ac.uk/projects/amt_mssiplca_fast +[ii] http://www.ecmlpkdd2013.org/wp-content/uploads/2013/09/MLMU_benetos.pdf + +> In eqn 12, +> Pt(p) = +> sum[w,f,s] ( P(p,f,s|w,t) Vw,t ) / +> sum[p,w,f,s] ( P(p,f,s|w,t) Vw,t ) +> +> P(p,f,s|w,t) is the result of the E-step (and a time-frequency +> distribution), and Vw,t is the input spectrogram (also a +> time-frequency distribution), right? + +Right! Basically, P(p,f,s|w,t) is a 5-dimensional matrix, essentially +the model without the sums (the sums convert P(p,f,s|w,t) into a 2-D +matrix P(w,t)). + +> So I read this as something like: update the pitch probability +> distribution for time t so that its value for a pitch p is the ratio +> of the sum of the expression P(p,f,s|w,t) Vw,t for *that* pitch +> variable to the sum of the same expression across *all* pitch +> variables. + +The equation you put essentially takes the 5-dimensional quantity +P(p,f,s|w,t) Vw,t and marginalises it to P(p,t), i.e. it sums over all +other dimensions. All these 'unknown' parameters, e.g. P(s|p,t), are +generated from this 5-dimensional posterior distribution. + +> But what does it mean to refer to P(p,f,s|w,t) for a single pitch +> variable, given that P(p,f,s|w,t) is just a time-frequency +> distribution? There doesn't seem to be any dependence on p in it. I +> think this is where I'm missing the (hopefully obvious) fundamental +> thing. + +P(p,f,s|w,t) is not a time-frequency distribution; it is a 5-dimensional +posterior distribution of the 3 unknown model parameters given time and +frequency. + +The basic concept of EM is that you have a latent variable in your +model, e.g. p; in the E-step, you compute the posterior given the +known/input data (e.g. P(p|w,t)). For the M-step, you compute the +complete likelihood given the original input, P(p|w,t)Vwt; and you +marginalise over the variables that you don't care about, e.g. if you +want to find P(p|t), you compute \sum_w P(p|w,t)V_wt; finally, you +normalise that result according to your model, so if your model has a +P(p|t) component, you normalise so that P(p) for a given timeframe sums +to one (this is the denominator in the equation you showed). + diff -r bbc8185ba511 -r f1f8c84339d0 notes/hnmf.m --- a/notes/hnmf.m Wed Mar 26 11:44:51 2014 +0000 +++ b/notes/hnmf.m Wed Mar 26 18:49:12 2014 +0000 @@ -68,7 +68,7 @@ for it = 1:iter % E-step - zh = z .* permute(repmat(h,[1 1 R]),[3 1 2]); + zh = z .* permute(repmat(h,[1 1 R]),[3 1 2]); %% z is the source activation distribution, h the component (pitch) activation xa=eps; for r=1:R for k=1:K diff -r bbc8185ba511 -r f1f8c84339d0 yeti/em_onecolumn.yeti --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/yeti/em_onecolumn.yeti Wed Mar 26 18:49:12 2014 +0000 @@ -0,0 +1,89 @@ + +module em_onecolumn; + +mm = load may.mathmisc; +vec = load may.vector; +mat = load may.matrix; + +inRange ranges instrument note = + note >= ranges[instrument].lowest and note <= ranges[instrument].highest; + +initialise ranges templates notes = + (instruments = keys ranges; + { + pitches = // z in the original. 1 per note + vec.randoms notes, + sources = // u in the original. 1 per note-instrument + mapIntoHash id + do instrument: + vec.fromList + (map do note: + if inRange ranges instrument note then 1 else 0 fi + done [0..notes-1]) + done instruments, + instruments, + templates, + ranges, + lowest = head + (sort (map do i: ranges[i].lowest done instruments)), + highest = head (reverse + (sort (map do i: ranges[i].highest done instruments))), + }); + +epsilon = 1e-16; + +select predicate = concatMap do v: if predicate v then [v] else [] fi done; + +distributionsFor data instrument note = + { + w = mat.getColumn note data.templates[instrument], + p = vec.at data.pitches note, + s = vec.at data.sources[instrument] note, + }; + +performExpectation data column = + (estimate = + fold do acc instrument: + fold do acc note: + { w, p, s } = distributionsFor data instrument note; + vec.add [acc, vec.scaled (p * s) w]; + done acc [data.ranges[instrument].lowest .. + data.ranges[instrument].highest] + done (vec.consts epsilon (vec.length column)) data.instruments; + vec.divide column estimate); + +performMaximisation data column q = + (pitchTop = vec.fromList + (map do note: + fold do acc instrument: + { w, p, s } = distributionsFor data instrument note; + fold do acc bin: + acc + s * (vec.at w bin) * (vec.at column bin); + done acc [0..vec.length column - 1] + done epsilon data.instruments; + done [data.lowest .. data.highest]); + pitchBottom = vec.sum pitchTop; + +/* sourceTop = vec.fromList + (map do instrument: + fold do acc note: + { w, p, s } = distributionsFor data instrument note; + fold do acc bin: + acc + p * (vec.at w bin) * (vec.at column bin); + done acc [0..vec.length column - 1] + done epsilon [data.lowest .. data.highest] + done data.instruments); + sourceBottom = vec.sum sourceTop; +*/ + data with { + pitches = vec.divideBy pitchBottom pitchTop, +// sources = vec.divideBy sourceBottom sourceTop + }); + +{ + initialise, + performExpectation, + performMaximisation, +} + + diff -r bbc8185ba511 -r f1f8c84339d0 yeti/run.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/yeti/run.sh Wed Mar 26 18:49:12 2014 +0000 @@ -0,0 +1,3 @@ +#!/bin/bash + +CLASSPATH=../../constant-q-cpp/yeti/cqt.jar ../../may/bin/yc ./silvet.yeti diff -r bbc8185ba511 -r f1f8c84339d0 yeti/silvet.yeti --- a/yeti/silvet.yeti Wed Mar 26 11:44:51 2014 +0000 +++ b/yeti/silvet.yeti Wed Mar 26 18:49:12 2014 +0000 @@ -5,6 +5,7 @@ { loadTemplates, extractRanges } = load templates; em = load em; +em1 = load em_onecolumn; mat = load may.matrix; vec = load may.vector; @@ -22,6 +23,7 @@ columns = prepareTimeFrequency "test.wav"; +/* height = if empty? columns then 0 else vec.length (head columns) fi; chunkSize = { rows = height, columns = 100 }; @@ -58,6 +60,25 @@ \() (plot.plot [ Grid error ]); \() (plot.plot [ Grid newP ]); +*/ + +em1data = em1.initialise ranges templates 88; + +col = head (drop 35 columns); + +\() (plot.plot [ Vector col ]); + +oneIteration em1data col = + (q = em1.performExpectation em1data col; + \() (plot.plot [ Vector col, Vector q ]); + newdata = em1.performMaximisation em1data col q; + \() (plot.plot [ Vector (em1data.pitches), Vector (newdata.pitches) ]); + newdata); + +em1data = oneIteration em1data col; +em1data = oneIteration em1data col; +em1data = oneIteration em1data col; + (); diff -r bbc8185ba511 -r f1f8c84339d0 yeti/test.wav Binary file yeti/test.wav has changed