Mercurial > hg > plosone_underreview
comparison scripts/smoothiecore.py @ 4:e50c63cf96be branch-tests
rearranging folders
author | Maria Panteli |
---|---|
date | Mon, 11 Sep 2017 11:51:50 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:230a0cf17de0 | 4:e50c63cf96be |
---|---|
1 """Smoothie | |
2 | |
3 A module aimed at melody salience and melody features. It contains | |
4 | |
5 * a frequency transform inspired by constant-Q transforms | |
6 * an NMF-based melodic salience function | |
7 * methods that transform this salience into melody features (not implemented) | |
8 """ | |
9 | |
10 import sys | |
11 import numpy as np | |
12 from scipy.signal import hann | |
13 from scipy.sparse import csr_matrix | |
14 import librosa | |
15 | |
16 p = {# smoothie q mapping parameters | |
17 'bpo' : 36, | |
18 'break1' : 500, | |
19 'break2' : 4000, | |
20 'f_max' : 8000, | |
21 'fs' : 16000, | |
22 # spectrogram parameters | |
23 'step_size' : 160, | |
24 'use_A_wtg' : False, | |
25 'harm_remove' : False, | |
26 # NMF and NMF dictionary parameters | |
27 's' : 0.85, # decay parameter | |
28 'n_harm' : 50, # I'm gonna do you no harm | |
29 'min_midi' : 25, | |
30 'max_midi' : 90.8, | |
31 'bps' : 5, | |
32 'n_iter' : 30 | |
33 } | |
34 | |
35 def update_nmf_simple(H, W, V): | |
36 """Update the gain matrix H using the multiplicative NMF update equation. | |
37 | |
38 Keyword arguments: | |
39 H -- gain matrix | |
40 W -- template matrix | |
41 V -- target data matrix | |
42 """ | |
43 WHV = np.dot(W, H) ** (-1) * V | |
44 H = H * np.dot(W.transpose(), WHV) | |
45 return H | |
46 | |
47 def make_a_weight(f): | |
48 """Return the values of A-weighting at the input frequencies f.""" | |
49 f = np.array(f) | |
50 zaehler = 12200 * 12200* (f**4) | |
51 nenner = (f*f + 20.6*20.6) * np.sqrt((f*f + 107.7*107.7) * (f*f + 737.9*737.9)) * (f*f + 12200*12200) | |
52 return zaehler / nenner | |
53 | |
54 def nextpow2(x): | |
55 """Return the smallest integer n such that 2 ** n > x.""" | |
56 return np.ceil(np.log2(x)) | |
57 | |
58 def get_smoothie_frequencies(p): | |
59 """Calculate filter centres and widths for the smooth Q transform.""" | |
60 # I think this calculation should be reconsidered in order to have | |
61 # a more principled transition between linear and exponential | |
62 n = 1.0 / (2.0**(1.0/p['bpo']) - 1) | |
63 x = p['break1'] * 1.0 / n | |
64 | |
65 # first linear stretch | |
66 # f = (np.arange(1, int(np.round(n)) + 1) * x).round().tolist() | |
67 f = (np.arange(1, int(np.round(n)) + 1) * x).tolist() | |
68 | |
69 # exponential stretch | |
70 while max(f) < p['break2']: | |
71 f.append(max(f) * 2**(1.0/p['bpo'])) | |
72 | |
73 # final linear stretch | |
74 lastdiff = f[-1] - f[-2] | |
75 while max(f) < p['f_max']: | |
76 f.append(max(f) + lastdiff) | |
77 | |
78 deltaf = np.diff(np.array(f)) | |
79 f = f[:-1] | |
80 | |
81 return f, deltaf | |
82 | |
83 def create_smoothie_kernel(f, deltaf, fs): | |
84 """Create a sparse matrix that maps the complex DFT to the complex | |
85 smoothie representation. | |
86 """ | |
87 print >>sys.stdout, "[ SMOOTHIE Q kernel calculation ... ]" | |
88 n_filter = len(f) | |
89 n_fft = 2**nextpow2(np.ceil(fs/min(deltaf))) | |
90 | |
91 thresh = 0.0054 | |
92 smoothie_kernel = np.zeros([n_fft, n_filter], np.complex64) | |
93 | |
94 for i_filter in range(n_filter-1, -1, -1): # descending | |
95 # print i_filter | |
96 Q = f[i_filter] * 1.0 / deltaf[i_filter] # local Q for this filter | |
97 lgth = int(np.ceil(fs * 1.0 / deltaf[i_filter])) | |
98 lgthinv = 1.0 / lgth | |
99 offset = int(n_fft/2 - np.ceil(lgth * 0.5)) | |
100 temp = hann(lgth) * lgthinv * \ | |
101 np.exp(2j * np.pi * Q * (np.arange(lgth) - offset) * lgthinv) | |
102 # print(sum(hann(lgth)), Q, lgth, offset) | |
103 temp_kernel = np.zeros(n_fft, dtype = np.complex64) | |
104 temp_kernel[np.arange(lgth) + offset] = temp | |
105 spec_kernel = np.fft.fft(temp_kernel) | |
106 spec_kernel[abs(spec_kernel) <= thresh] = 0 | |
107 smoothie_kernel[:,i_filter] = spec_kernel | |
108 return csr_matrix(smoothie_kernel.conj()/n_fft) | |
109 | |
110 def smoothie_q_spectrogram(x, p): | |
111 """Calculate the actual spectrogram with smooth Q frequencies""" | |
112 print >>sys.stdout, "[ SMOOTHIE Q spectrogram ... ]" | |
113 | |
114 # precalculate smoothie kernel | |
115 f, deltaf = get_smoothie_frequencies(p) | |
116 smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) | |
117 n_fft, n_filter = smoothie_kernel.shape | |
118 | |
119 # some preparations | |
120 n_sample = len(x) | |
121 # n_frame = int(np.floor(n_sample / p['step_size'])) | |
122 n_frame = int(np.ceil(n_sample / float(p['step_size']))) # added mp | |
123 t = (np.arange(n_frame) * p['step_size']) * 1.0 / p['fs'] | |
124 smoothie_kernelT = smoothie_kernel.T | |
125 | |
126 # allocate | |
127 s = np.zeros((n_filter, n_frame), dtype = np.complex64) | |
128 | |
129 # pad (if wanted) | |
130 x = np.concatenate((np.zeros(n_fft/2), x, np.zeros(n_fft/2))) | |
131 | |
132 for i_frame in range(n_frame): | |
133 smpl = p['step_size'] * i_frame | |
134 block = x[smpl + np.arange(n_fft)] | |
135 s[:, i_frame] = smoothie_kernelT.dot(np.fft.fft(block)) | |
136 | |
137 if p['use_A_wtg']: | |
138 a_wtg = make_a_weight(f) | |
139 s = s * a_wtg[:, np.newaxis] | |
140 | |
141 return s, t | |
142 | |
143 def mel_triangles(input_f): | |
144 """Make matrix with mel filters at the given frequencies. | |
145 Warning: this is a very coarse mel filterbank. | |
146 """ | |
147 n_linearfilters = 3 | |
148 n_logfilters0 = 30 # just something high, will be pruned later | |
149 linear_spacing = 100 | |
150 log_spacing = 6.0/4 | |
151 n_filter0 = n_linearfilters + n_logfilters0 | |
152 | |
153 freqs = np.zeros(n_filter0+2) # includes one more on either side, hence +2 | |
154 freqs[range(n_linearfilters+1)] = \ | |
155 np.arange(-1,n_linearfilters) * linear_spacing | |
156 freqs[range(n_linearfilters+1, n_filter0+2)] = \ | |
157 freqs[n_linearfilters] * log_spacing ** np.arange(1, n_logfilters0+2) | |
158 | |
159 centre_freqs = freqs[1:-1] | |
160 lower_freqs = freqs[0:-2] | |
161 upper_freqs = freqs[2:] | |
162 | |
163 n_filter = list(np.nonzero(lower_freqs < max(input_f)))[0][-1] + 1 | |
164 | |
165 n_input_f = len(input_f) | |
166 mtr = np.zeros((n_input_f, n_filter)) | |
167 | |
168 for i_filter in range(n_filter): | |
169 for i_freq in range(n_input_f): | |
170 if input_f[i_freq] > lower_freqs[i_filter] \ | |
171 and input_f[i_freq] <= upper_freqs[i_filter]: | |
172 | |
173 if input_f[i_freq] <= centre_freqs[i_filter]: | |
174 mtr[i_freq, i_filter] = \ | |
175 (input_f[i_freq] - lower_freqs[i_filter]) * 1.0 / \ | |
176 (centre_freqs[i_filter] - lower_freqs[i_filter]) | |
177 else: | |
178 mtr[i_freq, i_filter] = \ | |
179 1 - (input_f[i_freq] - centre_freqs[i_filter]) / \ | |
180 (upper_freqs[i_filter] - centre_freqs[i_filter]) | |
181 return mtr | |
182 | |
183 def create_smoothie_nfm_dict(smoothie_kernel, filterf, p): | |
184 """Create dictionary matrix with note templates.""" | |
185 n_note = int((p['max_midi'] - p['min_midi']) * p['bps'] + 1) | |
186 n_fft, n_filter = smoothie_kernel.shape | |
187 t = ((np.arange(n_fft) + 1) - ((n_fft + 1)*0.5))/p['fs'] | |
188 | |
189 mtr = mel_triangles(filterf) | |
190 n_template = n_note + mtr.shape[1] | |
191 | |
192 w = np.zeros((n_filter, n_template), dtype = np.complex64) | |
193 w[:,n_note:] = mtr | |
194 f0s = [] | |
195 | |
196 smoothie_kernelT = smoothie_kernel.T | |
197 | |
198 for i_note in range(n_note): | |
199 midi = p['min_midi'] + i_note * 1.0 / p['bps'] | |
200 f0 = 440 * 2 ** ((midi-69)*1.0/12) | |
201 f0s.append(f0) | |
202 sig = np.zeros(len(t)) | |
203 for i_harm in range(p['n_harm']): | |
204 f = f0 * (i_harm + 1) | |
205 if f > p['fs'] * 0.5: | |
206 continue | |
207 x = np.sin(2*np.pi*f*t) * p['s']**(i_harm) | |
208 sig += x | |
209 w[:, i_note] = smoothie_kernelT.dot(np.fft.fft(sig)) | |
210 | |
211 for i in range(mtr.shape[1]): | |
212 f0s.append(np.nan) | |
213 | |
214 w = abs(w) | |
215 col_sums = w.sum(axis = 0) | |
216 w = w / col_sums[np.newaxis, :] # normalisation | |
217 return w, np.array(f0s) | |
218 | |
219 def smoothie_salience(x, p, do_tune = False): | |
220 """Calculate melodic salience.""" | |
221 print >>sys.stdout, "[ SMOOTHIE Q salience ... ]" | |
222 | |
223 # precalculate nmf kernel | |
224 f, deltaf = get_smoothie_frequencies(p) | |
225 smoothie_kernel = create_smoothie_kernel(f, deltaf, p['fs']) | |
226 w, f0s = create_smoothie_nfm_dict(smoothie_kernel, f, p) | |
227 | |
228 # calculate smoothiegram | |
229 s, t = smoothie_q_spectrogram(x, p) | |
230 s[s==0] = np.spacing(1) # very small number | |
231 s = abs(s) | |
232 | |
233 # run NMF | |
234 n_frame = len(t) | |
235 print >>sys.stdout, "[ SMOOTHIE Q : NMF updates ... ]" | |
236 sal = np.ones((w.shape[1], n_frame)) | |
237 for i_iter in range(p['n_iter']): | |
238 sal = update_nmf_simple(sal, w, s) | |
239 | |
240 if do_tune: | |
241 error("Tuning isn't yet implemented in the Python version") | |
242 | |
243 return sal, t, f0s | |
244 | |
245 def qnormalise(x, q, dim): | |
246 nrm = np.sum(x**q, axis=dim, keepdims=True)**(1./float(q)) | |
247 nrmmatrix = np.repeat(nrm, x.shape[0], axis=0) | |
248 x = x / nrmmatrix | |
249 return x | |
250 | |
251 def wrap_to_octave(sal, param): | |
252 epsilon = 0.00001 | |
253 nsal = qnormalise(sal, 2, 0) | |
254 | |
255 step = 1./float(param['bps']) | |
256 NMFpitch = np.arange(param['min_midi'],param['max_midi']+step,step) | |
257 nNote = len(NMFpitch) | |
258 nsal = nsal[0:nNote, :] | |
259 | |
260 allsal = nsal | |
261 allsal[nsal<epsilon] = epsilon | |
262 | |
263 # chroma mapping | |
264 n = param['bps']*12 | |
265 mmap = np.tile(np.eye(n),(1,5)) | |
266 allchroma = mmap.dot(allsal[0:(n*5),:]) | |
267 return allchroma | |
268 | |
269 def segment_audio(y, sr): | |
270 tracklength = y.shape[0]/float(sr) | |
271 startSample = 0 | |
272 endSample = None | |
273 if tracklength > 90: | |
274 startPointSec = (tracklength/2.)-20 | |
275 startSample = round(startPointSec*sr) | |
276 endSample = startSample+45*sr | |
277 y = y[startSample:endSample] | |
278 return y | |
279 | |
280 def get_smoothie(filename = 'test.wav', param = None, segment=True, secondframedecomp=False, hopinsec=None): | |
281 if param is None: | |
282 param = p | |
283 param['fs'] = None | |
284 param['step_size'] = 128 | |
285 | |
286 y, sr = librosa.load(filename, sr = param['fs']) | |
287 param['fs'] = sr | |
288 if hopinsec is not None: | |
289 param['step_size'] = int(round(hopinsec*param['fs'])) | |
290 if segment: | |
291 y = segment_audio(y,sr) | |
292 # sg, t = smoothie_q_spectrogram(y, param) | |
293 # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) | |
294 sal, t, f0s = smoothie_salience(y, param) | |
295 sal = sal[-np.isnan(f0s),:] | |
296 chroma = wrap_to_octave(sal, param) | |
297 | |
298 if secondframedecomp: | |
299 chromasr = param['fs']/float(param['step_size']) | |
300 win2 = int(round(8*chromasr)) | |
301 hop2 = int(round(0.5*chromasr)) | |
302 nbins, norigframes = chroma.shape | |
303 nframes = int(1+np.floor((norigframes-win2)/float(hop2))) | |
304 avechroma = np.empty((nbins, nframes)) | |
305 for i in range(nframes): # loop over all 8-sec frames | |
306 avechroma[:,i] = np.mean(chroma[:, (i*hop2):(i*hop2+win2)], axis=1, keepdims=True) | |
307 chroma = avechroma | |
308 return chroma | |
309 | |
310 def get_smoothie_for_bihist(filename = 'test.wav', param = None, segment=True, hopinsec=None): | |
311 if param is None: | |
312 param = p | |
313 param['fs'] = None # no downsample | |
314 param['step_size'] = 128 | |
315 | |
316 y, sr = librosa.load(filename, sr = param['fs']) | |
317 param['fs'] = sr | |
318 if hopinsec is not None: | |
319 param['step_size'] = int(round(hopinsec*param['fs'])) | |
320 if segment: | |
321 y = segment_audio(y,sr) | |
322 # sg, t = smoothie_q_spectrogram(y, param) | |
323 # nmf_dict = create_smoothie_nfm_dict(smoothie_kernel, f, param) | |
324 sal, t, f0s = smoothie_salience(y, param) | |
325 sal = sal[-np.isnan(f0s),:] | |
326 chroma = wrap_to_octave(sal, param) | |
327 chromasr = param['fs']/float(param['step_size']) | |
328 return (chroma, chromasr) |