dml-open-cliopatria: dml-cla/python/similarity.py @ 0:718306e29690 (tip)
commit: commiting public release
author: Daniel Wolff
date:   Tue, 09 Feb 2016 21:05:06 +0100
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__ = 'wolffd'

# This script derives pairwise similarity measures for the audio features provided
# (chroma, MFCC, chord sequences). As a first experiment, only the mean chroma
# vectors per piece are compared, using the euclidean distance.

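# Illustrative call sketch (assumed from the option keys read in per_file below,
# not part of the original documentation): the API is expected to invoke
#
#   result = per_file(inputs, opts)
#
# with an options dict such as
#
#   opts = { 'sim_type':       'euclidean',   # or 'compression'
#            'sim_clusters':   40,            # codebook size for quantisation
#            'sim_downsample': 1,             # 1 = downsample features, 0 = off
#            'sim_reclimit':   50,            # max. number of recordings compared
#            'sim_compressor': 'zlib',        # or 'zxd', 'zbs', 'zvcd', 'zvcz', 'zdiff'
#            'sim_features':   'chromagram,mfcc,chords' }
#
# where each element of inputs provides the keys 'chromagram', 'mfcc', 'keys',
# 'chords' (tagged CSV feature files) and 'list' (the clip URI).
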
# parameters to be forwarded to API

# similarity type:
# euclidean, compression
# simtype = 'euclidean'

# parallelisation
num_cores = 10


#min_clusters = 40   ## unused
#max_clusters = 256  ## unused

#set_clusters = 40
#max_clips = 50
encoding = 'binary'
#compressor = 'zxd'
mds_init_tries = 4
mds_max_iter = 100
mfccbins = 12

# resample chroma / timbre values by this factor for the compression distance; 0 switches it off.
# We want roughly one vector per second: the standard sample rate and window size
# are 44100 Hz / 1024 samples for chroma / timbre, i.e. about 43 frames per second.
# Whether resampling is applied depends on the sim_downsample option.
resample_factor = 44100 // 1024


from rdflib import RDF, RDFS
from csvutils import *
from aggregate import *
from n3Parser import get_rdf_graph_from_n3

# numpy, scipy
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from scipy.signal import resample

# scikit-learn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import manifold


# chord processing
from chord_seq_key_relative import chords_from_csv, keys_from_csv, chord2function, fun2txt, fun2num, most_frequent_key, chord_roots, type_labels

# subprocess, command line and threading
import os, tempfile
import subprocess, threading

# for system / compression calls
import zlib


def chroma_from_csv(filename):
    # we assume CSV: time, chroma 1, ... chroma 12
    # return (time, [chroma 1-12])
    return csv_map_rows(filename, 13, lambda row: (float(row[0]), np.array(row[1:13], dtype=float)))

def mfcc_from_csv(filename):
    # we assume CSV: time, mfcc 1, ... mfcc 20
    # return (time, [mfcc 1-20])
    return csv_map_rows(filename, 21, lambda row: (float(row[0]), np.array(row[1:21], dtype=float)))


chroma_parser_table = { 'csv': chroma_from_csv }
mfcc_parser_table = { 'csv': mfcc_from_csv }

# in chord_seq_relative
key_parser_table = { 'csv': keys_from_csv }
chord_parser_table = { 'csv': chords_from_csv }

## generate global dict of chord_keys
chord_keys = []
for chordnum in range(1, 12+1):
    for typenum in range(1, 11+1):
        chord_keys.append("%02d%02d" % (chordnum, typenum))
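# e.g. chord_keys == ['0101', '0102', ..., '1211']:
# 12 chord roots x 11 chord types = 132 histogram bins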

def per_file(inputs, opts={}):
    chromas = []
    chromas_idx = []
    mfccs = []
    mfccs_idx = []
    chords = []
    chords_idx = []
    uris = []

    # get options from API
    # print_status(str(opts))
    simtype = opts['sim_type']
    set_clusters = opts['sim_clusters']   # def 40
    downsample = opts['sim_downsample']   # def 1
    limit = opts['sim_reclimit']          # def 50
    compressor = opts['sim_compressor']   # def 'zlib'

    # parse feature list
    features = opts['sim_features'].split(',')   # features, def: chroma
    use_chromagram = any(ext in 'chromagram' for ext in features)
    use_mfcc = any(ext in 'mfcc' for ext in features)
    use_chords = any(ext in 'chords' for ext in features)

    # check number of inputs
    if len(inputs) > limit:
        #return { 'error': ''}
        print_status('Similarity: Too many inputs, truncating collection')
        inputs = inputs[0:limit]


    # accumulation for the euclidean distance just gets the mean values over the whole clips
    # todo: add std and other statistics?
    def accum_euclidean(item):
        # accumulate chroma vectors for this piece
        if use_chromagram:
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram'])]
            # print_status('Chroma Raw Data' + str(chroma))
            # get mean chroma vector
            chroma_mean = np.mean(np.array(chroma), axis=0)
            #print_status('Chroma Means' + str(chroma_mean))

            # add vector to chromas table
            chromas.append(chroma_mean)

        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc'])]
            mfcc_mean = np.mean(np.array(mfcc), axis=0)
            mfccs.append(mfcc_mean)

        if use_chords:
            # get the key sequence for this clip
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time,key,mode,fun,typ,bfun))

                    # get the root note of the chord and chord type,
                    # ignoring mode and base note
                    # format of num: [1x mode, 2x function, 2x type, 2x base note]
                    relchords.append(str(num)[1:5])
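                    # Illustrative example, assuming fun2num packs the digits
                    # exactly as the format above states: num = 1050302 would
                    # encode mode 1, function 05, type 03, base note 02, and
                    # str(num)[1:5] == '0503' then matches a key in chord_keys.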

            # append histogram of all chords for this recording
            hist = chord_histogram(relchords)
            chords.append(hist)


        # add uri if everything went well
        uris.append(item['list'])

    # accumulation for compression:
    # save all chroma vectors
    # possibly build a codebook
    # otherwise compare for quantisation

    def accum_compression(item):

        # get chromas
        if use_chromagram:
            # accumulate chroma vectors for this piece
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram'])]
            # print_status('Chroma Raw Data' + str(chroma))

            # downsample if necessary
            if downsample == 1:
                #chroma = resample(chroma, len(chroma)//resample_factor, axis=0, window=None)
                #chroma = [chroma[i] for i in np.random.randint(0,len(chroma),len(chroma)//resample_factor)]
                chroma = [chroma[i*resample_factor] for i in range(0, len(chroma)//resample_factor)]
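                # e.g. with resample_factor = 43, a clip of 4300 chroma frames
                # (roughly 100 s of audio) keeps every 43rd frame, i.e. about
                # one vector per second.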

            chromas.extend(chroma)
            chromas_idx.append(len(chromas))


        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc'])]

            if downsample == 1:
                #mfcc = [mfcc[i] for i in np.random.randint(0,len(mfcc),len(mfcc)//resample_factor)]
                mfcc = [mfcc[i*resample_factor] for i in range(0, len(mfcc)//resample_factor)]
            mfccs.extend(mfcc)
            mfccs_idx.append(len(mfccs))

        if use_chords:
            # get the key sequence for this clip
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time,key,mode,fun,typ,bfun))

                    # get the root note of the chord and chord type,
                    # ignoring mode and base note
                    # format of num: [1x mode, 2x function, 2x type, 2x base note]
                    relchords.append(int(str(num)[1:5]))

            # append the chord sequence of this recording
            #hist = chord_histogram(relchords)
            chords.extend(relchords)
            chords_idx.append(len(chords))

        # add uri if everything went well
        uris.append(item['list'])


    # ---
    # this is the euclidean distance
    # ---
    if (simtype == 'euclidean'):
        # accumulate over all inputs
        st = for_each(inputs, accum_euclidean)

        # concatenate feature input for all features
        arr = np.empty((len(uris), 0), float)
        # concatenate data to nparray for euclidean distance
        if use_chromagram:
            arr = np.append(arr, np.array(chromas), axis=1)

        if use_mfcc:
            arr = np.append(arr, np.array(mfccs), axis=1)

        if use_chords:
            # get chord dictionaries
            #print(str(np.array(chords).shape))
            arr = np.append(arr, np.array(chords), axis=1)

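        # With all three features enabled, each row of arr describes one clip;
        # with the parsers above this gives 12 chroma + 20 MFCC + 132 chord
        # histogram values, i.e. a 164-dimensional vector (illustrative count).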
        #dist = distance.pdist(chromas, 'euclidean')
        dist = pairwise_distances(arr, metric='euclidean', n_jobs=num_cores)

        # keep the full (non-condensed) distance matrix for simplicity;
        # it could be condensed (and expanded again) with distance.squareform
        # for memory efficiency
        #dist = distance.squareform(dist)

    # ---
    # this is the normalised compression distance
    # ---
    elif (simtype == 'compression'):
        # accumulate over all inputs
        print_status('Similarity Module: Accumulating')
        st = for_each(inputs, accum_compression)

        dist = np.zeros((len(uris), len(uris)))
        count = 0
        if use_chromagram:
            print_status('Similarity Module: Chroma Quantisation')
            chromas_coded = vector_quantisation(np.array(chromas), set_clusters, num_cores)
            print_status('Similarity Module: Chroma Compression Results')
            dist += similarity_by_mask(chromas_coded, chromas_idx, compressor, encoding)
            count += 1

        if use_mfcc:
            print_status('Similarity Module: MFCC Quantisation')
            mfccs_coded = vector_quantisation(np.array(mfccs), set_clusters, num_cores)
            print_status('Similarity Module: MFCC Compression Results')
            dist += similarity_by_mask(mfccs_coded, mfccs_idx, compressor, encoding)
            count += 1

        if use_chords:
            print_status('Similarity Module: Chord Compression Results')
            dist += similarity_by_mask(np.array(chords), chords_idx, compressor, encoding)
            count += 1

        dist = dist / count


    # get rid of zeros in between
    #for idx1 in range(0,len(chromas_idx)):
    #    dist[idx1][idx1] = 1

    print_status('dist' + str(dist))

    # Do MDS scaling with precomputed distance
    mds = manifold.MDS(n_components=2, max_iter=mds_max_iter, n_init=mds_init_tries, dissimilarity='precomputed')

    coordinates = mds.fit_transform(dist)
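    # coordinates is a (len(uris), 2) array: one 2-D MDS embedding point per
    # clip, in the same order as uris.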

    return { 'result': { 'list': uris, 'mds': coordinates.tolist()}, 'stats': st }
    # return { 'result': { 'list': uris, 'distance': dist.tolist(), 'mds': coordinates.tolist()},
    #          'stats' : st }



def vector_quantisation(data, set_clusters, num_cores):
    # ---
    # build codebook!
    # ---
    # --- 1 quantise chroma data
    # --- 1a use scikit-learn k-means
    #        http://scikit-learn.org/stable/modules/clustering.html

    # quick try
    clusterer = KMeans(n_clusters=set_clusters, n_jobs=num_cores)

    # --- 2 get compression distance
    # get all single compressed sizes?
    data_coded = clusterer.fit_predict(data)
    #print_status('Chromas Coded' + str(chromas_coded))
    # print_status('Coding Histogram' + str(np.histogram(chromas_coded)))
    return data_coded
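# Shape sketch for vector_quantisation: data is an (n_frames, n_dims) feature
# array; the returned cluster labels form an (n_frames,) integer array with
# values in [0, set_clusters), which is what gets compressed below.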



def similarity_by_mask(data_coded, data_idx, compressor, encoding):

    # data_idx is expected to start with the first frame index of the second piece,
    # i.e. it holds the cumulative end index of each piece in data_coded
    # TODO: check indexing (starts at 0 or 1?)
    lengths = []
    start_idx = [0] + data_idx[:-1]
    dist = np.zeros((len(data_idx), len(data_idx)))
    for idx1 in range(0, len(data_idx)):
        for idx2 in range(0, len(data_idx)):
            if (idx2 < idx1):
                # select encoded chromas for the clips
                data1_mask = np.zeros(len(data_coded), dtype=bool)
                data1_mask[start_idx[idx1]:data_idx[idx1]-1] = True

                data2_mask = np.zeros(len(data_coded), dtype=bool)
                data2_mask[start_idx[idx2]:data_idx[idx2]-1] = True

                a_coded = encode(data_coded[data1_mask], format=encoding)
                b_coded = encode(data_coded[data2_mask], format=encoding)
                # get compression lengths
                if compressor == 'zlib':
                    (a, b, ab) = compressed_length(a_coded, b_coded, compressor)

                else:
                    # get complement chroma set
                    ref_mask = ~data1_mask & ~data2_mask
                    ref_coded = encode(data_coded[ref_mask], format=encoding)
                    (a, b, ab) = delta_compressed_length(a_coded, b_coded, ref_coded, compressor)

                # normalised compression distance:
                # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
                dist[idx1][idx2] = (ab - min(a, b)) / float(max(a, b))
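                # e.g. (illustrative sizes) a = 1000, b = 1200, ab = 1500 bytes
                # gives NCD = (1500 - 1000) / 1200 ≈ 0.42; near-identical clips
                # tend towards 0, unrelated clips towards 1.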

                # the above normalised compression distance is symmetric,
                # as required by the MDS scaling in per_file
                dist[idx2][idx1] = dist[idx1][idx2]

    return dist

def encode(data, format='string'):

    # Encoding
    if format == 'binary':
        data_coded = data.tostring()
    elif format == 'string':
        data_coded = str(data)
    return data_coded

def compressed_length(a_coded, b_coded, type='zlib'):

    # Compression
    if type == 'zlib':
        # zlib is quite helpful https://docs.python.org/2/library/zlib.html#module-zlib
        a = len(zlib.compress(a_coded, 9))
        b = len(zlib.compress(b_coded, 9))
        ab = len(zlib.compress(a_coded + b_coded, 9))

    return (a, b, ab)
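# Minimal usage sketch (hypothetical inputs, zlib branch only):
#   a, b, ab = compressed_length('abcabcabc' * 50, 'abcabcxyz' * 50)
# a, b and ab are the compressed sizes in bytes of the first, second and
# concatenated strings, which is all that the NCD in similarity_by_mask needs.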

def delta_compressed_length(a_coded, b_coded, ref_coded, type='zxd'):
    # Compression
    # zbs   - use bsdiff
    # zxd   - uses xdelta3
    # zvcd  - uses open-vcdiff
    # zvcz  - uses vczip
    # zdiff - converts binary to text and uses diff to produce an ed script

    if type == 'zxd' or type == 'zbs' or type == 'zvcz' or type == 'zdiff' or type == 'zvcd':

        freference = tempfile.NamedTemporaryFile(delete=False)
        freference.write(ref_coded)
        freference.close()
        #print_status('Ref File: ' + freference.name)

        # to be optimised with bufs later
        # get length of a regarding reference
        command = '/home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/%s encode %s | /home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/length' % (type, freference.name)
        # print_status(command)
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded)
        a = int(output)

        # get length of b regarding reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=b_coded)
        b = int(output)

        # get length of a,b regarding reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded + b_coded)
        ab = int(output)


        #print_status('Compressed Output' + compressed)
        #print_status('Compressed Size' + str(len(compressed)))
        os.remove(freference.name)
    return (a, b, ab)

# build a normalised histogram of the relative chord codes of one recording,
# over the global chord_keys vocabulary
def chord_histogram(chordstr=[]):
    global chord_keys
    # build histogram

    histo = dict.fromkeys(chord_keys, 0)
    for chord in chordstr:
        histo[chord] = histo.get(chord, 0) + 1
    #print_status(str(histo.keys()))

    counts = np.array(histo.values(), float)
    if max(counts) > 0:
        counts = counts / max(counts)
    return (counts)
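# Usage sketch (illustrative codes): chord_histogram(['0101', '0101', '0503'])
# yields a 132-dimensional vector with 1.0 in the '0101' bin, 0.5 in the
# '0503' bin and 0.0 elsewhere (counts are normalised by the maximum count).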