comparison dml-cla/python/similarity.py @ 0:718306e29690 tip

commiting public release
author Daniel Wolff
date Tue, 09 Feb 2016 21:05:06 +0100
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

# -*- coding: utf-8 -*-
__author__ = 'wolffd'

# This script derives pairwise similarity measures for the feature vectors provided.
# In the simplest setting, only the mean chroma vectors per piece are compared
# using euclidean distance; MFCC and chord features as well as a compression-based
# distance are also supported (see per_file below).

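# Overview (summarising the code below): per_file() accumulates one feature
# representation per recording (mean chroma / MFCC vectors and chord-function
# histograms for the euclidean path, or quantised frame sequences for the
# compression path), computes a pairwise distance matrix, embeds it in 2-D
# with MDS and returns { 'result': { 'list': uris, 'mds': coordinates }, 'stats': ... }.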

# parameters to be forwarded to API

# similarity type:
# euclidean, compression
# simtype = 'euclidean'

# parallelisation
num_cores = 10


#min_clusters = 40   ## unused
#max_clusters = 256  ## unused

#set_clusters = 40
#max_clips = 50
encoding = 'binary'
#compressor = 'zxd'
mds_init_tries = 4
mds_max_iter = 100
mfccbins = 12

# Resample chroma / timbre values at this factor for the compression distance
# (controlled by the sim_downsample parameter; 0 switches it off).
# The standard sample rate and window size are 44100 Hz and 1024 samples for
# chroma / timbre, i.e. roughly 43 frames per second, so keeping every
# resample_factor-th frame leaves about one vector per second.
resample_factor = 44100 // 1024


from rdflib import RDF, RDFS
from csvutils import *
from aggregate import *
from n3Parser import get_rdf_graph_from_n3

# numpy, scipy
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from scipy.signal import resample

# scikit-learn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import manifold


# chord processing
from chord_seq_key_relative import chords_from_csv, keys_from_csv, chord2function, fun2txt, fun2num, most_frequent_key, chord_roots, type_labels

# subprocess, command line and threading
import os, tempfile
import subprocess, threading

# for system / compression calls
import zlib


def chroma_from_csv(filename):
    # we assume CSV: time, chroma 1, ..., chroma 12
    # returns (time, [chroma 1-12])
    return csv_map_rows(filename, 13, lambda row: (float(row[0]), np.array(row[1:13], dtype=float)))

def mfcc_from_csv(filename):
    # we assume CSV: time, mfcc 1, ..., mfcc 20
    # returns (time, [mfcc 1-20])
    return csv_map_rows(filename, 21, lambda row: (float(row[0]), np.array(row[1:21], dtype=float)))


chroma_parser_table = { 'csv': chroma_from_csv }
mfcc_parser_table = { 'csv': mfcc_from_csv }

# in chord_seq_relative
key_parser_table = { 'csv': keys_from_csv }
chord_parser_table = { 'csv': chords_from_csv }

## generate global list of chord keys (histogram bins: 2-digit root/function + 2-digit type)
chord_keys = []
for chordnum in range(1, 12+1):
    for typenum in range(1, 11+1):
        chord_keys.append("%02d%02d" % (chordnum, typenum))
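# e.g. root/function 3 with chord type 5 gives the key '0305'; these 4-character
# keys correspond to digits 2-5 of fun2num's output (2 digits function + 2 digits
# type), which per_file stores via str(num)[1:5] (see the format comment there).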

def per_file(inputs, opts={}):
    chromas = []
    chromas_idx = []
    mfccs = []
    mfccs_idx = []
    chords = []
    chords_idx = []
    uris = []

    # get options from API
    # print_status(str(opts))
    simtype = opts['sim_type']
    set_clusters = opts['sim_clusters']    # default 40
    downsample = opts['sim_downsample']    # default 1
    limit = opts['sim_reclimit']           # default 50
    compressor = opts['sim_compressor']    # default 'zlib'

    # parse feature list (default: chroma)
    # note: the substring test below also matches shortened names such as 'chroma'
    features = opts['sim_features'].split(',')
    use_chromagram = any(ext in 'chromagram' for ext in features)
    use_mfcc = any(ext in 'mfcc' for ext in features)
    use_chords = any(ext in 'chords' for ext in features)

    # check number of inputs
    if len(inputs) > limit:
        #return { 'error': ''}
        print_status('Similarity: Too many inputs, truncating collection')
        inputs = inputs[0:limit]

    # accumulation for euclidean just gets the mean values over the whole clips
    # todo: add std and other statistics?
    def accum_euclidean(item):
        # accumulate chroma vectors for this piece
        if use_chromagram:
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram']) ]
            # print_status('Chroma Raw Data' + str(chroma))
            # get mean chroma vector
            chroma_mean = np.mean(np.array(chroma), axis=0)
            #print_status('Chroma Means' + str(chroma_mean))

            # add vector to chromas table
            chromas.append(chroma_mean)

        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc']) ]
            mfcc_mean = np.mean(np.array(mfcc), axis=0)
            mfccs.append(mfcc_mean)

        if use_chords:
            # get the key annotations for this piece
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get the chord function relative to the key
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time, key, mode, fun, typ, bfun))

                    # keep the chord function (root relative to key) and chord type,
                    # ignore mode and base note
                    # format of num: [1x mode, 2x function, 2x type, 2x base note]
                    relchords.append(str(num)[1:5])

            # append histogram of all chords for this recording
            hist = chord_histogram(relchords)
            chords.append(hist)


        # add uri if everything went well
        uris.append(item['list'])

    # accumulation for compression:
    # save all (down-sampled) feature frames per piece and remember the index
    # at which each piece ends; the frames are later quantised to a codebook
    # and compared via compression distance
    def accum_compression(item):

        # get chromas
        if use_chromagram:
            # accumulate chroma vectors for this piece
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram']) ]
            # print_status('Chroma Raw Data' + str(chroma))

            # downsample if necessary
            if downsample == 1:
                #chroma = resample(chroma, len(chroma)//resample_factor, axis=0, window=None)
                #chroma = [chroma[i] for i in np.random.randint(0, len(chroma), len(chroma)//resample_factor)]
                chroma = [chroma[i*resample_factor] for i in range(0, len(chroma)//resample_factor)]

            chromas.extend(chroma)
            chromas_idx.append(len(chromas))


        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc']) ]

            if downsample == 1:
                # mfcc = [mfcc[i] for i in np.random.randint(0, len(mfcc), len(mfcc)//resample_factor)]
                mfcc = [mfcc[i*resample_factor] for i in range(0, len(mfcc)//resample_factor)]
            mfccs.extend(mfcc)
            mfccs_idx.append(len(mfccs))

        if use_chords:
            # get the key annotations for this piece
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get the chord function relative to the key
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time, key, mode, fun, typ, bfun))

                    # keep the chord function (root relative to key) and chord type,
                    # ignore mode and base note
                    # format of num: [1x mode, 2x function, 2x type, 2x base note]
                    relchords.append(int(str(num)[1:5]))

            # append the raw chord sequence (not a histogram) for this recording
            #hist = chord_histogram(relchords)
            chords.extend(relchords)
            chords_idx.append(len(chords))

        # add uri if everything went well
        uris.append(item['list'])


    # ---
    # this is the euclidean distance
    # ---
    if (simtype == 'euclidean'):
        # accumulate over all inputs
        st = for_each(inputs, accum_euclidean)

        # concatenate feature input for all features
        arr = np.empty((len(uris), 0), float)
        # concatenate data to nparray for euclidean distance
        if use_chromagram:
            arr = np.append(arr, np.array(chromas), axis=1)

        if use_mfcc:
            arr = np.append(arr, np.array(mfccs), axis=1)

        if use_chords:
            # get chord histograms
            #print(str(np.array(chords).shape))
            arr = np.append(arr, np.array(chords), axis=1)

        #dist = distance.pdist(chromas, 'euclidean')
        dist = pairwise_distances(arr, metric='euclidean', n_jobs=num_cores)

        # pairwise_distances already returns the full (non-condensed) matrix;
        # it could be condensed with distance.squareform for data efficiency
        #dist = distance.squareform(dist)
    # ---
    # this is the normalised compression distance
    # ---
    elif (simtype == 'compression'):
        # accumulate over all inputs
        print_status('Similarity Module: Accumulating')
        st = for_each(inputs, accum_compression)

        dist = np.zeros((len(uris), len(uris)))
        count = 0
        if use_chromagram:
            print_status('Similarity Module: Chroma Quantisation')
            chromas_coded = vector_quantisation(np.array(chromas), set_clusters, num_cores)
            print_status('Similarity Module: Chroma Compression Results')
            dist += similarity_by_mask(chromas_coded, chromas_idx, compressor, encoding)
            count += 1

        if use_mfcc:
            print_status('Similarity Module: MFCC Quantisation')
            mfccs_coded = vector_quantisation(np.array(mfccs), set_clusters, num_cores)
            print_status('Similarity Module: MFCC Compression Results')
            dist += similarity_by_mask(mfccs_coded, mfccs_idx, compressor, encoding)
            count += 1

        if use_chords:
            print_status('Similarity Module: Chord Compression Results')
            dist += similarity_by_mask(np.array(chords), chords_idx, compressor, encoding)
            count += 1

        # average the distances over the features used
        dist = dist / count


    # get rid of zeros in between
    #for idx1 in range(0, len(chromas_idx)):
    #    dist[idx1][idx1] = 1

    print_status('dist' + str(dist))

    # Do MDS scaling with precomputed distance
    mds = manifold.MDS(n_components=2, max_iter=mds_max_iter, n_init=mds_init_tries, dissimilarity='precomputed')

    coordinates = mds.fit_transform(dist)

    return { 'result': { 'list': uris, 'mds': coordinates.tolist() }, 'stats': st }
    # return { 'result': { 'list': uris, 'distance': dist.tolist(), 'mds': coordinates.tolist()},
    #          'stats': st }



def vector_quantisation(data, set_clusters, num_cores):
    # ---
    # build codebook
    # ---
    # --- 1 quantise the feature data
    # --- 1a use scikit-learn k-means
    # http://scikit-learn.org/stable/modules/clustering.html

    # quick try
    clusterer = KMeans(n_clusters=set_clusters, n_jobs=num_cores)

    # --- 2 assign each frame to its nearest codeword
    data_coded = clusterer.fit_predict(data)
    #print_status('Data Coded' + str(data_coded))
    # print_status('Coding Histogram' + str(np.histogram(data_coded)))
    return data_coded
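
# Illustrative sketch (not part of the original pipeline): quantising 100
# random 12-bin "chroma" frames into 8 codewords; each frame is replaced by
# the integer index of its nearest k-means centroid.
#
#   demo_frames = np.random.rand(100, 12)
#   demo_codes = vector_quantisation(demo_frames, set_clusters=8, num_cores=1)
#   # demo_codes.shape == (100,), values in range(8)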



def similarity_by_mask(data_coded, data_idx, compressor, encoding):

    # data_idx[i] is the index one past the last frame of piece i,
    # so piece i covers data_coded[start_idx[i]:data_idx[i]]
    # TODO: check indexing (starts at 0 or 1?)
    lengths = []
    start_idx = [0] + data_idx[:-1]
    dist = np.zeros((len(data_idx), len(data_idx)))
    for idx1 in range(0, len(data_idx)):
        for idx2 in range(0, len(data_idx)):
            if (idx2 < idx1):
                # select the encoded frames of the two clips
                data1_mask = np.zeros(len(data_coded), dtype=bool)
                data1_mask[start_idx[idx1]:data_idx[idx1]] = True

                data2_mask = np.zeros(len(data_coded), dtype=bool)
                data2_mask[start_idx[idx2]:data_idx[idx2]] = True

                a_coded = encode(data_coded[data1_mask], format=encoding)
                b_coded = encode(data_coded[data2_mask], format=encoding)
                # get compression lengths
                if compressor == 'zlib':
                    (a, b, ab) = compressed_length(a_coded, b_coded, compressor)

                else:
                    # use the complement set as reference for delta compression
                    ref_mask = ~data1_mask & ~data2_mask
                    ref_coded = encode(data_coded[ref_mask], format=encoding)
                    (a, b, ab) = delta_compressed_length(a_coded, b_coded, ref_coded, compressor)

                # normalised compression distance:
                # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
                dist[idx1][idx2] = (ab - min(a, b)) / float(max(a, b))

                # the normalised compression distance is symmetric,
                # as required by the MDS routine in per_file
                dist[idx2][idx1] = dist[idx1][idx2]

    return dist

def encode(data, format='string'):

    # Encoding
    if format == 'binary':
        data_coded = data.tostring()
    elif format == 'string':
        data_coded = str(data)
    return data_coded

def compressed_length(a_coded, b_coded, type='zlib'):

    # Compression
    if type == 'zlib':
        # zlib is quite helpful https://docs.python.org/2/library/zlib.html#module-zlib
        a = len(zlib.compress(a_coded, 9))
        b = len(zlib.compress(b_coded, 9))
        ab = len(zlib.compress(a_coded + b_coded, 9))

    return (a, b, ab)
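
# Minimal zlib-based NCD sketch (illustrative only; values depend on the
# compressor): for two byte strings x and y,
#   NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
# where C(.) is the compressed length, as returned by compressed_length above.
#
#   x = b'aaaabbbb' * 100
#   y = b'aaaacccc' * 100
#   cx, cy, cxy = compressed_length(x, y)
#   ncd = (cxy - min(cx, cy)) / float(max(cx, cy))   # small for similar strings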

def delta_compressed_length(a_coded, b_coded, ref_coded, type='zxd'):
    # Compression against a reference file, using an external delta compressor:
    # zbs   - uses bsdiff
    # zxd   - uses xdelta3
    # zvcd  - uses open-vcdiff
    # zvcz  - uses vczip
    # zdiff - converts binary to text and uses diff to produce an ed script

    if type in ('zxd', 'zbs', 'zvcz', 'zdiff', 'zvcd'):

        freference = tempfile.NamedTemporaryFile(delete=False)
        freference.write(ref_coded)
        freference.close()
        #print_status('Ref File: ' + freference.name)

        # to be optimised with bufs later
        # get length of a regarding reference
        command = '/home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/%s encode %s | /home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/length' % (type, freference.name)
        # print_status(command)
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded)
        a = int(output)

        # get length of b regarding reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=b_coded)
        b = int(output)

        # get length of a,b regarding reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded + b_coded)
        ab = int(output)


        #print_status('Compressed Output' + compressed)
        #print_status('Compressed Size' + str(len(compressed)))
        os.remove(freference.name)
    return (a, b, ab)

# build a histogram of chord function/type codes over the fixed chord_keys
# vocabulary and return the counts, normalised by the largest bin
def chord_histogram(chordstr=[]):
    global chord_keys
    # build histogram

    histo = dict.fromkeys(chord_keys, 0)
    for chord in chordstr:
        histo[chord] = histo.get(chord, 0) + 1
    #print_status(str(histo.keys()))

    counts = np.array(histo.values(), float)
    if max(counts) > 0:
        counts = counts / max(counts)
    return (counts)

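# Hedged usage sketch (assumptions: decode_tagged / for_each / print_status come
# from the csvutils / aggregate star imports, each input item provides
# 'chromagram', 'mfcc', 'keys', 'chords' and 'list' entries as used above, and
# the exact tagging of input files as well as the file names below are
# hypothetical):
#
# if __name__ == '__main__':
#     opts = {'sim_type': 'euclidean', 'sim_clusters': 40, 'sim_downsample': 1,
#             'sim_reclimit': 50, 'sim_compressor': 'zlib',
#             'sim_features': 'chromagram'}
#     inputs = [{'chromagram': ('csv', 'clip1.chroma.csv'), 'list': 'clip1'},
#               {'chromagram': ('csv', 'clip2.chroma.csv'), 'list': 'clip2'}]
#     print(per_file(inputs, opts))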