# -*- coding: utf-8 -*-
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = 'wolffd'

# This script derives all pairwise similarity measures for the chroma vectors
# provided. As a first experiment, only the mean chroma vectors per piece are
# compared, using Euclidean distance.
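
# Illustrative sketch (not part of the pipeline below): each piece is reduced
# to its mean chroma frame, and two pieces are compared with plain Euclidean
# distance, e.g.
#   mean_a = np.mean(frames_a, axis=0)    # 12-dim mean chroma of piece a
#   mean_b = np.mean(frames_b, axis=0)
#   d = np.linalg.norm(mean_a - mean_b)   # Euclidean distance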

# parameters to be forwarded to the API

# similarity type:
# euclidean, compression
# simtype = 'euclidean'

# parallelisation
num_cores = 10

#min_clusters = 40   ## unused
#max_clusters = 256  ## unused

#set_clusters = 40
#max_clips = 50
encoding = 'binary'
#compressor = 'zxd'
mds_init_tries = 4
mds_max_iter = 100
mfccbins = 12

# Resample chroma / timbre values at this fraction for the compression
# distance; 0 switches resampling off. We want roughly one vector per second.
# The standard sample rate and window size are 44100 / 1024 for chroma/timbre.
# Whether resampling is applied depends on the sim_downsample option.
resample_factor = 44100 // 1024  # integer division: used as a frame index step
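# Worked example: 44100 Hz with a 1024-sample hop gives 44100/1024, i.e. about
# 43 frames per second, so keeping every resample_factor-th frame leaves
# roughly one frame per second of audio.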


from rdflib import RDF, RDFS
from csvutils import *
from aggregate import *
from n3Parser import get_rdf_graph_from_n3

# numpy, scipy
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from scipy.signal import resample

# scikit-learn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import manifold


# chord processing
from chord_seq_key_relative import chords_from_csv, keys_from_csv, chord2function, fun2txt, fun2num, most_frequent_key, chord_roots, type_labels

# subprocess, command line and threading
import os, tempfile
import subprocess, threading

# for system / compression calls
import zlib


def chroma_from_csv(filename):
    # we assume CSV: time, chroma 1, ..., chroma 12
    # return (time, [chroma 1-12]); row[1:13] covers all 12 chroma bins
    return csv_map_rows(filename, 13, lambda row: (float(row[0]), np.array(row[1:13], dtype=float)))

def mfcc_from_csv(filename):
    # we assume CSV: time, mfcc 1, ..., mfcc 20
    # return (time, [mfcc 1-20]); row[1:21] covers all 20 coefficients
    return csv_map_rows(filename, 21, lambda row: (float(row[0]), np.array(row[1:21], dtype=float)))


chroma_parser_table = { 'csv': chroma_from_csv }
mfcc_parser_table = { 'csv': mfcc_from_csv }

# in chord_seq_relative
key_parser_table = { 'csv': keys_from_csv }
chord_parser_table = { 'csv': chords_from_csv }

## generate global list of chord keys (root x type codes)
chord_keys = []
for chordnum in range(1, 12+1):
    for typenum in range(1, 11+1):
        chord_keys.append("%02d%02d" % (chordnum, typenum))
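# The resulting vocabulary looks like
#   ['0101', '0102', ..., '0111', '0201', ..., '1211']
# i.e. 12 chord roots x 11 chord types = 132 four-digit keys.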

def per_file(inputs, opts={}):
    chromas = []
    chromas_idx = []
    mfccs = []
    mfccs_idx = []
    chords = []
    chords_idx = []
    uris = []

    # get options from API
    # print_status(str(opts))
    simtype = opts['sim_type']
    set_clusters = opts['sim_clusters']   # default 40
    downsample = opts['sim_downsample']   # default 1
    limit = opts['sim_reclimit']          # default 50
    compressor = opts['sim_compressor']   # default 'zlib'
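
    # Illustrative opts dict, combining the defaults documented above with the
    # sim_features key read just below (hypothetical values, not API output):
    #   opts = { 'sim_type': 'euclidean', 'sim_clusters': 40,
    #            'sim_downsample': 1, 'sim_reclimit': 50,
    #            'sim_compressor': 'zlib', 'sim_features': 'chromagram' }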

    # parse feature list
    features = opts['sim_features'].split(',')  # features, default: chroma
    use_chromagram = 'chromagram' in features
    use_mfcc = 'mfcc' in features
    use_chords = 'chords' in features

    # check number of inputs
    if len(inputs) > limit:
        #return { 'error': ''}
        print_status('Similarity: Too many inputs, truncating collection')
        inputs = inputs[0:limit]


    # accumulation for euclidean just takes the mean values over whole clips
    # todo: add std and other statistics?
    def accum_euclidean(item):
        # accumulate chroma vectors for this piece
        if use_chromagram:
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram']) ]
            # print_status('Chroma Raw Data' + str(chroma))
            # get mean chroma vector
            chroma_mean = np.mean(np.array(chroma), axis=0)
            #print_status('Chroma Means' + str(chroma_mean))

            # add vector to chromas table
            chromas.append(chroma_mean)

        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc']) ]
            mfcc_mean = np.mean(np.array(mfcc), axis=0)
            mfccs.append(mfcc_mean)

        if use_chords:
            # get the key estimates for this clip
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time, key, mode, fun, typ, bfun))

                    # get the root note of the chord and the chord type,
                    # ignoring mode and bass note;
                    # format of num: [1x mode, 2x function, 2x type, 2x bass note]
                    relchords.append(str(num)[1:5])
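
                    # Illustrative (hypothetical digits): if fun2num returned
                    # num = 1051203, then str(num)[1:5] == '0512', i.e.
                    # function code 05 followed by chord-type code 12.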

            # append histogram of all chords for this recording
            hist = chord_histogram(relchords)
            chords.append(hist)


        # add uri if everything went well
        uris.append(item['list'])

    # accumulation for compression:
    # save all chroma vectors;
    # possibly build a codebook,
    # otherwise compare for quantisation

    def accum_compression(item):

        # get chromas
        if use_chromagram:
            # accumulate chroma vectors for this piece
            chroma = [ res[1] for res in decode_tagged(chroma_parser_table, item['chromagram']) ]
            # print_status('Chroma Raw Data' + str(chroma))

            # downsample if necessary
            if downsample == 1:
                #chroma = resample(chroma, len(chroma)//resample_factor, axis=0, window=None)
                #chroma = [chroma[i] for i in np.random.randint(0, len(chroma), len(chroma)//resample_factor)]
                chroma = [chroma[i*resample_factor] for i in range(0, len(chroma)//resample_factor)]

            chromas.extend(chroma)
            chromas_idx.append(len(chromas))


        if use_mfcc:
            mfcc = [ res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc']) ]

            if downsample == 1:
                #mfcc = [mfcc[i] for i in np.random.randint(0, len(mfcc), len(mfcc)//resample_factor)]
                mfcc = [mfcc[i*resample_factor] for i in range(0, len(mfcc)//resample_factor)]
            mfccs.extend(mfcc)
            mfccs_idx.append(len(mfccs))

        if use_chords:
            # get the key estimates for this clip
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                #txt = fun2txt(fun, typ, bfun, mode)
                #print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    #relchords.append((time, key, mode, fun, typ, bfun))

                    # get the root note of the chord and the chord type,
                    # ignoring mode and bass note;
                    # format of num: [1x mode, 2x function, 2x type, 2x bass note]
                    relchords.append(int(str(num)[1:5]))

            # append the raw chord code sequence for this recording
            # (the histogram is only used for the euclidean variant)
            #hist = chord_histogram(relchords)
            chords.extend(relchords)
            chords_idx.append(len(chords))

        # add uri if everything went well
        uris.append(item['list'])


    # ---
    # this is the Euclidean distance
    # ---
    if (simtype == 'euclidean'):
        # accumulate over all inputs
        st = for_each(inputs, accum_euclidean)

        # concatenate feature input for all features
        arr = np.empty((len(uris), 0), float)
        # concatenate data to nparray for euclidean distance
        if use_chromagram:
            arr = np.append(arr, np.array(chromas), axis=1)

        if use_mfcc:
            arr = np.append(arr, np.array(mfccs), axis=1)

        if use_chords:
            # get chord dictionaries
            #print(str(np.array(chords).shape))
            arr = np.append(arr, np.array(chords), axis=1)

        #dist = distance.pdist(chromas, 'euclidean')
        dist = pairwise_distances(arr, metric='euclidean', n_jobs=num_cores)

        # we keep the non-condensed (square) matrix for simplicity;
        # for space efficiency it could be condensed again with the
        # same function:
        #dist = distance.squareform(dist)

    # ---
    # this is the normalised compression distance
    # ---
    elif (simtype == 'compression'):
        # accumulate over all inputs
        print_status('Similarity Module: Accumulating')
        st = for_each(inputs, accum_compression)

        dist = np.zeros((len(uris), len(uris)))
        count = 0
        if use_chromagram:
            print_status('Similarity Module: Chroma Quantisation')
            chromas_coded = vector_quantisation(np.array(chromas), set_clusters, num_cores)
            print_status('Similarity Module: Chroma Compression Results')
            dist += similarity_by_mask(chromas_coded, chromas_idx, compressor, encoding)
            count += 1

        if use_mfcc:
            print_status('Similarity Module: MFCC Quantisation')
            mfccs_coded = vector_quantisation(np.array(mfccs), set_clusters, num_cores)
            print_status('Similarity Module: MFCC Compression Results')
            dist += similarity_by_mask(mfccs_coded, mfccs_idx, compressor, encoding)
            count += 1

        if use_chords:
            print_status('Similarity Module: Chord Compression Results')
            dist += similarity_by_mask(np.array(chords), chords_idx, compressor, encoding)
            count += 1

        # average the distances over the features used
        dist = dist / count


        # get rid of zeros in between
        #for idx1 in range(0, len(chromas_idx)):
        #    dist[idx1][idx1] = 1

    print_status('dist' + str(dist))

    # do MDS scaling with the precomputed distance
    mds = manifold.MDS(n_components=2, max_iter=mds_max_iter, n_init=mds_init_tries, dissimilarity='precomputed')

    coordinates = mds.fit_transform(dist)

    return { 'result': { 'list': uris, 'mds': coordinates.tolist() }, 'stats': st }
    # return { 'result': { 'list': uris, 'distance': dist.tolist(), 'mds': coordinates.tolist()},
    #          'stats': st }


def vector_quantisation(data, set_clusters, num_cores):
    # ---
    # build codebook
    # ---
    # --- 1 quantise the feature data
    # --- 1a use scikit-learn k-means
    # http://scikit-learn.org/stable/modules/clustering.html
    clusterer = KMeans(n_clusters=set_clusters, n_jobs=num_cores)

    # --- 2 predict a cluster index (codeword) per frame
    data_coded = clusterer.fit_predict(data)
    #print_status('Data Coded' + str(data_coded))
    # print_status('Coding Histogram' + str(np.histogram(data_coded)))
    return data_coded


def similarity_by_mask(data_coded, data_idx, compressor, encoding):

    # data_idx holds, per piece, the end offset (exclusive) of its frames in
    # data_coded, so piece k occupies data_coded[start_idx[k]:data_idx[k]]
    start_idx = [0] + data_idx[:-1]
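
    # Illustrative example: with two pieces of 3 and 2 frames,
    # data_idx == [3, 5] and start_idx == [0, 3], selecting rows
    # 0..2 for the first piece and rows 3..4 for the second.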

    dist = np.zeros((len(data_idx), len(data_idx)))
    for idx1 in range(0, len(data_idx)):
        for idx2 in range(0, len(data_idx)):
            if (idx2 < idx1):
                # select the encoded frames for both clips
                # (slice ends are exclusive, so no -1 is needed here)
                data1_mask = np.zeros(len(data_coded), dtype=bool)
                data1_mask[start_idx[idx1]:data_idx[idx1]] = True

                data2_mask = np.zeros(len(data_coded), dtype=bool)
                data2_mask[start_idx[idx2]:data_idx[idx2]] = True

                a_coded = encode(data_coded[data1_mask], format=encoding)
                b_coded = encode(data_coded[data2_mask], format=encoding)
                # get compression lengths
                if compressor == 'zlib':
                    (a, b, ab) = compressed_length(a_coded, b_coded, compressor)

                else:
                    # use the complement set as the delta-compression reference
                    ref_mask = ~data1_mask & ~data2_mask
                    ref_coded = encode(data_coded[ref_mask], format=encoding)
                    (a, b, ab) = delta_compressed_length(a_coded, b_coded, ref_coded, compressor)

                # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
                dist[idx1][idx2] = (ab - min(a, b)) / float(max(a, b))

                # the above normalised compression distance is symmetric,
                # as required by the MDS routine above
                dist[idx2][idx1] = dist[idx1][idx2]

    return dist

def encode(data, format='string'):

    # encoding
    if format == 'binary':
        data_coded = data.tostring()
    elif format == 'string':
        data_coded = str(data)
    return data_coded
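
# e.g. np.array([1, 2, 3]).tostring() yields the raw element bytes, whereas
# str(np.array([1, 2, 3])) yields the text '[1 2 3]'; 'binary' is the more
# compact choice and is what the module-level `encoding` parameter selects.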

def compressed_length(a_coded, b_coded, type='zlib'):

    # compression
    if type == 'zlib':
        # zlib is quite helpful: https://docs.python.org/2/library/zlib.html#module-zlib
        a = len(zlib.compress(a_coded, 9))
        b = len(zlib.compress(b_coded, 9))
        ab = len(zlib.compress(a_coded + b_coded, 9))

    return (a, b, ab)
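
# Worked example of the NCD computed from these lengths: if a == 100,
# b == 120 and ab == 150, then (ab - min(a, b)) / max(a, b)
# = (150 - 100) / 120 = 0.42 (to 2 d.p.); near-identical inputs push
# the value towards 0.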

def delta_compressed_length(a_coded, b_coded, ref_coded, type='zxd'):
    # compression against a reference, via an external delta compressor:
    # zbs   - uses bsdiff
    # zxd   - uses xdelta3
    # zvcd  - uses open-vcdiff
    # zvcz  - uses vczip
    # zdiff - converts binary to text and uses diff to produce an ed script

    if type == 'zxd' or type == 'zbs' or type == 'zvcz' or type == 'zdiff' or type == 'zvcd':

        freference = tempfile.NamedTemporaryFile(delete=False)
        freference.write(ref_coded)
        freference.close()
        #print_status('Ref File: ' + freference.name)

        # to be optimised with bufs later
        # get length of a with respect to the reference
        command = '/home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/%s encode %s | /home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/length' % (type, freference.name)
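        # The pipeline this builds looks like (illustrative, with type='zxd'):
        #   .../compression/zxd encode /tmp/tmpXXXX | .../compression/length
        # The coded frames are piped to stdin and the compressed length in
        # bytes is read back from stdout.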
        # print_status(command)
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded)
        a = int(output)

        # get length of b with respect to the reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=b_coded)
        b = int(output)

        # get length of a+b with respect to the reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded + b_coded)
        ab = int(output)


        #print_status('Compressed Output' + compressed)
        #print_status('Compressed Size' + str(len(compressed)))
        os.remove(freference.name)
    return (a, b, ab)

# histogram over the chord codes of one recording;
# returns the normalised count for each key in chord_keys
def chord_histogram(chordstr=[]):
    global chord_keys
    # build histogram

    histo = dict.fromkeys(chord_keys, 0)
    for chord in chordstr:
        histo[chord] = histo.get(chord, 0) + 1
    #print_status(str(histo.keys()))

    counts = np.array(histo.values(), float)
    if max(counts) > 0:
        counts = counts / max(counts)
    return counts
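

# ---------------------------------------------------------------------------
# Minimal self-test sketch (an illustration added here, not part of the DML
# pipeline): it fabricates two short chroma-like clips, quantises them with
# the k-means codebook above, and prints their normalised compression
# distance matrix using the zlib path.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    # two synthetic "clips": 100 frames of 12-dimensional chroma each
    clip_a = rng.rand(100, 12)
    clip_b = rng.rand(100, 12)
    data = np.vstack([clip_a, clip_b])
    coded = vector_quantisation(data, set_clusters=8, num_cores=1)
    # data_idx holds the exclusive end offset of each clip in `coded`
    ncd = similarity_by_mask(coded, [100, 200], 'zlib', encoding)
    print('NCD matrix: ' + str(ncd))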