# -*- coding: utf-8 -*-
# Part of DML (Digital Music Laboratory)
# Copyright 2014-2015 Daniel Wolff, City University

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

__author__ = 'wolffd'

# This script derives all pairwise similarity measures for the chroma vectors
# provided. As a first experiment, only the mean chroma vectors per piece are
# compared, using euclidean distance.


# parameters to be forwarded to the API

# similarity type:
# euclidean, compression
# simtype = 'euclidean'

# parallelisation
num_cores = 10


# min_clusters = 40   # unused
# max_clusters = 256  # unused

# set_clusters = 40
# max_clips = 50
encoding = 'binary'
# compressor = 'zxd'
mds_init_tries = 4
mds_max_iter = 100
mfccbins = 12

# Resample chroma / timbre values by this factor for the compression distance
# (0 switches resampling off): we want roughly one vector per second.
# The standard sample rate and window size are 44100 / 1024 for chroma/timbre,
# so the stride is about 43 frames. Whether resampling is applied at all is
# controlled by the sim_downsample option.
resample_factor = 44100 // 1024


from rdflib import RDF, RDFS
from csvutils import *
from aggregate import *
from n3Parser import get_rdf_graph_from_n3

# numpy, scipy
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from scipy.signal import resample

# scikit-learn
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import manifold

# chord processing
from chord_seq_key_relative import chords_from_csv, keys_from_csv, \
    chord2function, fun2txt, fun2num, most_frequent_key, chord_roots, type_labels

# subprocess, command line and threading
import os, tempfile
import subprocess, threading

# for system / compression calls
import zlib


def chroma_from_csv(filename):
    # we assume CSV: time, chroma 1, ..., chroma 12
    # return (time, [chroma 1-12])
    return csv_map_rows(filename, 13,
                        lambda row: (float(row[0]), np.array(row[1:13], dtype=float)))
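
def _demo_row_parsing():
    # Illustration only (not called by the pipeline): the row lambda above
    # maps one 13-column chroma CSV row of strings to a (time, vector) tuple.
    row = ['0.743'] + ['0.1'] * 12
    time, vec = float(row[0]), np.array(row[1:13], dtype=float)
    assert vec.shape == (12,)
    return (time, vec)
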

def mfcc_from_csv(filename):
    # we assume CSV: time, mfcc 1, ..., mfcc 20
    # return (time, [mfcc 1-20])
    return csv_map_rows(filename, 21,
                        lambda row: (float(row[0]), np.array(row[1:21], dtype=float)))


chroma_parser_table = {'csv': chroma_from_csv}
mfcc_parser_table = {'csv': mfcc_from_csv}

# in chord_seq_relative
key_parser_table = {'csv': keys_from_csv}
chord_parser_table = {'csv': chords_from_csv}

# generate global list of chord keys: 12 roots x 11 types = 132 entries
chord_keys = []
for chordnum in range(1, 12 + 1):
    for typenum in range(1, 11 + 1):
        chord_keys.append("%02d%02d" % (chordnum, typenum))


def per_file(inputs, opts={}):
    chromas = []
    chromas_idx = []
    mfccs = []
    mfccs_idx = []
    chords = []
    chords_idx = []
    uris = []

    # get options from API
    # print_status(str(opts))
    simtype = opts['sim_type']
    set_clusters = opts['sim_clusters']    # default 40
    downsample = opts['sim_downsample']    # default 1
    limit = opts['sim_reclimit']           # default 50
    compressor = opts['sim_compressor']    # default 'zlib'

    # parse feature list (default: chroma); note the substring test, so the
    # default 'chroma' also selects the chromagram feature
    features = opts['sim_features'].split(',')
    use_chromagram = any(ext in 'chromagram' for ext in features)
    use_mfcc = any(ext in 'mfcc' for ext in features)
    use_chords = any(ext in 'chords' for ext in features)

    # check number of inputs
    if len(inputs) > limit:
        # return {'error': ''}
        print_status('Similarity: Too many inputs, truncating collection')
        inputs = inputs[0:limit]

    # Accumulation for the euclidean distance simply takes the mean feature
    # values over each whole clip; see the layout note just below.
    # todo: add std and other statistics?
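    # Per-clip feature layout when all three features are enabled (an
    # illustrative note derived from the parsers and chord_keys above):
    # 12 mean chroma bins + 20 mean MFCC bins + a 132-bin chord histogram,
    # later concatenated into one row per clip for pairwise_distances.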
    def accum_euclidean(item):
        # accumulate chroma vectors for this piece
        if use_chromagram:
            chroma = [res[1] for res in decode_tagged(chroma_parser_table, item['chromagram'])]
            # print_status('Chroma Raw Data' + str(chroma))
            # get mean chroma vector
            chroma_mean = np.mean(np.array(chroma), axis=0)
            # print_status('Chroma Means' + str(chroma_mean))

            # add vector to chromas table
            chromas.append(chroma_mean)

        if use_mfcc:
            mfcc = [res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc'])]
            mfcc_mean = np.mean(np.array(mfcc), axis=0)
            mfccs.append(mfcc_mean)

        if use_chords:
            # get duration and normalised frequency for all tuning pitches (A3,A4,A5)
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                # txt = fun2txt(fun, typ, bfun, mode)
                # print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # add to chords of this clip
                    # relchords.append((time, key, mode, fun, typ, bfun))

                    # keep the root function of the chord and the chord type,
                    # ignore mode and bass note
                    # format of num: [1x mode, 2x function, 2x type, 2x bass note]
                    relchords.append(str(num)[1:5])

            # append histogram of all chords for this recording
            hist = chord_histogram(relchords)
            chords.append(hist)

        # add uri if everything went well
        uris.append(item['list'])
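    # Worked example of the str(num)[1:5] slice used in both accumulators,
    # assuming the digit layout documented above,
    # [1x mode, 2x function, 2x type, 2x bass note]:
    # num = 1050301 -> str(num)[1:5] == '0503', i.e. function 05 with chord
    # type 03; the leading mode digit and the trailing bass digits are dropped.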
    # Accumulation for the compression distance: save all (possibly
    # downsampled) feature vectors per clip; a codebook for quantisation is
    # built later over the whole collection.
    def accum_compression(item):

        # get chromas
        if use_chromagram:
            # accumulate chroma vectors for this piece
            chroma = [res[1] for res in decode_tagged(chroma_parser_table, item['chromagram'])]
            # print_status('Chroma Raw Data' + str(chroma))

            # downsample if requested: keep every resample_factor-th frame,
            # i.e. roughly one vector per second
            if downsample == 1:
                # chroma = resample(chroma, len(chroma)//resample_factor, axis=0, window=None)
                # chroma = [chroma[i] for i in np.random.randint(0,len(chroma),len(chroma)//resample_factor)]
                chroma = [chroma[i * resample_factor] for i in range(0, len(chroma) // resample_factor)]

            chromas.extend(chroma)
            chromas_idx.append(len(chromas))

        if use_mfcc:
            mfcc = [res[1] for res in decode_tagged(mfcc_parser_table, item['mfcc'])]

            if downsample == 1:
                # mfcc = [mfcc[i] for i in np.random.randint(0,len(mfcc),len(mfcc)//resample_factor)]
                mfcc = [mfcc[i * resample_factor] for i in range(0, len(mfcc) // resample_factor)]
            mfccs.extend(mfcc)
            mfccs_idx.append(len(mfccs))

        if use_chords:
            # get duration and normalised frequency for all tuning pitches (A3,A4,A5)
            keys = decode_tagged(key_parser_table, item['keys'])
            # get most frequent key
            key, mode = most_frequent_key(keys)
            relchords = []
            for (time, chord) in decode_tagged(chord_parser_table, item['chords']):

                # get chord function
                (root, fun, typ, bfun) = chord2function(chord, key, mode)

                # translate into text
                # txt = fun2txt(fun, typ, bfun, mode)
                # print_status('Chord: ' + chord + ', function: ' + txt)
                num = fun2num(fun, typ, bfun, mode)
                if num > 0:
                    # keep the root function of the chord and the chord type,
                    # ignore mode and bass note
                    # format of num: [1x mode, 2x function, 2x type, 2x bass note]
                    relchords.append(int(str(num)[1:5]))

            # append the raw chord sequence (no histogram) for this recording
            # hist = chord_histogram(relchords)
            chords.extend(relchords)
            chords_idx.append(len(chords))

        # add uri if everything went well
        uris.append(item['list'])


    # ---
    # euclidean distance
    # ---
    if (simtype == 'euclidean'):
        # accumulate over all inputs
        st = for_each(inputs, accum_euclidean)

        # concatenate the data of all enabled features into one row per clip
        arr = np.empty((len(uris), 0), float)
        if use_chromagram:
            arr = np.append(arr, np.array(chromas), axis=1)

        if use_mfcc:
            arr = np.append(arr, np.array(mfccs), axis=1)

        if use_chords:
            # get chord histograms
            # print(str(np.array(chords).shape))
            arr = np.append(arr, np.array(chords), axis=1)

        # dist = distance.pdist(chromas, 'euclidean')
        dist = pairwise_distances(arr, metric='euclidean', n_jobs=num_cores)

        # We keep the non-condensed (square) matrix for simplicity; for data
        # efficiency it can be converted to the condensed form, and back,
        # using the very same function:
        # dist = distance.squareform(dist)
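    # Note (illustrative): scipy.spatial.distance.squareform round-trips
    # between the n x n square matrix and the condensed vector of its
    # n*(n-1)/2 upper-triangle entries, which is why applying the same
    # function twice restores the original representation.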
    # ---
    # normalised compression distance
    # ---
    elif (simtype == 'compression'):
        # accumulate over all inputs
        print_status('Similarity Module: Accumulating')
        st = for_each(inputs, accum_compression)

        dist = np.zeros((len(uris), len(uris)))
        count = 0
        if use_chromagram:
            print_status('Similarity Module: Chroma Quantisation')
            chromas_coded = vector_quantisation(np.array(chromas), set_clusters, num_cores)
            print_status('Similarity Module: Chroma Compression Results')
            dist += similarity_by_mask(chromas_coded, chromas_idx, compressor, encoding)
            count += 1

        if use_mfcc:
            print_status('Similarity Module: MFCC Quantisation')
            mfccs_coded = vector_quantisation(np.array(mfccs), set_clusters, num_cores)
            print_status('Similarity Module: MFCC Compression Results')
            dist += similarity_by_mask(mfccs_coded, mfccs_idx, compressor, encoding)
            count += 1

        if use_chords:
            print_status('Similarity Module: Chord Compression Results')
            dist += similarity_by_mask(np.array(chords), chords_idx, compressor, encoding)
            count += 1

        # average the distances of the enabled features
        dist = dist / count

        # get rid of zeros in between
        # for idx1 in range(0, len(chromas_idx)):
        #     dist[idx1][idx1] = 1

    print_status('dist' + str(dist))

    # do MDS scaling with the precomputed distance matrix
    mds = manifold.MDS(n_components=2, max_iter=mds_max_iter, n_init=mds_init_tries,
                       dissimilarity='precomputed')

    coordinates = mds.fit_transform(dist)

    return {'result': {'list': uris, 'mds': coordinates.tolist()}, 'stats': st}
    # return {'result': {'list': uris, 'distance': dist.tolist(), 'mds': coordinates.tolist()},
    #         'stats': st}


def vector_quantisation(data, set_clusters, num_cores):
    # Build a codebook: quantise the feature vectors with scikit-learn
    # k-means, see http://scikit-learn.org/stable/modules/clustering.html
    clusterer = KMeans(n_clusters=set_clusters, n_jobs=num_cores)

    # each input frame is replaced by the index of its nearest centroid;
    # these symbol sequences are what gets compressed below
    data_coded = clusterer.fit_predict(data)
    # print_status('Chromas Coded' + str(chromas_coded))
    # print_status('Coding Histogram' + str(np.histogram(chromas_coded)))
    return data_coded
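
def _demo_vector_quantisation():
    # Illustration only (not called by the pipeline): quantise toy 12-dim
    # "chroma" frames into a 5-symbol codebook, exactly as
    # vector_quantisation() does for the real features; make_blobs is
    # already imported above.
    data, _ = make_blobs(n_samples=100, n_features=12, centers=5, random_state=0)
    codes = vector_quantisation(data, 5, 1)
    # codes now holds one integer codeword (0-4) per input frame
    return codes
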

def similarity_by_mask(data_coded, data_idx, compressor, encoding):

    # data_idx holds, per clip, the index one past the clip's last frame in
    # data_coded (it is appended after each extend), so it serves directly as
    # the exclusive end of each slice below
    start_idx = [0] + data_idx[:-1]
    dist = np.zeros((len(data_idx), len(data_idx)))
    for idx1 in range(0, len(data_idx)):
        for idx2 in range(0, len(data_idx)):
            if (idx2 < idx1):
                # select the encoded frames of the two clips
                data1_mask = np.zeros(len(data_coded), dtype=bool)
                data1_mask[start_idx[idx1]:data_idx[idx1]] = True

                data2_mask = np.zeros(len(data_coded), dtype=bool)
                data2_mask[start_idx[idx2]:data_idx[idx2]] = True

                a_coded = encode(data_coded[data1_mask], format=encoding)
                b_coded = encode(data_coded[data2_mask], format=encoding)

                # get compression lengths
                if compressor == 'zlib':
                    (a, b, ab) = compressed_length(a_coded, b_coded, compressor)
                else:
                    # use all remaining clips as the reference (dictionary)
                    # for the delta compressors
                    ref_mask = ~data1_mask & ~data2_mask
                    ref_coded = encode(data_coded[ref_mask], format=encoding)
                    (a, b, ab) = delta_compressed_length(a_coded, b_coded, ref_coded, compressor)

                # normalised compression distance:
                # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
                dist[idx1][idx2] = (ab - min(a, b)) / float(max(a, b))

                # the normalised compression distance is symmetric, as
                # required by the MDS routine above
                dist[idx2][idx1] = dist[idx1][idx2]

    return dist


def encode(data, format='string'):

    # encoding
    if format == 'binary':
        data_coded = data.tostring()
    elif format == 'string':
        data_coded = str(data)
    return data_coded


def compressed_length(a_coded, b_coded, type='zlib'):

    # compression
    if type == 'zlib':
        # zlib is quite helpful, https://docs.python.org/2/library/zlib.html#module-zlib
        a = len(zlib.compress(a_coded, 9))
        b = len(zlib.compress(b_coded, 9))
        ab = len(zlib.compress(a_coded + b_coded, 9))

    return (a, b, ab)
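
def _demo_ncd():
    # Illustration only (not called by the pipeline): the NCD formula used in
    # similarity_by_mask(), evaluated via the zlib path of compressed_length().
    # Two identical strings compress together almost as well as alone, so
    # their distance is close to 0; unrelated strings approach 1.
    x = 'abcabcabc' * 50
    a, b, ab = compressed_length(x, x, 'zlib')
    return (ab - min(a, b)) / float(max(a, b))
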

def delta_compressed_length(a_coded, b_coded, ref_coded, type='zxd'):
    # delta compression against a reference:
    # zbs   - uses bsdiff
    # zxd   - uses xdelta3
    # zvcd  - uses open-vcdiff
    # zvcz  - uses vczip
    # zdiff - converts binary to text and uses diff to produce an ed script

    if type == 'zxd' or type == 'zbs' or type == 'zvcz' or type == 'zdiff' or type == 'zvcd':

        freference = tempfile.NamedTemporaryFile(delete=False)
        freference.write(ref_coded)
        freference.close()
        # print_status('Ref File: ' + freference.name)

        # to be optimised with bufs later
        # get length of a with respect to the reference
        command = '/home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/%s encode %s | /home/dml/src/hg/dml-cliopatria/cpack/dml/scripts/compression/length' % (type, freference.name)
        # print_status(command)
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded)
        a = int(output)

        # get length of b with respect to the reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=b_coded)
        b = int(output)

        # get length of the concatenation a+b with respect to the reference
        p1 = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        output, err = p1.communicate(input=a_coded + b_coded)
        ab = int(output)

        # print_status('Compressed Size' + str(ab))
        os.remove(freference.name)
    return (a, b, ab)


# Histogram of the relative chord codes of one recording over the global
# chord_keys vocabulary; returns the counts normalised by their maximum.
def chord_histogram(chordstr=[]):
    global chord_keys

    # build histogram
    histo = dict.fromkeys(chord_keys, 0)
    for chord in chordstr:
        histo[chord] = histo.get(chord, 0) + 1
    # print_status(str(histo.keys()))

    counts = np.array(histo.values(), float)
    if max(counts) > 0:
        counts = counts / max(counts)
    return counts
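
def _demo_chord_histogram():
    # Illustration only (not called by the pipeline): three chord codes, two
    # of them identical, give a 132-bin histogram (12 roots x 11 types)
    # normalised by its maximum, so the '0101' bin is 1.0 and '0503' is 0.5.
    return chord_histogram(['0101', '0101', '0503'])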