hybrid-music-recommender-using-content-based-and-social-information: Code/time_freq_representation/make_lists.py

annotate Code/time_freq_representation/make_lists.py @ 47:b0186d4a4496 tip

Move 7Digital dataset to Downloads

author	Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date	Sat, 09 Jul 2022 00:50:43 -0500
parents	68a62ca32441
children

rev	line source
p@24	1 """
p@24	2 This script creates lists of audio files contained in a folder.
p@24	3
p@24	4 Source code:
p@24	5 https://github.com/sidsig/ICASSP-MLP-Code/blob/master/make_lists.py
p@24	6
p@24	7 Modified by:
p@24	8 Paulo Chiliguano
p@24	9 MSc candidate Sound and Music Computing
p@24	10 Queen Mary University of London
p@24	11 2015
p@24	12
p@24	13 References:
p@24	14 - Sigtia, S., & Dixon, S. (2014, May). Improved music feature learning with deep neural
p@24	15 networks. In Acoustics, Speech and Signal Processing (ICASSP), 2014 IEEE International
p@24	16 Conference on (pp. 6959-6963). IEEE.
p@24	17 """
p@24	18
p@24	19 import numpy
p@24	20 import numpy.random as random
p@24	21 import os
p@24	22 import pickle
p@24	23 import sys
p@24	24 import utils as U
p@24	25 #import pdb
p@24	26
def read_file(filename):
    """
    Load a text file into a list of lines.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; blank lines are kept as empty strings.

    :param filename: path of the text file to read
    :return: list of stripped lines, in file order
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, 'r') as list_file:
        return [line.strip() for line in list_file]
p@24	33
def get_folds(filelist, n_folds):
    """
    Split a list into n_folds consecutive folds.

    The first n_folds - 1 folds each hold len(filelist) // n_folds items;
    the last fold takes every remaining item, so it may be larger.

    :param filelist: sequence to partition (only sliced, never modified)
    :param n_folds: number of folds to produce
    :return: list of n_folds slices of filelist
    """
    # Floor division keeps the fold size an integer on Python 2 and 3;
    # true division would produce a float that cannot be used as a slice
    # index.
    n_per_fold = len(filelist) // n_folds
    folds = [filelist[i * n_per_fold:(i + 1) * n_per_fold]
             for i in range(n_folds - 1)]
    # The last fold absorbs the remainder of the division.
    folds.append(filelist[(n_folds - 1) * n_per_fold:])
    return folds
p@24	42
def generate_mirex_list(train_list, annotations):
    """
    Build the lines of a tab-separated "<song>\\t<annotation>" list.

    Songs that have no entry in annotations are reported on stdout and
    left out of the result.

    :param train_list: iterable of song identifiers
    :param annotations: dict mapping a song identifier to its annotation
        string
    :return: list of '<song>\\t<annotation>\\n' strings, in input order
    :raises AssertionError: if an annotation is not a string
    """
    out_list = []
    for song in train_list:
        annot = annotations.get(song)
        if annot is None:
            # Single-argument print() behaves the same on Python 2 and 3.
            print('No annotations for song %s' % song)
            continue
        assert isinstance(annot, str)
        out_list.append('%s\t%s\n' % (song, annot))

    return out_list
p@24	54
p@24	55
def make_file_list(gtzan_path, n_folds=5):
    """
    Generate the list files for a dataset folder.

    Files with the extensions .au, .mp3 and .wav are collected from
    <gtzan_path>/audio through U.getFiles, shuffled, and used to write
    the following into <gtzan_path>/lists (created if missing):

    - audio_files.txt: one collected file per line
    - ground_truth.txt: '<file>\\t<annotation>' lines, plus the files
      derived from it by generate_ground_truth_pickle
    - train/valid/test lists for a single '1_of_1' split and for each of
      the n_folds folds (see create_fold)

    :param gtzan_path: dataset root containing the 'audio' folder
    :param n_folds: number of cross-validation folds to create
    """
    audio_path = os.path.join(gtzan_path, 'audio')
    out_path = os.path.join(gtzan_path, 'lists')

    files_list = []
    for ext in ['.au', '.mp3', '.wav']:
        files_list.extend(U.getFiles(audio_path, ext))
    random.shuffle(files_list)

    if not os.path.exists(out_path):
        os.makedirs(out_path)

    audio_list_path = os.path.join(out_path, 'audio_files.txt')
    with open(audio_list_path, 'w') as audio_list_file:
        audio_list_file.writelines('%s\n' % f for f in files_list)

    annotations = get_annotations(files_list)

    ground_truth_path = os.path.join(out_path, 'ground_truth.txt')
    with open(ground_truth_path, 'w') as ground_truth_file:
        ground_truth_file.writelines(
            generate_mirex_list(files_list, annotations))
    # The file is closed (and therefore flushed) before it is read back.
    generate_ground_truth_pickle(ground_truth_path)

    folds = get_folds(files_list, n_folds=n_folds)

    # Single split for quick experiments: fold 0 is the test set and all
    # other folds form the training set.
    create_fold(0, 1, folds, annotations, out_path)

    for n in range(n_folds):
        create_fold(n, n_folds, folds, annotations, out_path)
p@24	87
p@24	88
def create_fold(n, n_folds, folds, annotations, out_path):
    """
    Write the train, validation and test list files for one fold.

    Fold n is the test set and every other fold is concatenated into the
    training set. The training list is then split in place: the first 80%
    of its lines stay in the train file and the rest go to the validation
    file.

    :param n: zero-based index of the fold used for testing
    :param n_folds: fold count used in the output file names
    :param folds: list of folds, each a list of song identifiers
    :param annotations: dict mapping a song identifier to its annotation
    :param out_path: directory where the list files are written
    """
    train_path = os.path.join(out_path, 'train_%i_of_%i.txt' % (n+1, n_folds))
    valid_path = os.path.join(out_path, 'valid_%i_of_%i.txt' % (n+1, n_folds))
    test_path = os.path.join(out_path, 'test_%i_of_%i.txt' % (n+1, n_folds))

    test_list = folds[n]
    train_list = []
    for m, fold in enumerate(folds):
        if m != n:
            train_list.extend(fold)

    # Close each file before moving on: the train list is re-read right
    # below by split_list_file, so it must be fully flushed to disk.
    with open(train_path, 'w') as train_file:
        train_file.writelines(generate_mirex_list(train_list, annotations))
    with open(test_path, 'w') as test_file:
        test_file.writelines(generate_mirex_list(test_list, annotations))
    split_list_file(train_path, train_path, valid_path, ratio=0.8)
p@24	103
def split_list_file(input_file, out_file1, out_file2, ratio=0.8):
    """
    Split the lines of a text file into two files.

    The first int(n_lines * ratio) lines go to out_file1 and the rest to
    out_file2. out_file1 may be the same path as input_file: the input is
    read completely and closed before any output file is opened.

    :param input_file: path of the file to split
    :param out_file1: path receiving the leading portion of the lines
    :param out_file2: path receiving the remaining lines
    :param ratio: fraction of the lines written to out_file1
    """
    with open(input_file, 'r') as source:
        input_list = source.readlines()

    nsplit = int(len(input_list) * ratio)

    with open(out_file1, 'w') as first_out:
        first_out.writelines(input_list[:nsplit])
    with open(out_file2, 'w') as second_out:
        second_out.writelines(input_list[nsplit:])
p@24	115
p@24	116
def get_annotation(filename):
    """
    Return the annotation (genre label) for an audio file.

    The label is the last path component of the first element returned
    by U.parseFile(filename).
    """
    # NOTE(review): presumably U.parseFile returns the containing folder
    # first, so the label is the name of the file's parent directory --
    # confirm against utils.parseFile.
    containing_dir = U.parseFile(filename)[0]
    return os.path.split(containing_dir)[-1]
p@24	120
def get_annotations(files_list):
    """
    Map every file in files_list to its annotation.

    :param files_list: iterable of audio file names
    :return: dict of {filename: get_annotation(filename)}
    """
    return {path: get_annotation(path) for path in files_list}
p@24	127
def generate_ground_truth_pickle(gt_file):
    """
    Derive the tag list and pickled tag vectors from a ground-truth file.

    gt_file holds one '<filename>\\t<tag>' entry per line. Two files are
    written next to it:

    - tags.txt: the distinct tags, sorted, one per line
    - ground_truth.pickle: a pickled dict mapping each filename to a
      numpy vector with one slot per tag (ordered as in tags.txt), set
      to 1.0 for every tag listed for that file

    :param gt_file: path of the tab-separated ground-truth file
    :raises ValueError: if a line does not split into exactly two fields
    """
    gt_path, _ = os.path.split(gt_file)
    tag_file = os.path.join(gt_path, 'tags.txt')
    gt_pickle = os.path.join(gt_path, 'ground_truth.pickle')

    with open(gt_file, 'r') as gt_handle:
        lines = gt_handle.readlines()

    # Parse every line once; the unpacking enforces the two-field format.
    entries = []
    for line in lines:
        filename, tag = line.strip().split('\t')
        entries.append((filename, tag))

    tag_list = sorted({tag for _name, tag in entries})
    with open(tag_file, 'w') as tag_handle:
        # The trailing '' makes the file end with a newline when there
        # is at least one tag.
        tag_handle.write('\n'.join(tag_list + ['']))

    tag_dict = {tag: i for i, tag in enumerate(tag_list)}
    n_tags = len(tag_dict)

    mp3_dict = {}
    for filename, tag in entries:
        # Allocate the vector only the first time a file is seen.
        if filename not in mp3_dict:
            mp3_dict[filename] = numpy.zeros(n_tags)
        if tag != '':
            mp3_dict[filename][tag_dict[tag]] = 1.

    # Pickle data is binary: the file must be opened in 'wb' mode
    # (text mode fails on Python 3).
    with open(gt_pickle, 'wb') as pickle_handle:
        pickle.dump(mp3_dict, pickle_handle)
p@24	153
# Command-line entry point: python make_lists.py gtzan_path [n_folds]
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('Usage: python %s gtzan_path [n_folds=10]' % sys.argv[0])
        # Non-zero status so that callers can detect the usage error.
        sys.exit(1)

    gtzan_path = os.path.abspath(sys.argv[1])
    # The number of folds is optional on the command line; default is 10.
    n_folds = int(sys.argv[2]) if len(sys.argv) > 2 else 10

    make_file_list(gtzan_path, n_folds)

Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information

annotate Code/time_freq_representation/make_lists.py @ 47:b0186d4a4496 tip