view Code/read_taste_profile.py @ 10:cc503565339e
Downloading a preview mp3 subset from 7digital
| author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
|---|---|
| date | Wed, 15 Jul 2015 19:25:16 +0100 |
| parents | 5b45b9f0540e |
| children | 38f44dd7e54b |
line source
import os
import csv
import itertools
import time

import numpy as np
import pandas as pd

# List of h5 files (audio streams)
#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
#    writer = csv.writer(out, delimiter=',')
#    for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
#        for file in files:
#            if file.endswith(".h5"):
#                # Derive the song ID from the file name, e.g. TR....h5 -> SO....
#                track = ''.join(['SO', str(file)[2:-3]])
#                print(track)
#                writer.writerow([track])

# Read song IDs from the Million Song Dataset songID-trackID mismatch list;
# each line carries the song ID at a fixed offset (characters 8-25).
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        writer.writerow([line[8:26]])

# Read unique song IDs from the Taste Profile dataset (user, song, play count).
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, delim_whitespace=True, header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('train_triplets_song.csv', columns=['song'], header=False, index=False)

# Compare the mismatch list against the Taste Profile song IDs and write out
# the song IDs that appear only in the mismatch list.
with open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'rb') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_song.csv', 'rb') as input2, \
        open('/homes/pchilguano/dataset/test_echonest1.txt', 'wb') as output:
    l1 = sorted(csv.reader(input1))
    l2 = sorted(csv.reader(input2))
    l3 = [x for x in l1 if x not in l2]
    csv.writer(output).writerows(l3)

# Load the curated list of song IDs and flatten it into a single list.
with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as f:
    your_list = sorted(csv.reader(f))
chain = list(itertools.chain(*your_list))

# Build a user-by-song play-count matrix in chunks, keeping only songs from
# the curated list, and append the filtered triplets to an HDF5 store.
store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
chunksize = 20000
count = 0
frame = pd.DataFrame()
for chunk in pd.read_csv(location, delim_whitespace=True, header=None,
                         names=['user', 'song', 'plays'], chunksize=chunksize):
    chunk = chunk.sort_values(by='song')
    chunk = chunk[chunk['song'].isin(chain)]
    frame = frame.append(chunk.pivot(index='user', columns='song', values='plays'),
                         ignore_index=True)
    store.append('df', chunk)
    count += 1
    print([count, frame.shape])

# Out-of-core aggregation: read the stored triplets back in small chunks and
# accumulate the partial sums. NOTE: the grouping key is assumed to be 'song'
# (the original called groupby() with no key).
df = store['df']
chunks = (part.groupby('song')['plays'].sum()
          for part in store.select('df', chunksize=500))
res = next(chunks)  # will raise if there are no chunks!
for c in chunks:
    res = res.add(c, fill_value=0)
store.close()
sdf = df.to_sparse()
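# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: the tail of the file
# above follows the common pandas recipe for out-of-core aggregation with an
# HDFStore. A minimal self-contained version is shown here; the file name
# 'demo_store.h5', the table name 'df', the toy data, and the grouping key
# 'song' are all assumptions chosen to mirror the triplet data used above.
import pandas as pd

demo = pd.DataFrame({'user': ['u1', 'u1', 'u2', 'u2'],
                     'song': ['s1', 's2', 's1', 's3'],
                     'plays': [3, 1, 2, 5]})
with pd.HDFStore('demo_store.h5') as demo_store:
    # 'table' format supports appending and chunked reads via select().
    demo_store.put('df', demo, format='table')
    # Read the table back in small pieces and aggregate each piece on its own.
    pieces = (part.groupby('song')['plays'].sum()
              for part in demo_store.select('df', chunksize=2))
    total = next(pieces)  # raises StopIteration if the table is empty
    for part in pieces:
        # Combine partial sums; songs absent from a piece count as zero.
        total = total.add(part, fill_value=0)
print(total)  # plays per song across all chunks: s1 -> 5, s2 -> 1, s3 -> 5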