hybrid-music-recommender-using-content-based-and-social-information
view Code/read_taste_profile.py @ 8:28f15e232028
test to get audio sample URLs from 7digital
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Tue, 14 Jul 2015 23:41:55 +0100 |
parents | 4de098e10bbb |
children | 5b45b9f0540e |
line source
# Explore the Taste Profile subset (Million Song Dataset) and fetch 7digital
# preview URLs for each song via The Echo Nest API (Python 2 / pyechonest).
import os
import csv
import pandas as pd
import numpy as np
import itertools
import time

# List of h5 files (audio streams)
#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
#    writer = csv.writer(out, delimiter=',')
#    for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
#        for file in files:
#            if file.endswith(".h5"):
#                #print(os.path.join(root, file))
#                track = ''.join(['SO', str(file)[2:-3]])
#                print(track)
#                writer.writerow([track])

#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as f:
#    reader = csv.reader(f)
#    your_list = list(reader)
#    your_list.sort()
#    chain = list(itertools.chain(*your_list))

#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')

# Read the Taste Profile triplets (user, song, play count) in chunks and
# pivot each chunk into a user-by-song play-count matrix.
location = r'~/dataset/train_triplets.txt'
chunksize = 10000
count = 0
frame = pd.DataFrame()
for chunk in pd.read_csv(location, delim_whitespace=True, header=None,
                         names=['user', 'song', 'plays'], chunksize=chunksize):
    #chunk.sort(columns='song')
    #frame = chunk.query('song == your_list')
    frame = frame.append(chunk.pivot(index='user', columns='song', values='plays'),
                         ignore_index=True)
    count = count + 1
    print(count)
    #for item in your_list:
    #    chunk['song'].isin(item)
    #    store.append('df', chunk[chunk['song'].isin(item)])
#store.close()

# Write the list of unique song IDs found in the triplets file.
df = pd.read_csv(location, delim_whitespace=True, header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('train_triplets_song.csv', columns=['song'], header=False, index=False)

# Extract the song ID (characters 8-25 of each line) from the list of known
# song/track mismatches.
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
    writer = csv.writer(out, delimiter=',')
    line = f.readline()
    while line != "":
        writer.writerow([line[8:26]])
        print(line[8:26])
        line = f.readline()
#mismatch.to_csv('sid_mismatches_song.csv', columns=1, header=False, index=False)

# Query The Echo Nest for each song ID and store its 7digital-UK preview URL.
from pyechonest import song, config
config.ECHO_NEST_API_KEY = "LINDFDUTQZQ781IE8"
with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as input, \
        open('/homes/pchilguano/dataset/test_echonest_url.txt', 'wb') as output:
    writer = csv.writer(output, delimiter=',')
    line = input.readline()
    while line != "":
        time.sleep(1)  # throttle requests to respect the API rate limit
        s = song.Song(line[:-2])  # drop the trailing end-of-line characters
        time.sleep(1)
        ss_tracks = s.get_tracks('7digital-UK')
        if len(ss_tracks) != 0:
            ss_track = ss_tracks[0]
            preview_url = ss_track.get('preview_url')
            print(preview_url)
            writer.writerow([line[:-2], preview_url])
        line = input.readline()

# Scratch: chunked aggregation over the HDFStore built above (requires the
# commented-out 'store' section to be enabled).  NOTE: the original called
# groupby() with no arguments; grouping by 'song' is assumed here.
df = store['df']
chunks = (df.groupby('song').sum() for df in store.select('df', chunksize=500))
res = next(chunks)  # will raise if there are no chunks!
for c in chunks:
    res = res.add(c, fill_value=0)
sdf = df.to_sparse()
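The chunked pivot above relies on `DataFrame.append` and `to_sparse`, both of which were removed in later pandas releases. Below is a minimal sketch of the same user-song play-count aggregation with current pandas; the file path and column names come from the script, while the chunk size, the groupby-then-unstack approach, and the `SparseDtype` conversion are assumptions, not part of the original code.

    import pandas as pd

    location = '~/dataset/train_triplets.txt'  # path taken from the script above

    # Sum plays per (user, song) pair chunk by chunk, then pivot once at the
    # end instead of appending pivoted chunks to a growing DataFrame.
    totals = None
    for chunk in pd.read_csv(location, sep=r'\s+', header=None,
                             names=['user', 'song', 'plays'], chunksize=100000):
        counts = chunk.groupby(['user', 'song'])['plays'].sum()
        totals = counts if totals is None else totals.add(counts, fill_value=0)

    # User-by-song matrix stored with a sparse dtype (replaces to_sparse()).
    matrix = totals.unstack(fill_value=0).astype(pd.SparseDtype('float', 0))
    print(matrix.shape)

Accumulating per-pair sums keeps memory proportional to the number of observed (user, song) pairs rather than to the dense matrix, and the sparse conversion happens only once at the end.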