view Code/read_taste_profile.py @ 8:28f15e232028

test to get audio sample URLs from 7digital
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Tue, 14 Jul 2015 23:41:55 +0100
parents 4de098e10bbb
children 5b45b9f0540e
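# Builds a user-song play-count matrix from the Echo Nest Taste Profile
# triplets, extracts the distinct song IDs, collects the song IDs listed in
# sid_mismatches.txt, and fetches 7digital-UK preview URLs for a test set of
# songs through the Echo Nest API (pyechonest).
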
import os
import csv
import pandas as pd
import numpy as np
import itertools
import time

# List of h5 files (audio streams)
#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'wb') as out:
#	writer = csv.writer(out, delimiter=',')
#	for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
#		for file in files:
#			if file.endswith(".h5"):
#				#print(os.path.join(root, file))
#				track = ''.join(['SO',str(file)[2:-3]])
#				print(track)
#				writer.writerow([track])
				
#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'rb') as f:
#	reader = csv.reader(f)
#	your_list = list(reader)
#	your_list.sort()
#	chain = list(itertools.chain(*your_list))


#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')
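# Read the Taste Profile triplets (user, song, play count) in chunks and
# pivot them into a user x song play-count matrix.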
location = r'~/dataset/train_triplets.txt'
chunksize = 10000
count = 0
pivoted_chunks = []
for chunk in pd.read_csv(location, delim_whitespace=True, header=None, names=['user','song','plays'], chunksize=chunksize):
	#chunk.sort(columns='song')
	#frame = chunk.query('song == your_list')
	# Pivot each chunk into a user x song play-count matrix; keep the user
	# index so each row stays associated with its user.
	pivoted_chunks.append(chunk.pivot(index='user', columns='song', values='plays'))
	count += 1
	print(count)
	#for item in your_list:
		#chunk['song'].isin(item)
	#store.append('df', chunk[chunk['song'].isin(item)])
#store.close()
# Stack the per-chunk matrices; columns are aligned and missing songs become NaN.
frame = pd.concat(pivoted_chunks)
	
	
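# Write the list of distinct song IDs found in the triplets to
# train_triplets_song.csv.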
df = pd.read_csv(location, delim_whitespace=True, header=None, names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('train_triplets_song.csv', columns=['song'], header=False, index=False)

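# Collect the song IDs flagged in the Million Song Dataset song-track
# mismatch list (sid_mismatches.txt).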
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'wb') as out:
	writer = csv.writer(out, delimiter=',')
	# Characters 8:26 of each line hold the 18-character song ID.
	for line in f:
		writer.writerow([line[8:26]])
		print(line[8:26])
#mismatch.to_csv('sid_mismatches_song.csv',columns=1, header=False, index=False)


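# Look up each test song ID with pyechonest and record the URL of its
# 7digital-UK preview clip.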
from pyechonest import song, config
config.ECHO_NEST_API_KEY = "LINDFDUTQZQ781IE8"
with open('/homes/pchilguano/dataset/test_echonest.txt', 'rb') as infile, open('/homes/pchilguano/dataset/test_echonest_url.txt', 'wb') as output:
	writer = csv.writer(output, delimiter=',')
	for line in infile:
		song_id = line.rstrip('\r\n')  # one Echo Nest song ID per line
		time.sleep(1)  # throttle requests to stay within the API rate limit
		s = song.Song(song_id)
		time.sleep(1)
		ss_tracks = s.get_tracks('7digital-UK')
		if len(ss_tracks) != 0:
			ss_track = ss_tracks[0]
			preview_url = ss_track.get('preview_url')
			print(preview_url)
			writer.writerow([song_id, preview_url])
	
	
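# Out-of-core aggregation over the HDFStore ('store') built by the
# commented-out section above; this only runs if that section is re-enabled.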
df = store['df']
# Sum the play counts chunk by chunk; grouping on ['user', 'song'] here is an
# assumption about the intended aggregation key.
chunks = (chunk.groupby(['user', 'song']).sum() for chunk in store.select('df', chunksize=500))
res = next(chunks)  # will raise StopIteration if there are no chunks!
for c in chunks:
    res = res.add(c, fill_value=0)


sdf = df.to_sparse()  # sparse representation of the play-count frame