# Code/read_taste_profile.py
"""Downloading a preview mp3 subset from 7digital

Author: Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
Date:   Wed, 15 Jul 2015 19:25:16 +0100
"""

import os
import csv
import pandas as pd
import numpy as np
import itertools
import time

# List of h5 files (audio streams)
#with open('/homes/pchilguano/dataset/cal10kHDF5.csv', 'w', newline='') as out:
#    writer = csv.writer(out, delimiter=',')
#    for root, dirs, files in os.walk("/homes/pchilguano/dataset/cal10k"):
#        for file in files:
#            if file.endswith(".h5"):
#                #print(os.path.join(root, file))
#                track = ''.join(['SO', str(file)[2:-3]])
#                print(track)
#                writer.writerow([track])
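# (Commented out above: each .h5 filename loses its first two characters
# and the '.h5' suffix, and gains an 'SO' prefix to form a song-style ID.)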

# Read songIDs from Million Song Dataset songID-trackID mismatches 
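# Each line of sid_mismatches.txt has the form
#   ERROR: <SOXXXXXXXXXXXXXXXX TRXXXXXXXXXXXXXXXX> artist - title
# so characters 8:26 hold the 18-character song ID.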
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'r') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'w', newline='') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        writer.writerow([line[8:26]])
        #print(line[8:26])

# Read unique songIDs from Taste Profile dataset
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, sep=r'\s+', header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('train_triplets_song.csv', columns=['song'], header=False, index=False)
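# drop_duplicates keeps the first occurrence of each song ID, so the CSV
# holds one 'SO...' ID per line; this is the list compared against the
# mismatches below.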

# Keep the Taste Profile songs whose IDs do not appear in the mismatch
# list, and write the surviving IDs to disk.
with open('/homes/pchilguano/dataset/sid_mismatches_song.txt', 'r') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_song.csv', 'r') as input2, \
        open('/homes/pchilguano/dataset/test_echonest.txt', 'w', newline='') as output:
    mismatches = {row[0] for row in csv.reader(input1) if row}
    songs = sorted(row[0] for row in csv.reader(input2) if row)
    valid = [s for s in songs if s not in mismatches]
    writer = csv.writer(output)
    for song in valid:
        writer.writerow([song])
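
# Toy illustration of the filter above (hypothetical IDs):
#   mismatches = {'SOAAA'} and songs = ['SOAAA', 'SOBBB'] give
#   valid == ['SOBBB']
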
with open('/homes/pchilguano/dataset/test_echonest.txt', 'r') as f:
    reader = csv.reader(f)
    rows = sorted(reader)
    chain = list(itertools.chain(*rows))
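# csv.reader yields one-element rows, so itertools.chain(*rows) flattens
# them into a plain list of song IDs, e.g. [['SOAAA'], ['SOBBB']] becomes
# ['SOAAA', 'SOBBB'].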


#store = pd.HDFStore('/homes/pchilguano/dataset/store.h5')

chunksize = 20000
count = 0
frame = pd.DataFrame()
song_set = set(chain)  # set membership keeps the isin filter fast
for chunk in pd.read_csv(location, sep=r'\s+', header=None,
                         names=['user', 'song', 'plays'], chunksize=chunksize):
    chunk = chunk.sort_values('song')
    chunk = chunk[chunk['song'].isin(song_set)]
    #frame = chunk.query('song == rows')
    # Pivot the chunk into a user-by-song matrix of play counts; 'user'
    # stays as the index so duplicate users can be merged afterwards.
    frame = pd.concat([frame, chunk.pivot(index='user', columns='song', values='plays')])
    count += 1
    print([count, frame.shape])
    #for item in rows:
    #    chunk = chunk[chunk['song'].isin(item)]
    #store.append('df', chunk[chunk['song'].isin(item)])
#store.close()
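
# After the loop, 'frame' stacks one user-by-song block per chunk; a user
# whose triplets straddle chunk boundaries appears in several rows, which
# the groupby below collapses.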
# Out-of-core alternative if the filtered chunks were appended to the
# HDFStore above:
#df = store['df']
#chunks = (df.groupby(level=0).sum() for df in store.select('df', chunksize=500))
#res = next(chunks)  # will raise if there are no chunks!
#for c in chunks:
#    res = res.add(c, fill_value=0)

# Merge duplicate users into one user-by-song play-count matrix.
res = frame.groupby(level=0).sum()
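
# In the chunked variant, add(..., fill_value=0) treats a user or song
# missing from one partial sum as zero plays, so no counts are lost when
# the partial results are combined.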


# Most user-song pairs are absent, so a sparse dtype saves memory.
sdf = res.astype(pd.SparseDtype("float"))
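# The sparse accessor reports how full the matrix actually is, e.g.
#   print(sdf.sparse.density)  # fraction of entries that are non-NaN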