view Code/read_taste_profile.py @ 11:38f44dd7e54b

Creating subset from Taste Profile. Retrieving mp3 from 7Digital successfully. Some errors due to Unicode encoding of Artist and Song metadata.
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Fri, 17 Jul 2015 21:52:18 +0100
parents cc503565339e
children
line wrap: on
line source
import csv
import pandas as pd
import itertools

# Read songIDs from Million Song Dataset songID-trackID mismatches
#
# Every input line contributes the 18 bytes at offsets 8..25 as one
# single-column CSV row.  (Presumably the lines look like
# "ERROR: <SONG_ID TRACK_ID> ..." -- confirm against the dataset docs.)
#
# The input stays in binary mode so the slice is a byte offset and any
# non-ASCII bytes elsewhere on the line can never raise a decode error.
# The output is a text-mode file opened with newline='' as the csv module
# requires on Python 3.
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'w',
             newline='', encoding='utf-8') as out:
    writer = csv.writer(out, delimiter=',')
    for raw_line in f:
        song_id = raw_line[8:26].decode('ascii', errors='replace')
        # Blank or truncated lines cannot hold a full 18-character ID.
        if len(song_id) < 18:
            continue
        writer.writerow([song_id])

# Read unique songIDs from Taste Profile dataset
#
# The triplets file is whitespace-separated with no header row and is read
# as (user, song, plays).  Only the song column is used, so the other two
# are skipped at parse time via usecols to keep memory down.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True.
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(
    location,
    sep=r'\s+',
    header=None,
    names=['user', 'song', 'plays'],
    usecols=['song'],
)
# Keeps the first occurrence of each song ID, in file order.
ddf = df.drop_duplicates(subset='song')
# Written to the dataset directory rather than the current working
# directory: the mismatch-filtering step of this script opens the file by
# this absolute path.
ddf.to_csv(
    '/homes/pchilguano/dataset/train_triplets_songID.csv',
    columns=['song'],
    header=False,
    index=False,
)

# Delete songIDs mismatches from Taste Profile Dataset
#
# Reads both ID lists as CSV, flattens every field of every row into a set,
# and writes the Taste Profile IDs that are absent from the mismatch list,
# one per row.  Files are opened in text mode with newline='' as the csv
# module requires on Python 3.
with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt',
          newline='', encoding='utf-8') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_songID.csv',
             newline='', encoding='utf-8') as input2, \
        open('/homes/pchilguano/dataset/echonest_songID.txt', 'w',
             newline='', encoding='utf-8') as myfile:
    mismatched_ids = set(itertools.chain.from_iterable(csv.reader(input1)))
    taste_ids = set(itertools.chain.from_iterable(csv.reader(input2)))
    wr = csv.writer(myfile, delimiter=',')
    # Set iteration order varies between runs; sorting makes the output
    # file reproducible.
    wr.writerows([song_id] for song_id in sorted(taste_ids - mismatched_ids))