view Code/read_songID.py @ 18:c0a08cbdfacd

First script
author Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date Tue, 28 Jul 2015 20:58:57 +0100
parents
children e68dbee1f6db
line wrap: on
line source
import csv
import pandas as pd
import itertools

# Extract the song ID from every line of the Million Song Dataset
# songID-trackID mismatch list and write one ID per row to a new file.
# The ID is taken from the fixed character columns [8:26] of each line;
# this assumes the ID always sits at that offset -- TODO confirm against
# the mismatch file format.
# NOTE(review): the binary 'rb'/'wb' modes follow the Python 2 csv
# convention; on Python 3 the csv module expects text-mode files opened
# with newline=''.
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
    writer = csv.writer(out, delimiter=',')
    # Iterating the file object yields each line and stops at EOF, so no
    # end-of-file sentinel comparison is needed and the builtin next() is
    # not shadowed by a loop variable.
    for line in f:
        writer.writerow([line[8:26]])

# Load the Taste Profile triplets (user, song, plays) and save the list of
# distinct song IDs, one per row, without header or index.
# NOTE(review): this path uses '~' while the rest of the script uses
# absolute /homes/pchilguano paths -- confirm both refer to the same
# dataset directory.
location = r'~/dataset/train_triplets.txt'
# sep=r'\s+' splits on runs of whitespace; it is the documented equivalent
# of delim_whitespace=True, which is deprecated in recent pandas releases.
df = pd.read_csv(location, sep=r'\s+', header=None, names=['user','song','plays'])
# Keep the first row seen for each song; only the 'song' column is written.
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv', columns=['song'], header=False, index=False)

# Remove the mismatched songIDs from the Taste Profile songID list.
# Both input files are read as CSV and flattened into sets of IDs; the IDs
# present in the Taste Profile list but absent from the mismatch list are
# kept in `l3` and written out one per row.
with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as mismatch_file, \
        open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as taste_file, \
        open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as clean_file:
    # Flatten every CSV row into a single stream of fields.
    mismatched_ids = set(itertools.chain.from_iterable(csv.reader(mismatch_file)))
    taste_ids = set(itertools.chain.from_iterable(csv.reader(taste_file)))
    l3 = taste_ids - mismatched_ids
    csv.writer(clean_file, delimiter=',').writerows([song_id] for song_id in l3)

# Keep only the triplets whose song ID is in the cleaned set `l3` and
# write them to CSV without a header row or index column.
has_valid_song = df['song'].isin(l3)
mdf = df[has_valid_song]
mdf.to_csv(
    '/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
    header=False,
    index=False,
)

# Re-read the filtered triplets in 10000-row chunks, pivot each chunk into
# a user-by-song table of play counts, and stack all pivots into `result`.
#
# The pivots are collected in a list and concatenated once at the end:
# DataFrame.append is no longer available in current pandas, and appending
# inside the loop re-copied the whole accumulated frame for every chunk.
# NOTE(review): ignore_index=True replaces the 'user' index labels of each
# pivot with a plain 0..n-1 range, so rows of `result` cannot be mapped
# back to users -- confirm this is intended.
pivots = []
total_rows = 0
songs_seen = set()
for chunk in pd.read_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv', low_memory=False, chunksize=10000, names=['user','song','plays'], header=None):
    chunk = chunk[chunk.song.isin(l3)]
    pivot = chunk.pivot(index='user', columns='song', values='plays')
    pivots.append(pivot)
    # Report the running shape the stacked frame has so far: rows add up,
    # columns are the union of the songs seen in any chunk.
    total_rows += len(pivot)
    songs_seen.update(pivot.columns)
    print ((total_rows, len(songs_seen)))
# pd.concat rejects an empty list, so fall back to an empty frame when the
# input file produced no chunks.
result = pd.concat(pivots, ignore_index=True) if pivots else pd.DataFrame()