Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
view Code/read_songID.py @ 18:c0a08cbdfacd
First script
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Tue, 28 Jul 2015 20:58:57 +0100 |
parents | |
children | e68dbee1f6db |
line wrap: on
line source
"""Build the song-ID lists and the cleaned Taste Profile triplets.

Stages, in order:

1. Extract the song ID from every line of the Million Song Dataset
   songID-trackID mismatch list and write one ID per row.
2. Write the unique song IDs found in the Taste Profile triplets.
3. Write the Taste Profile song IDs that are not in the mismatch list.
4. Write the triplets restricted to those song IDs, re-read that file in
   chunks, pivot each chunk into a user x song table of play counts and
   print the shape of the concatenated result.

Requires Python 3: the csv files are opened in text mode with newline='',
as the csv module expects.
"""
import csv
import itertools

import pandas as pd

MISMATCHES_PATH = '/homes/pchilguano/dataset/sid_mismatches.txt'
MISMATCH_IDS_PATH = '/homes/pchilguano/dataset/sid_mismatches_songID.txt'
# NOTE(review): this is the only path given relative to the home directory;
# presumably it is the same directory as the absolute paths -- confirm.
TRIPLETS_PATH = r'~/dataset/train_triplets.txt'
TRIPLET_IDS_PATH = '/homes/pchilguano/dataset/train_triplets_songID.csv'
ECHONEST_IDS_PATH = '/homes/pchilguano/dataset/echonest_songID.txt'
CLEAN_TRIPLETS_PATH = (
    '/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv'
)
TRIPLET_COLUMNS = ['user', 'song', 'plays']

# Read songIDs from Million Song Dataset songID-trackID mismatches.
# The source is read as bytes and only the ID slice is decoded, so whatever
# text follows the ID on a line cannot trigger a decoding error.
with open(MISMATCHES_PATH, 'rb') as f, \
        open(MISMATCH_IDS_PATH, 'w', newline='') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        # Bytes 8..25 of each line are taken as the song ID.
        writer.writerow([line[8:26].decode('ascii')])

# Read unique songIDs from Taste Profile dataset.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True.
df = pd.read_csv(
    TRIPLETS_PATH,
    sep=r'\s+',
    header=None,
    names=TRIPLET_COLUMNS,
)
ddf = df.drop_duplicates(subset='song')
ddf.to_csv(TRIPLET_IDS_PATH, columns=['song'], header=False, index=False)

# Delete songIDs mismatches from Taste Profile Dataset.
with open(MISMATCH_IDS_PATH, newline='') as mismatch_file, \
        open(TRIPLET_IDS_PATH, newline='') as triplet_file, \
        open(ECHONEST_IDS_PATH, 'w', newline='') as out:
    # Every field of every row counts as an ID, so flatten the rows.
    mismatched_ids = set(
        itertools.chain.from_iterable(csv.reader(mismatch_file))
    )
    triplet_ids = set(
        itertools.chain.from_iterable(csv.reader(triplet_file))
    )
    valid_song_ids = triplet_ids - mismatched_ids
    writer = csv.writer(out, delimiter=',')
    # Sorted so the output file is identical from one run to the next;
    # set iteration order is not stable across interpreter runs.
    writer.writerows([song_id] for song_id in sorted(valid_song_ids))

# Save Taste Profile dataset without SongID mismatches.
mdf = df[df.song.isin(valid_song_ids)]
mdf.to_csv(CLEAN_TRIPLETS_PATH, header=False, index=False)

# Pivot the cleaned triplets chunk by chunk and stack the pieces once at the
# end; growing a DataFrame inside the loop re-copies it on every iteration
# (and DataFrame.append no longer exists in pandas >= 2.0).
pivots = []
for chunk in pd.read_csv(
        CLEAN_TRIPLETS_PATH,
        low_memory=False,
        chunksize=10000,
        names=TRIPLET_COLUMNS,
        header=None):
    chunk = chunk[chunk.song.isin(valid_song_ids)]
    pivots.append(chunk.pivot(index='user', columns='song', values='plays'))

# NOTE(review): ignore_index=True replaces the user index of every pivot
# with a running integer, so rows of `result` are not labelled by user and a
# user whose plays fall into two chunks gets two rows. Only the shape is
# used here.
result = pd.concat(pivots, ignore_index=True) if pivots else pd.DataFrame()
print(result.shape)