Mercurial > hg > hybrid-music-recommender-using-content-based-and-social-information
changeset 18:c0a08cbdfacd
First script
author | Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk> |
---|---|
date | Tue, 28 Jul 2015 20:58:57 +0100 |
parents | c63dac455296 |
children | f1504bb2c552 |
files | Code/read_songID.py |
diffstat | 1 files changed, 42 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Remove song-ID mismatches from the Taste Profile triplets.

Script body of ``Code/read_songID.py``.  It runs five steps in order:

1. Copy the song-ID field of every line of ``sid_mismatches.txt`` into
   ``sid_mismatches_songID.txt`` (one ID per row).
2. Load the Taste Profile triplets (user, song, plays) and save the
   de-duplicated song column to ``train_triplets_songID.csv``.
3. Subtract the mismatched IDs from the Taste Profile IDs and save the
   remainder to ``echonest_songID.txt``.
4. Save the triplets whose song survived step 3 to
   ``train_triplets_wo_mismatches.csv``.
5. Re-read that file in chunks, pivot each chunk into a user x song table
   of play counts, stack the tables and print the resulting shape.

All input and output locations are hard-coded below.
"""
import csv
import itertools

import pandas as pd

# Read songIDs from Million Song Dataset songID-trackID mismatches.
# The input is opened in binary mode so that only the ID slice has to be
# decoded; nothing is assumed about the encoding of the rest of each line.
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'w',
             newline='', encoding='utf-8') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        # Bytes 8..25 of each line hold the song ID.
        # NOTE(review): assumes the ID field is pure ASCII -- confirm.
        writer.writerow([line[8:26].decode('ascii')])

# Read unique songIDs from Taste Profile dataset
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, sep=r'\s+', header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
           columns=['song'], header=False, index=False)

# Delete songIDs mismatches from Taste Profile Dataset
# csv files are opened with newline='' as the csv module requires.
with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt',
          newline='', encoding='utf-8') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_songID.csv',
             newline='', encoding='utf-8') as input2, \
        open('/homes/pchilguano/dataset/echonest_songID.txt', 'w',
             newline='', encoding='utf-8') as myfile:
    # Flatten every row of each file into a single set of IDs.
    mismatched_ids = set(itertools.chain.from_iterable(csv.reader(input1)))
    profile_ids = set(itertools.chain.from_iterable(csv.reader(input2)))
    l3 = profile_ids - mismatched_ids
    wr = csv.writer(myfile, delimiter=',')
    # Sorted so that the output file is identical from run to run.
    wr.writerows([item] for item in sorted(l3))

# Save Taste Profile dataset without SongID mismatches
mdf = df[df.song.isin(l3)]
mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
           header=False, index=False)

# Pivot the filtered triplets chunk by chunk and stack the pieces.
# The pieces are concatenated once at the end instead of growing a
# DataFrame inside the loop, which copies all accumulated rows each time.
pieces = []
for chunk in pd.read_csv(
        '/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
        low_memory=False, chunksize=10000,
        names=['user', 'song', 'plays'], header=None):
    chunk = chunk[chunk.song.isin(l3)]
    pieces.append(chunk.pivot(index='user', columns='song', values='plays'))

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(pieces, ignore_index=True) if pieces else pd.DataFrame()
print(result.shape)