Code/read_taste_profile.py @ 13:0a0d6203638a (repository: hybrid-music-recommender-using-content-based-and-social-information)
Working on WMF
author    Paulo Chiliguano <p.e.chiliguano@se14.qmul.ac.uk>
date      Tue, 21 Jul 2015 10:03:34 +0100
parents   38f44dd7e54b
children
import csv
import itertools

import pandas as pd

# Extract the song IDs from the Million Song Dataset songID-trackID mismatch
# list: the 18-character song ID sits at a fixed offset (characters 8-26) of
# each mismatch record.
with open('/homes/pchilguano/dataset/sid_mismatches.txt', 'rb') as f, \
        open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'wb') as out:
    writer = csv.writer(out, delimiter=',')
    for line in f:
        writer.writerow([line[8:26]])

# Read the Taste Profile triplets (user, song, play count) and keep the
# unique song IDs.
location = r'~/dataset/train_triplets.txt'
df = pd.read_csv(location, delim_whitespace=True, header=None,
                 names=['user', 'song', 'plays'])
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('train_triplets_songID.csv', columns=['song'],
           header=False, index=False)

# Remove the mismatched song IDs from the Taste Profile song list and write
# the remaining (valid) IDs to echonest_songID.txt.
with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, \
        open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, \
        open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
    mismatched = set(itertools.chain(*csv.reader(input1)))
    taste_profile = set(itertools.chain(*csv.reader(input2)))
    valid = taste_profile - mismatched
    wr = csv.writer(myfile, delimiter=',')
    for item in valid:
        wr.writerow([item])
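The set written to echonest_songID.txt is the list of Taste Profile song IDs that survive the mismatch filter. A minimal sketch of how that list might be applied downstream, filtering the raw triplets before building the play-count matrix for WMF, is shown below; the filtering step and the output name train_triplets_clean.txt are illustrative assumptions, not part of this revision.

import pandas as pd

# Load the cleaned song ID list produced by the script above (assumed path).
valid_ids = set(pd.read_csv('/homes/pchilguano/dataset/echonest_songID.txt',
                            header=None, names=['song'])['song'])

# Reload the full Taste Profile triplets and keep only valid songs.
triplets = pd.read_csv(r'~/dataset/train_triplets.txt', delim_whitespace=True,
                       header=None, names=['user', 'song', 'plays'])
clean = triplets[triplets['song'].isin(valid_ids)]

# Hypothetical output used later for the user-item play-count matrix.
clean.to_csv('train_triplets_clean.txt', sep='\t', header=False, index=False)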