hybrid-music-recommender-using-content-based-and-social-information
view Code/read_songID.py @ 21:e68dbee1f6db
Modified code
New datasets
Updated report
author   Paulo Chiliguano <p.e.chiilguano@se14.qmul.ac.uk>
date     Tue, 11 Aug 2015 10:50:36 +0100
parents  c0a08cbdfacd
children (none)
line source
import time
import pandas as pd

# Read songIDs from the Million Song Dataset songID-trackID mismatches file
start_time = time.time()
print 'Reading songID mismatches...'
filename = '/Users/paulochiliguano/Documents/msc-project/Dataset/\
sid_mismatches.txt'
mismatches = set()
with open(filename, 'rb') as f:
    for line in f:
        # The songID occupies a fixed-width field (characters 8-25)
        # of each mismatch record
        mismatches.add(line[8:26])

# Drop rows whose songID is a known mismatch from the Taste Profile subset
print 'Reading Taste Profile subset...'
filename = '/Users/paulochiliguano/Documents/msc-project/Dataset/\
train_triplets.txt'
filename_out = '/Users/paulochiliguano/Documents/msc-project/Dataset/\
train_triplets_wo_mismatches.csv'
for chunk in pd.read_csv(
        filename,
        low_memory=False,
        delim_whitespace=True,
        chunksize=20000,
        names=['user', 'song', 'plays'],
        header=None):
    chunk = chunk[~chunk.song.isin(mismatches)]
    chunk.to_csv(filename_out, mode='a', header=False, index=False)
elapsed_time = time.time() - start_time
print 'Execution time: %.3f seconds' % elapsed_time

# Select the most active users (more than 1000 songs played)
start_time = time.time()
print 'Reading (filtered) Taste Profile subset...'
df = pd.read_csv(
    filename_out,
    delim_whitespace=False,
    header=None,
    names=['user', 'song', 'plays'])
print 'Selecting most active users (> 1000 songs played)...'
df_active = df.groupby('user').filter(lambda x: len(x) > 1000)

# Keep only the 1500 most played songs among the active users
print 'Reducing Taste Profile subset to 1500 songs...'
counts = df_active['song'].value_counts().head(1500)
df_active = df_active.loc[df_active['song'].isin(counts.index), :]
df_active.to_pickle('/Users/paulochiliguano/Documents/msc-project/Dataset/\
CF_dataset.pkl')

# Save the retained songIDs, one per line
filename = '/Users/paulochiliguano/Documents/msc-project/Dataset/\
CF_dataset_songID.txt'
with open(filename, 'wb') as f:
    for item in counts.index.tolist():
        f.write("%s\n" % item)
elapsed_time = time.time() - start_time
print 'Execution time: %.3f seconds' % elapsed_time

# Legacy exploration code, kept for reference (would also need
# `import csv` and `import itertools`):
'''
#df['user'].value_counts().head(50)
ddf = df.drop_duplicates(subset='song')
ddf.to_csv('/homes/pchilguano/dataset/train_triplets_songID.csv',
           columns=['song'], header=False, index=False)

with open('/homes/pchilguano/dataset/sid_mismatches_songID.txt', 'rb') as input1, \
     open('/homes/pchilguano/dataset/train_triplets_songID.csv', 'rb') as input2, \
     open('/homes/pchilguano/dataset/echonest_songID.txt', 'wb') as myfile:
    l1 = list(csv.reader(input1))
    chain1 = list(itertools.chain(*l1))
    l2 = list(csv.reader(input2))
    chain2 = list(itertools.chain(*l2))
    l3 = set(chain2) - set(chain1)
    wr = csv.writer(myfile, delimiter=',')
    for item in l3:
        wr.writerow([item])

# Save Taste Profile dataset without songID mismatches
mdf = df[df.song.isin(l3)]
mdf.to_csv('/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
           header=False, index=False)

result = pd.DataFrame()
for chunk in pd.read_csv(
        '/homes/pchilguano/dataset/train_triplets_wo_mismatches.csv',
        low_memory=False,
        delim_whitespace=False,
        chunksize=10000,
        names=['user', 'song', 'plays'],
        header=None):
    chunk = chunk[chunk.song.isin(l3)]
    result = result.append(
        chunk.pivot(index='user', columns='song', values='plays'),
        ignore_index=True)
print (result.shape)
'''
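As a quick sanity check (not part of the committed script; the variable names cf and matrix are illustrative), the pickled CF_dataset.pkl written above can be loaded back and pivoted into a user-by-song play-count matrix for the collaborative-filtering stage:

import pandas as pd

# Illustrative sketch: load the filtered subset produced by read_songID.py
# and build a user-by-song matrix (rows: active users, columns: the 1500
# retained songIDs, values: play counts; missing pairs filled with 0).
cf = pd.read_pickle('/Users/paulochiliguano/Documents/msc-project/Dataset/CF_dataset.pkl')
matrix = cf.pivot_table(index='user', columns='song', values='plays', fill_value=0)
print matrix.shape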