Mercurial > hg > horiscopes
view V3/runme.py @ 11:903559cb34d0
modify horiscope parse, to ensure that missing people get noted
author | DaveM |
---|---|
date | Tue, 23 Jan 2018 14:29:54 +0000 |
parents | 85c9aa9d90c5 |
children | 18e337b2550d |
line wrap: on
line source
#!/usr/bin/env python
"""Fetch love-compatibility charts from astro-seek and cache the parsed data.

Workflow (see ``_main``): people are read from ``individuals.csv`` through
``dParse`` (or from a pickle cache if one exists), one compatibility page is
requested per person, the aspect table on each page is parsed, and the
collected records are pickled and then flattened by ``presentResults``.

This script targets Python 2 (``HTMLParser`` import, byte-string handling in
``parsePage``).
"""
import dParse as dp
import requests
import re
import time
import csv
import random
import pdb
import os
import pickle
from HTMLParser import HTMLParser
# from lxml import html
from bs4 import BeautifulSoup

# Columns that printToFile() leaves out of the CSV when removeAdds is set.
_ADDED_FIELDS = ('DOB', 'TOB', 'pDOB', 'pTOB', 'COB', 'pCOB', 'horiscope')

# Applied to str() of the middle <td> of an aspect row.  "\xc2\xb0" is the
# UTF-8 byte pair of the degree sign; groups 2 and 3 are the digit runs on
# either side of it, group 1 is the text in front of the <br/>.
_ASPECT_RE = re.compile(
    "\/>(.*)<br/>.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)")


def parsePage(resp):
    """Parse the aspect rows out of a compatibility result page.

    resp -- response object whose ``content`` holds the page HTML.

    Returns a dict mapping ``(C, D)`` -- the UTF-8 encoded <strong> texts of
    the first and third cell of a row -- to a 3-tuple of the regex groups
    found in the second cell.  Rows that do not have the expected shape
    (missing <strong>, no regex match) are skipped, so a page without usable
    rows yields an empty dict rather than an exception.
    """
    person = dict()
    soup = BeautifulSoup(resp.content, 'lxml')
    for cell in soup.find_all('div', attrs={'class': 'tc'}):
        tableCell = cell.find_all('td')
        if len(tableCell) <= 2:
            continue
        first = tableCell[0].strong
        second = tableCell[2].strong
        match = _ASPECT_RE.search(str(tableCell[1]))
        if first is None or second is None or match is None:
            # Not an aspect row in the layout we understand; ignore it.
            continue
        C = first.contents[0].encode('utf-8')
        D = second.contents[0].encode('utf-8')
        person[(C, D)] = (match.group(1), match.group(2), match.group(3))
    return person


def setURL(p):
    """Return ``(url, payload)`` for person *p*; payload comes from dParse."""
    url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
    payload = dp.makePayload(p)
    return (url, payload)


def requestURL(url, payload):
    """GET *url* with *payload* as query parameters and return the response.

    Sleeps 5 seconds after every request (presumably to throttle the
    scraping -- the reason is not stated in the code).
    """
    r = requests.get(url, params=payload)
    time.sleep(5)
    return r


def makeURLPayload(url, payload):
    """Return *url* with *payload* appended as a query string.

    The result has the form ``url?&k1=v1&k2=v2`` (note the ``&`` directly
    after ``?``); values are not URL-escaped.
    """
    query = ''.join('&' + str(p) + '=' + str(payload[p]) for p in payload)
    return url + '?' + query


def printToFile(filename, data, removeAdds):
    """Write a list of dicts to *filename* as CSV.

    filename   -- output path.
    data       -- list of dict records; the header is the sorted union of
                  their keys.
    removeAdds -- when true, the columns named in ``_ADDED_FIELDS`` are left
                  out of the file.  The records themselves are not modified.
    """
    keys = []
    for d in data:
        keys.extend(d.keys())
    keys = uniqueList(keys)
    if removeAdds:
        # The writer ignores keys that are not in the header
        # (extrasaction='ignore'), so dropping them here is sufficient.
        keys = [k for k in keys if k not in _ADDED_FIELDS]
    keys = sorted(keys)
    with open(filename, 'w') as stream:
        dict_writer = csv.DictWriter(stream, keys, extrasaction='ignore')
        dict_writer.writeheader()
        dict_writer.writerows(data)


def loadPick(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE: pickle must only be used on files this script wrote itself.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def savePick(filename, data):
    """Pickle *data* into *filename*, overwriting any existing file."""
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle)


def tempPF(fName, data):
    """Write the string *data* to the file *fName* (debug helper)."""
    with open(fName, 'w') as f__:
        f__.write(data)


def parseHoriscope(people, saveFile):
    """Fetch and parse the compatibility page for every person.

    people   -- iterable of person dicts; each needs an 'ID' and, to be
                processed, a non-empty 'pDOB'.
    saveFile -- pickle path for the collected list, or None to skip saving.

    People without 'pDOB' are recorded as ``{'ID': ...}`` only.  For the
    others the parsed page is stored under ``person['horiscope']`` and its
    entries are also copied onto the person dict itself.  An empty parse is
    retried once.  Returns the collected list.
    """
    horiscopeList = []
    for person in people:
        pDOB = person.get('pDOB')
        if pDOB is None or pDOB == '':
            print('SKIPPING person ' + person['ID'] + ' pDOB is None')
            horiscopeList.append({'ID': person['ID']})
        else:
            print('parsing person ' + person['ID'])
            failures = ('attempt failed, try again', 'attempt two failed')
            for failure in failures:
                url, payload = setURL(person)
                resp = requestURL(url, payload)
                person['horiscope'] = parsePage(resp)
                if person['horiscope']:
                    break
                print(failure)
            person.update(person['horiscope'])
            horiscopeList.append(person)
        # Checkpoint after every person so an interrupted run keeps what was
        # already fetched; the file after the last person is the full list.
        if saveFile is not None:
            savePick(saveFile, horiscopeList)
    return horiscopeList


def printDict(d):
    """Print every ``(key, value)`` pair of *d*, one per line."""
    for d_ in d:
        print((d_, d[d_]))


def refactorHoriscope(hor):
    """Flatten one collected record into a single-level dict.

    hor -- record from parseHoriscope(); 'horiscope' may be absent (skipped
           person) or empty, in which case only 'ID' is returned.

    For every ``(a, b)`` key of the parsed page, with the pair sorted:
      * ``(a, b, <group 1 text>)`` is set to 1, and
      * ``(a, b)`` is set to ``float('<group 2>.<group 3>')``.
    NOTE(review): the float is built by string concatenation, so it is only
    a faithful degrees/minutes value if the page zero-pads the second number
    -- confirm against real data.
    """
    d = {'ID': hor['ID']}
    for h, aspect in (hor.get('horiscope') or {}).items():
        hs = sorted(h)
        d[(hs[0], hs[1], aspect[0])] = 1
        d[(hs[0], hs[1])] = float(str(aspect[1]) + '.' + str(aspect[2]))
    return d


def uniqueList(seq):
    """Return the items of *seq* without duplicates, first occurrence first.

    Uses equality only, so unhashable items are accepted.
    """
    noDupes = []
    for item in seq:
        if item not in noDupes:
            noDupes.append(item)
    return noDupes


def merge_two_dicts(x, y):
    """Return a new dict with x's entries overridden/extended by y's."""
    z = x.copy()
    z.update(y)
    return z


def findMissing(unique, keyList):
    """Return the items of *unique* that do not occur in *keyList*."""
    return [u for u in unique if u not in keyList]


def presentResults(saveFile):
    """Load the collected records from *saveFile* and flatten each one.

    Returns a list with one refactorHoriscope() dict per stored record.
    Keys that are absent from a record are not filled in; callers that need
    uniform columns can combine uniqueList/findMissing/merge_two_dicts.
    """
    return [refactorHoriscope(h) for h in loadPick(saveFile)]


def readInCompatibilityScore(filename):
    """Read a CSV file and return the list of regulated rows.

    NOTE(review): ``regulateData`` is not defined in this file; unless it is
    provided elsewhere this function raises NameError -- confirm where it
    lives (possibly ``dParse``).
    """
    with open(filename, 'rb') as handle:
        return [regulateData(line) for line in csv.DictReader(handle)]


def testMain():
    """Smoke test: parse 'individuals.csv' through dParse."""
    dp.parseCSV('individuals.csv')


def _main():
    """Load (or build and cache) the people list, scrape, then flatten."""
    pickFile = 'outData.pick'
    if not os.path.exists(pickFile):
        print('reParse file')
        people = dp.parseCSV('individuals.csv')
        savePick(pickFile, people)
    else:
        print('read in ' + pickFile)
        people = loadPick(pickFile)
    parseSaveFile = pickFile.split('.')[0] + '_collect.pick'
    parseHoriscope(people, parseSaveFile)
    horiscopeData = presentResults(parseSaveFile)


if __name__ == "__main__":
    _main()