view V3/runme.py @ 11:903559cb34d0

Modify the horoscope parse to ensure that missing people get noted
author DaveM
date Tue, 23 Jan 2018 14:29:54 +0000
parents 85c9aa9d90c5
children 18e337b2550d
line wrap: on
line source
#!/usr/bin/env python
import dParse as dp
import requests
import re
import time
import csv
import random
import pdb
import os
import pickle
from HTMLParser import HTMLParser
# from lxml import html
from bs4 import BeautifulSoup

def parsePage(resp):
	"""Scrape the result table out of a fetched page.

	Every ``div`` with class ``tc`` that holds more than two ``td`` cells
	contributes one entry: the key is the pair of bold (``strong``) labels
	from the first and third cell, the value is the first three groups the
	regex captures from the middle cell.
	NOTE(review): presumably the labels are the two bodies and the groups
	are aspect name, degrees and minutes -- confirm against the site markup.

	Cells that do not have the expected shape (no ``strong`` label, or a
	middle cell the regex does not match) are skipped instead of raising,
	so one odd row cannot abort the whole scrape.

	resp: response object exposing the raw page bytes as ``resp.content``.
	Returns a dict mapping ``(label, label)`` to a 3-tuple of strings; it is
	empty when nothing on the page matched.
	"""
	person = dict()
	soup = BeautifulSoup(resp.content, 'lxml')
	for cell in soup.find_all('div', attrs={'class':'tc'}):
		tableCell = cell.find_all('td')
		if len(tableCell) <= 2:
			continue
		firstLabel = tableCell[0].strong
		secondLabel = tableCell[2].strong
		if firstLabel is None or secondLabel is None:
			continue
		if not firstLabel.contents or not secondLabel.contents:
			continue
		# \xc2\xb0 is the UTF-8 encoding of the degree sign
		A = re.search("\/>(.*)<br/>.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)",str(tableCell[1]))
		if A is None:
			continue
		C = firstLabel.contents[0].encode('utf-8')
		D = secondLabel.contents[0].encode('utf-8')
		person[(C,D)] = (A.group(1),A.group(2),A.group(3))
	return person


def setURL(p):
	"""Return the ``(url, payload)`` pair for one person.

	The url is the fixed compatibility-calculator address; the payload is
	whatever ``dp.makePayload`` builds from ``p``.
	"""
	endpoint = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
	return (endpoint, dp.makePayload(p))

def requestURL(url,payload):
	"""GET ``url`` with ``payload`` as query parameters and return the response.

	Always waits five seconds after the request before returning
	(presumably to avoid hammering the remote site -- confirm).
	"""
	response = requests.get(url, params=payload)
	time.sleep(5)
	return response

def makeURLPayload(url,payload):
	"""Append ``payload`` to ``url`` as a hand-built query string.

	The result is ``url`` + ``?`` followed by ``&key=value`` for every key
	of ``payload`` (so the first pair is also preceded by ``&``).  Keys and
	values are passed through ``str`` and are not URL-escaped.
	"""
	pairs = ['&' + str(name) + '=' + str(payload[name]) for name in payload]
	return url + '?' + ''.join(pairs)

def printToFile(filename,data,removeAdds):
	"""Write a list of dicts to ``filename`` as CSV.

	The header is the sorted union of the keys of all rows; a row that
	lacks a column gets an empty field there.

	filename:   path of the CSV file to (over)write.
	data:       list of dicts, one per output row.
	removeAdds: when true, the columns 'DOB', 'TOB', 'pDOB', 'pTOB', 'COB',
	            'pCOB' and 'horiscope' are left out of the file.  The rows
	            themselves are not modified.
	"""
	dropped = ()
	if removeAdds:
		dropped = ('DOB', 'TOB', 'pDOB', 'pTOB', 'COB', 'pCOB', 'horiscope')
	keys = set()
	for d in data:
		keys.update(d.keys())
	keys = sorted(keys.difference(dropped))
	with open(filename,'w') as stream:
		# extrasaction='ignore' silently drops the columns excluded above
		dict_writer = csv.DictWriter(stream, keys, extrasaction='ignore')
		dict_writer.writeheader()
		dict_writer.writerows(data)

def loadPick(filename):
	"""Unpickle and return the single object stored in ``filename``."""
	with open(filename, 'rb') as pickled:
		return pickle.load(pickled)

def savePick(filename,data):
	"""Pickle ``data`` into ``filename``, replacing any existing file."""
	with open(filename, 'wb') as sink:
		sink.write(pickle.dumps(data))

def tempPF(fName,data):
	"""Write the string ``data`` to the file ``fName``, replacing its contents.

	The file is closed even if the write fails.
	"""
	with open(fName,'w') as f__:
		f__.write(data)

def _fetchHoriscope(person):
	"""Request and parse the page for one person, retrying once if the parse is empty."""
	url,payload = setURL(person)
	horiscope = parsePage(requestURL(url,payload))
	if not horiscope:
		print('attempt failed,  try again')
		url,payload = setURL(person)
		horiscope = parsePage(requestURL(url,payload))
		if not horiscope:
			print('attempt two failed')
	return horiscope


def parseHoriscope(people,saveFile):
	"""Fetch the parsed page for every person and collect the results.

	A person whose 'pDOB' is missing, None or '' is not looked up; a
	placeholder ``{'ID': ...}`` is recorded instead so the person still
	shows up in the output.  For everyone else the parsed page is stored
	under ``person['horiscope']`` and each of its entries is also copied
	onto the person dict itself (the dicts in ``people`` are modified in
	place).

	people:   iterable of dicts, each with an 'ID' string.
	saveFile: path to pickle the list-so-far to after every person
	          (placeholders included), or None to skip saving.
	Returns the list of person dicts / placeholders in input order.
	"""
	horiscopeList = []
	for person in people:
		pDOB = person.get('pDOB')
		if pDOB is None or pDOB == '':
			print('SKIPPING person '+ person['ID'] + ' pDOB is None')
			horiscopeList.append({'ID':person['ID']})
		else:
			print('parsing person '+ person['ID'])
			person['horiscope'] = _fetchHoriscope(person)
			person.update(person['horiscope'])
			horiscopeList.append(person)
		# checkpoint after every person so skipped people reach the file too
		if saveFile is not None:
			savePick(saveFile,horiscopeList)
	return horiscopeList

def printDict(d):
	"""Print every entry of the dict ``d`` as a ``(key, value)`` tuple, one per line."""
	for d_ in d:
		# a single tuple argument prints the same under the print statement
		# and the print function
		print((d_, d[d_]))

def refactorHoriscope(hor):
	"""Flatten one person's parsed entries into a single row dict.

	For every ``(a, b) -> (name, whole, frac)`` entry under
	``hor['horiscope']`` the row gets, with the pair sorted:
	  * ``(a, b, name)`` set to 1, and
	  * ``(a, b)`` set to ``float(whole + '.' + frac)``.
	NOTE(review): the second number is appended as decimal digits, not
	divided by 60 -- confirm that is the intended encoding.

	hor: dict with an 'ID'; 'horiscope' may be absent or empty (as for
	     people that were skipped), in which case only the ID is returned.
	Returns the row dict, always containing 'ID'.
	"""
	d = {'ID': hor['ID']}
	entries = hor.get('horiscope') or {}
	for h, reading in entries.items():
		hs = sorted(h)
		d[(hs[0], hs[1], reading[0])] = 1
		d[(hs[0], hs[1])] = float(str(reading[1]) + '.' + str(reading[2]))
	return d

def uniqueList(seq):
	"""Return the items of ``seq`` with duplicates removed, keeping first-seen order.

	Items are compared with ``==`` rather than hashed, so unhashable items
	such as lists are accepted.
	"""
	noDupes = []
	for i in seq:
		if i not in noDupes:
			noDupes.append(i)
	return noDupes

def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid with those of ``y``.

    Neither argument is modified; on a shared key the value from ``y`` wins.
    """
    merged = x.copy()
    # update() works in place and returns None, hence the separate return
    merged.update(y)
    return merged

def findMissing(unique,keyList):
	"""Return the items of ``unique`` that do not occur in ``keyList``.

	Order follows ``unique``; the result is an empty list when nothing is
	missing or when ``unique`` is empty.
	"""
	return [u for u in unique if u not in keyList]

def presentResults(saveFile):
	"""Load the pickled list in ``saveFile`` and flatten every entry.

	Each entry is passed through ``refactorHoriscope``; the resulting row
	dicts are returned in file order.
	TODO(review): rows are not padded with 0 for keys that only other rows
	have -- an earlier draft did that; confirm whether it is still wanted.
	"""
	return [refactorHoriscope(h) for h in loadPick(saveFile)]

def readInCompatibilityScore(filename):
	"""Read the CSV ``filename`` and return one processed dict per data row.

	Each row is read by ``csv.DictReader`` (the first line supplies the
	field names) and passed through ``regulateData``.
	NOTE(review): ``regulateData`` is not defined or imported in the part
	of this file that is visible here -- confirm where it comes from.
	"""
	# binary mode is what the Python 2 csv module expects
	with open(filename,'rb') as handle:
		return [regulateData(line) for line in csv.DictReader(handle)]
		

def testMain():
	"""Smoke test: run ``dp.parseCSV`` on 'individuals.csv' and discard the result."""
	dp.parseCSV('individuals.csv')

def _main():
	"""Run the whole pipeline from the working directory.

	The people list is loaded from 'outData.pick' when that file exists;
	otherwise it is parsed from 'individuals.csv' and pickled there.  The
	collected results go to 'outData_collect.pick', which is then read
	back and flattened (the flattened rows are currently not used).
	"""
	pickFile = 'outData.pick'
	if os.path.exists(pickFile):
		print('read in ' + pickFile)
		people = loadPick(pickFile)
	else:
		print('reParse file')
		people = dp.parseCSV('individuals.csv')
		savePick(pickFile,people)
	parseSaveFile = pickFile.partition('.')[0]+'_collect.pick'
	parseHoriscope(people,parseSaveFile)
	presentResults(parseSaveFile)
	

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
	_main()