changeset 6:3d5ca8e78f8f

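update to pDOB parsing, fixing some bugs: correct the slice indices used for separator-less dates, swap day and month when the month value exceeds 12 and the day does not, and build the request payload from each person's parsed data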
author DaveM
date Mon, 18 Dec 2017 17:41:16 +0000
parents 73cf5cabef86
children f3a7cfc52104
files V2/dParse.py V2/runme.py timesheet.xlsx
diffstat 3 files changed, 176 insertions(+), 52 deletions(-) [+]
line wrap: on
line diff
--- a/V2/dParse.py	Mon Dec 11 13:23:54 2017 +0000
+++ b/V2/dParse.py	Mon Dec 18 17:41:16 2017 +0000
@@ -5,6 +5,7 @@
 import unicodedata
 from geopy.geocoders import Nominatim
 from geopy.exc import GeocoderTimedOut
+import random
 import pdb
 
 DEFAULT_TIME_H = 12
@@ -34,8 +35,11 @@
 	dataDict['TOB'] = parseTOB(dataDict[TOB_Q])
 	dataDict['pDOB'] = parsePartnerDOB(dataDict[p_DOBQ])
 	dataDict['pTOB'] = parseTOB(dataDict[p_TOBQ])
-	# dataDict['COB'] = parseBirthTown(dataDict[COB])
-	# dataDict['pCOB'] = parseBirthTown(dataDict[p_COB])
+	# MAKE RANDOM PLACE
+	# dataDict['COB'] = (random.uniform(-90, 90),random.uniform(-90, 90))
+	# dataDict['pCOB'] = (random.uniform(-90, 90),random.uniform(-90, 90))
+	dataDict['COB'] = parseBirthTown(dataDict[COB])
+	dataDict['pCOB'] = parseBirthTown(dataDict[p_COB])
 	return dataDict
 
 
@@ -62,17 +66,16 @@
 					# print s
 				else:
 					s = DEAULT_LOCATION
-		except GeocoderTimedOut as e:
+		except:
 			timeoutTime += 1
-			print("Error: geocode failed on input %s with message %s, incrementing timeout time to %d"%(s, e.msg,timeoutTime))
+			print("Error: geocode failed on input %s, incrementing timeout time to %d"%(s,timeoutTime))
 			time.sleep(5)
 			geolocator = Nominatim(timeout=timeoutTime)
-
-
-
 	# places = geograpy.get_place_context(text=s)
 
 def parsePartnerDOB(dob):
+	# print dob
+	# pdb.set_trace()
 	dob = dob.strip()
 	if(dob.count('-') == 2):
 		dob = dob.replace('-','/')
@@ -85,28 +88,33 @@
 		# print dob
 		if len(dob) == 8: # ddmmyyyy
 			dob_.append(dob[:2])
-			dob_.append(dob[2:3])
+			dob_.append(dob[2:4])
 			dob_.append(dob[4:])
 		elif len(dob) == 7 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyyyy
 			dob_.append(dob[0])
-			dob_.append(dob[1:2])
-			dob_.append(dob[4:])
+			dob_.append(dob[1:3])
+			dob_.append(dob[3:])
+		elif(len(dob) == 7):
+			if int(dob[:2]) > 31:# dmmyyyy
+				dob_.append(dob[0])
+				dob_.append(dob[1:3])
+				dob_.append(dob[3:])
 		elif len(dob) == 7: # ddmyyyy
-			dob_.append(dob[0:1])
+			dob_.append(dob[0:2])
 			dob_.append(dob[2])
-			dob_.append(dob[4:])
+			dob_.append(dob[3:])
 		elif len(dob) == 6 and dob[3:4] != '19': # ddmmyy
 			dob_.append(dob[:2])
-			dob_.append(dob[2:3])
-			dob_.append(dob[2:])
+			dob_.append(dob[2:4])
+			dob_.append(dob[4:])
 		elif len(dob) == 5 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyy
 			dob_.append(dob[0])
-			dob_.append(dob[1:2])
-			dob_.append(dob[2:])
+			dob_.append(dob[1:3])
+			dob_.append(dob[3:])
 		elif len(dob) == 5: # ddmyy
 			dob_.append(dob[:2])
 			dob_.append(dob[2])
-			dob_.append(dob[2:])
+			dob_.append(dob[3:])
 		elif len(dob) == 4: # dmyy
 			dob_.append(dob[0])
 			dob_.append(dob[1])
@@ -114,7 +122,7 @@
 		else:
 			if(len(dob) < 4):
 				return None
-			print dob
+			# print dob
 			# print filter(lambda x: x.isdigit(),dob)
 			print 'no / partnerDOB issue'
 		# deal with no /'s
@@ -124,10 +132,16 @@
 		y = int(filter(lambda x: x.isdigit(),dob_[2]))
 		if y < 100:
 			y = y + 1900
-		if(d > 31 or m > 12 or y > 2017 or y < 1900):
+		if (m > 12 and d <= 12):
+			temp = d
+			d = m
+			m = temp 
+		if(d > 31 or d < 1 or m > 12 or m < 1 or y > 2017 or y < 1900):
 			print 'error with DOB '+d+'/'+m+'/'+y
+			pdb.set_trace()
 	except TypeError:
 		return None
+	# print  (d,m,y)
 	return (d,m,y)
 
 def monthStringToNum(s):
@@ -198,7 +212,7 @@
 	try:
 		if ':' in T:
 			T_ = T.split(':')
-			# pdb.set_trace()
+			
 			H = int(T_[0])
 			M = int(T_[1])
 		else:
@@ -221,5 +235,76 @@
 		M = DEFAULT_TIME_M
 	return (H,M)
 
+def makePayload(dataDict):
+	if type(dataDict['COB']) is str:
+		cob_0 = float(dataDict['COB'].split(',')[0][1:])
+		cob_1 = float(dataDict['COB'].split(',')[1])
+		dataDict['COB'] = (cob_0,cob_1)
+	if type(dataDict['pCOB']) is str:
+		pcob_0 = float(dataDict['pCOB'].split(',')[0][1:])
+		pcob_1 = float(dataDict['pCOB'].split(',')[1])
+		dataDict['pCOB'] = (pcob_0,pcob_1)
+	if type(dataDict['DOB']) is str:
+		dataDict['DOB'] = dataDict['DOB'][1:-1].split(',')
+	if type(dataDict['pDOB']) is str:
+		dataDict['pDOB'] = dataDict['pDOB'][1:-1].split(',')
+	if type(dataDict['TOB']) is str:
+		dataDict['TOB'] = dataDict['TOB'][1:-1].split(',')
+	if type(dataDict['pTOB']) is str:
+		dataDict['pTOB'] = dataDict['pTOB'][1:-1].split(',')
+	# pdb.set_trace()
 
+	print dataDict['pDOB']
 
+	R = {'send_calculation':'1', #Req
+		'muz_narozeni_den':dataDict['DOB'][0],
+		'muz_narozeni_mesic':dataDict['DOB'][1],
+		'muz_narozeni_rok':dataDict['DOB'][2],
+		'muz_narozeni_hodina':dataDict['TOB'][0],
+		'muz_narozeni_minuta':dataDict['TOB'][1],
+		'muz_narozeni_city':'',
+		'muz_narozeni_mesto_hidden':'Manually+place%3A+%C2%B0%27N%2C+%C2%B0%27E',#auto
+		'muz_narozeni_stat_hidden':'XX',
+		'muz_narozeni_podstat_kratky_hidden':'',
+		'muz_narozeni_podstat_hidden':'',
+		'muz_narozeni_podstat2_kratky_hidden':'',
+		'muz_narozeni_podstat3_kratky_hidden':'',
+		'muz_narozeni_input_hidden':'',
+		'muz_narozeni_sirka_stupne':str(abs(dataDict['COB'][0])).split('.')[0],
+		'muz_narozeni_sirka_minuty':str(float('0.'+str(dataDict['COB'][0]).split('.')[1])*60).split('.')[0],
+		'muz_narozeni_sirka_smer': '1' if dataDict['COB'][0]<0 else '0', #address N Dir (0':'N',1':'S)
+		'muz_narozeni_delka_stupne':str(abs(dataDict['COB'][1])).split('.')[0], #address E - Main
+		'muz_narozeni_delka_minuty':str(float('0.'+str(dataDict['COB'][1]).split('.')[1])*60).split('.')[0],
+		'muz_narozeni_delka_smer': '1' if dataDict['COB'][1]<0 else '0', #address E Dir (0':'E',1':'W)
+		'muz_narozeni_timezone_form':'auto',
+		'muz_narozeni_timezone_dst_form':'auto',
+		'send_calculation':'1',
+		'zena_narozeni_den':dataDict['pDOB'][0],
+		'zena_narozeni_mesic':dataDict['pDOB'][1],
+		'zena_narozeni_rok':dataDict['pDOB'][2],
+		'zena_narozeni_hodina':dataDict['pTOB'][0],
+		'zena_narozeni_minuta':dataDict['pTOB'][1],
+		'zena_narozeni_city':'',
+		'zena_narozeni_mesto_hidden':'Manually+place%3A+%C2%B0%27N%2C+%C2%B0%27E',
+		'zena_narozeni_stat_hidden':'XX',
+		'zena_narozeni_podstat_kratky_hidden':'',
+		'zena_narozeni_podstat_hidden':'',
+		'zena_narozeni_podstat2_kratky_hidden':'',
+		'zena_narozeni_podstat3_kratky_hidden':'',
+		'zena_narozeni_input_hidden':'',
+		'zena_narozeni_sirka_stupne':str(abs(dataDict['pCOB'][0])).split('.')[0],
+		'zena_narozeni_sirka_minuty':str(float('0.'+str(dataDict['pCOB'][0]).split('.')[1])*60).split('.')[0],
+		'zena_narozeni_sirka_smer': '1' if dataDict['pCOB'][0]<0 else '0',
+		'zena_narozeni_delka_stupne':str(abs(dataDict['pCOB'][1])).split('.')[0],
+		'zena_narozeni_delka_minuty':str(float('0.'+str(dataDict['pCOB'][1]).split('.')[1])*60).split('.')[0],
+		'zena_narozeni_delka_smer': '1' if dataDict['pCOB'][1]<0 else '0',
+		'zena_narozeni_timezone_form':'auto',
+		'zena_narozeni_timezone_dst_form':'auto',
+		'switch_interpretations':'0',
+		'house_system':'placidus',
+		'uhel_orbis':'#tabs_redraw'}
+	return R
+
+
+
+
--- a/V2/runme.py	Mon Dec 11 13:23:54 2017 +0000
+++ b/V2/runme.py	Mon Dec 18 17:41:16 2017 +0000
@@ -5,27 +5,31 @@
 import time
 import csv
 import random
+import pdb
+import os
+import pickle
 from HTMLParser import HTMLParser
 # from lxml import html
 from bs4 import BeautifulSoup
 
 def parsePage(resp):
+	# pdb.set_trace()
+	person = dict()
 	soup = BeautifulSoup(resp.content, 'lxml')
 	tcCell = soup.find_all('div', attrs={'class':'tc'})
 	for cell in tcCell:
-		person = dict()
 		tableCell = cell.find_all('td');
 		if len(tableCell) > 2:
 			C = tableCell[0].strong.contents[0].encode('utf-8')
 			D = tableCell[2].strong.contents[0].encode('utf-8')
-			print (C,D)
+			# print (C,D)
 			A = re.search("\/>(.*)<br/>.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)",str(tableCell[1]))
 			# A0 = A.group(1)
 			# A1 = A.group(2).split('\xc2\xb0')[0]
 			# A2 = A.group(2).split('\xc2\xb0')[1].split('\xe2')[0]
-			print (A.group(1),A.group(2),A.group(3))
+			# print (A.group(1),A.group(2),A.group(3))
 			person[(C,D)] = (A.group(1),A.group(2),A.group(3))
-		return person
+	return person
 
 
 def setURL(p):
@@ -41,14 +45,8 @@
 	# url = "https://horoscopes.astro-seek.com/calculate-love-compatibility/?send_calculation=1&muz_narozeni_den=1&muz_narozeni_mesic=1&muz_narozeni_rok=1970&muz_narozeni_hodina=00&muz_narozeni_minuta=00&muz_narozeni_city=London%2C+United+Kingdom&muz_narozeni_mesto_hidden=London&muz_narozeni_stat_hidden=GB&muz_narozeni_podstat_kratky_hidden=England&muz_narozeni_podstat_hidden=England&muz_narozeni_podstat2_kratky_hidden=Greater+London&muz_narozeni_podstat3_kratky_hidden=undefined&muz_narozeni_input_hidden=&muz_narozeni_sirka_stupne=51&muz_narozeni_sirka_minuty=30&muz_narozeni_sirka_smer=0&muz_narozeni_delka_stupne=0&muz_narozeni_delka_minuty=8&muz_narozeni_delka_smer=1&muz_narozeni_timezone_form=auto&muz_narozeni_timezone_dst_form=auto&send_calculation=1&zena_narozeni_den=1&zena_narozeni_mesic=1&zena_narozeni_rok=1970&zena_narozeni_hodina=00&zena_narozeni_minuta=00&zena_narozeni_city=Berlin%2C+Germany&zena_narozeni_mesto_hidden=Berlin&zena_narozeni_stat_hidden=DE&zena_narozeni_podstat_kratky_hidden=Berlin&zena_narozeni_podstat_hidden=Berlin&zena_narozeni_podstat2_kratky_hidden=undefined&zena_narozeni_podstat3_kratky_hidden=undefined&zena_narozeni_input_hidden=&zena_narozeni_sirka_stupne=52&zena_narozeni_sirka_minuty=31&zena_narozeni_sirka_smer=0&zena_narozeni_delka_stupne=13&zena_narozeni_delka_minuty=24&zena_narozeni_delka_smer=0&zena_narozeni_timezone_form=auto&zena_narozeni_timezone_dst_form=auto&switch_interpretations=0&house_system=placidus&uhel_orbis=#tabs_redraw"
 	# payload = {'muz_narozeni_den':'1','muz_narozeni_mesic':'1','muz_narozeni_rok':'1970'}
 	url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
-	mDay = random.randint(1,29)
-	mMonth = random.randint(1,12)
-	mYear = random.randint(1,100)+1917
-	fDay = random.randint(1,29)
-	fMonth = random.randint(1,12)
-	fYear = random.randint(1,100)+1917
-
-	payload = {'?send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear}
+	# payload = {'send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear}
+	payload = dp.makePayload(p)
 	return (url,payload)
 
 def requestURL(url,payload):
@@ -56,40 +54,81 @@
 	time.sleep(5)
 	return r
 
-# def parseCSV(filename):
-# 	stream = csv.DictReader(open(filename,'rb'))
-# 	dictList = []
-# 	for line in stream:
-# 		dictList.append(regulateData(line))
+def makeURLPayload(url,payload):
+	url += '?'
+	for p in payload:
+		url += '&' + str(p)
+		url += '=' + str(payload[p])
+	return url
 
-# 	# dictList = headerParse(dictList)
-# 	# dictList = validateData(dictList)
-# 	return dictList
-
-def printToFile(filename,data):
+def printToFile(filename,data,removeAdds):
+	if removeAdds == True:
+		del data['DOB']
+		del data['TOB']
+		del data['pDOB']
+		del data['pTOB']
+		del data['COB']
+		del data['pCOB']
+		del data['horiscope']
 	keys = data[0].keys()
 	with open(filename,'w') as stream:
 		dict_writer = csv.DictWriter(stream, keys)
 		dict_writer.writeheader()
 		dict_writer.writerows(data)
 
+def loadPick(filename):
+	with open(filename, 'rb') as handle:
+		b = pickle.load(handle)
+	return b
+
+def savePick(filename,data):
+	with open(filename, 'wb') as handle:
+		pickle.dump(data,handle)
+
+def tempPF(fName,data):
+	f__ = open(fName,'w')
+	f__.write(data)
+	f__.close()
+
 def testMain():
 	people = dp.parseCSV('individuals.csv')
 
 def _main():
+	pickFile = 'outData.pick'
 	# people = dict()
-	people = dp.parseCSV('individuals.csv')
+	if not os.path.exists(pickFile):
+		print 'reParse file'
+		people = dp.parseCSV('individuals.csv')
+		savePick(pickFile,people)
+	else:
+		print 'read in ' + pickFile
+		people = loadPick(pickFile)
+
 	horiscopeList = []
-	# people = [1,2,3,4,5]
 	for person in people:
-		print 'parsing person '+ person['ID']
-		url,payload = setURL('')
-		resp = requestURL(url,payload)
-		
-		person['horiscope'] = parsePage(resp)
-		horiscopeList.append(person)
+		if person['pDOB'] is None or person['pDOB'] == '':
+			print 'SKIPPING person '+ person['ID'] + ' pDOB is None'
+		else:
+			print 'parsing person '+ person['ID']
+			url,payload = setURL(person)
+			resp = requestURL(url,payload)
+			person['horiscope'] = parsePage(resp)
+			if not person['horiscope']: # debug if dict is empty
+				print 'attempt failed,  try again'
+				url,payload = setURL(person)
+				resp = requestURL(url,payload)
+				person['horiscope'] = parsePage(resp)
+				if not person['horiscope']:
+					print 'attempt two failed'
+					# pdb.set_trace()
+			for d in person['horiscope'].keys():
+				person[d] = person['horiscope'][d]
+			horiscopeList.append(person)
+			savePick(pickFile,person)
 	print horiscopeList
-	printToFile('outputdata.csv',horiscopeList)
+	savePick(pickFile,person)
+	savePick('2'+pickFile,horiscopeList)
+	printToFile('final_'+outFile,horiscopeList)
 
 if __name__ == "__main__":
-	testMain()
+	_main()
Binary file timesheet.xlsx has changed