# HG changeset patch # User DaveM # Date 1513618876 0 # Node ID 3d5ca8e78f8fe5dc5c9e91ffb81a3e35dae417dd # Parent 73cf5cabef8694f4e6ce722ae45abe5efaa5c23f update to pDOB parsing, fixing some bugs diff -r 73cf5cabef86 -r 3d5ca8e78f8f V2/dParse.py --- a/V2/dParse.py Mon Dec 11 13:23:54 2017 +0000 +++ b/V2/dParse.py Mon Dec 18 17:41:16 2017 +0000 @@ -5,6 +5,7 @@ import unicodedata from geopy.geocoders import Nominatim from geopy.exc import GeocoderTimedOut +import random import pdb DEFAULT_TIME_H = 12 @@ -34,8 +35,11 @@ dataDict['TOB'] = parseTOB(dataDict[TOB_Q]) dataDict['pDOB'] = parsePartnerDOB(dataDict[p_DOBQ]) dataDict['pTOB'] = parseTOB(dataDict[p_TOBQ]) - # dataDict['COB'] = parseBirthTown(dataDict[COB]) - # dataDict['pCOB'] = parseBirthTown(dataDict[p_COB]) + # MAKE RANDOM PLACE + # dataDict['COB'] = (random.uniform(-90, 90),random.uniform(-90, 90)) + # dataDict['pCOB'] = (random.uniform(-90, 90),random.uniform(-90, 90)) + dataDict['COB'] = parseBirthTown(dataDict[COB]) + dataDict['pCOB'] = parseBirthTown(dataDict[p_COB]) return dataDict @@ -62,17 +66,16 @@ # print s else: s = DEAULT_LOCATION - except GeocoderTimedOut as e: + except: timeoutTime += 1 - print("Error: geocode failed on input %s with message %s, incrementing timeout time to %d"%(s, e.msg,timeoutTime)) + print("Error: geocode failed on input %s, incrementing timeout time to %d"%(s,timeoutTime)) time.sleep(5) geolocator = Nominatim(timeout=timeoutTime) - - - # places = geograpy.get_place_context(text=s) def parsePartnerDOB(dob): + # print dob + # pdb.set_trace() dob = dob.strip() if(dob.count('-') == 2): dob = dob.replace('-','/') @@ -85,28 +88,33 @@ # print dob if len(dob) == 8: # ddmmyyyy dob_.append(dob[:2]) - dob_.append(dob[2:3]) + dob_.append(dob[2:4]) dob_.append(dob[4:]) elif len(dob) == 7 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyyyy dob_.append(dob[0]) - dob_.append(dob[1:2]) - dob_.append(dob[4:]) + dob_.append(dob[1:3]) + dob_.append(dob[3:]) + elif(len(dob) == 7): + if int(dob[:2]) > 31:# dmmyyyy + dob_.append(dob[0]) + dob_.append(dob[1:3]) + dob_.append(dob[3:]) elif len(dob) == 7: # ddmyyyy - dob_.append(dob[0:1]) + dob_.append(dob[0:2]) dob_.append(dob[2]) - dob_.append(dob[4:]) + dob_.append(dob[3:]) elif len(dob) == 6 and dob[3:4] != '19': # ddmmyy dob_.append(dob[:2]) - dob_.append(dob[2:3]) - dob_.append(dob[2:]) + dob_.append(dob[2:4]) + dob_.append(dob[4:]) elif len(dob) == 5 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyy dob_.append(dob[0]) - dob_.append(dob[1:2]) - dob_.append(dob[2:]) + dob_.append(dob[1:3]) + dob_.append(dob[3:]) elif len(dob) == 5: # ddmyy dob_.append(dob[:2]) dob_.append(dob[2]) - dob_.append(dob[2:]) + dob_.append(dob[3:]) elif len(dob) == 4: # dmyy dob_.append(dob[0]) dob_.append(dob[1]) @@ -114,7 +122,7 @@ else: if(len(dob) < 4): return None - print dob + # print dob # print filter(lambda x: x.isdigit(),dob) print 'no / partnerDOB issue' # deal with no /'s @@ -124,10 +132,16 @@ y = int(filter(lambda x: x.isdigit(),dob_[2])) if y < 100: y = y + 1900 - if(d > 31 or m > 12 or y > 2017 or y < 1900): + if (m > 12 and d <= 12): + temp = d + d = m + m = temp + if(d > 31 or d < 1 or m > 12 or m < 1 or y > 2017 or y < 1900): print 'error with DOB '+d+'/'+m+'/'+y + pdb.set_trace() except TypeError: return None + # print (d,m,y) return (d,m,y) def monthStringToNum(s): @@ -198,7 +212,7 @@ try: if ':' in T: T_ = T.split(':') - # pdb.set_trace() + H = int(T_[0]) M = int(T_[1]) else: @@ -221,5 +235,76 @@ M = DEFAULT_TIME_M return (H,M) +def makePayload(dataDict): + if type(dataDict['COB']) is str: + cob_0 = float(dataDict['COB'].split(',')[0][1:]) + cob_1 = float(dataDict['COB'].split(',')[1]) + dataDict['COB'] = (cob_0,cob_1) + if type(dataDict['pCOB']) is str: + pcob_0 = float(dataDict['pCOB'].split(',')[0][1:]) + pcob_1 = float(dataDict['pCOB'].split(',')[1]) + dataDict['pCOB'] = (pcob_0,pcob_1) + if type(dataDict['DOB']) is str: + dataDict['DOB'] = dataDict['DOB'][1:-1].split(',') + if type(dataDict['pDOB']) is str: + dataDict['pDOB'] = dataDict['pDOB'][1:-1].split(',') + if type(dataDict['TOB']) is str: + dataDict['TOB'] = dataDict['TOB'][1:-1].split(',') + if type(dataDict['pTOB']) is str: + dataDict['pTOB'] = dataDict['pTOB'][1:-1].split(',') + # pdb.set_trace() + print dataDict['pDOB'] + R = {'send_calculation':'1', #Req + 'muz_narozeni_den':dataDict['DOB'][0], + 'muz_narozeni_mesic':dataDict['DOB'][1], + 'muz_narozeni_rok':dataDict['DOB'][2], + 'muz_narozeni_hodina':dataDict['TOB'][0], + 'muz_narozeni_minuta':dataDict['TOB'][1], + 'muz_narozeni_city':'', + 'muz_narozeni_mesto_hidden':'Manually+place%3A+%C2%B0%27N%2C+%C2%B0%27E',#auto + 'muz_narozeni_stat_hidden':'XX', + 'muz_narozeni_podstat_kratky_hidden':'', + 'muz_narozeni_podstat_hidden':'', + 'muz_narozeni_podstat2_kratky_hidden':'', + 'muz_narozeni_podstat3_kratky_hidden':'', + 'muz_narozeni_input_hidden':'', + 'muz_narozeni_sirka_stupne':str(abs(dataDict['COB'][0])).split('.')[0], + 'muz_narozeni_sirka_minuty':str(float('0.'+str(dataDict['COB'][0]).split('.')[1])*60).split('.')[0], + 'muz_narozeni_sirka_smer': '1' if dataDict['COB'][0]<0 else '0', #address N Dir (0':'N',1':'S) + 'muz_narozeni_delka_stupne':str(abs(dataDict['COB'][1])).split('.')[0], #address E - Main + 'muz_narozeni_delka_minuty':str(float('0.'+str(dataDict['COB'][1]).split('.')[1])*60).split('.')[0], + 'muz_narozeni_delka_smer': '1' if dataDict['COB'][1]<0 else '0', #address E Dir (0':'E',1':'W) + 'muz_narozeni_timezone_form':'auto', + 'muz_narozeni_timezone_dst_form':'auto', + 'send_calculation':'1', + 'zena_narozeni_den':dataDict['pDOB'][0], + 'zena_narozeni_mesic':dataDict['pDOB'][1], + 'zena_narozeni_rok':dataDict['pDOB'][2], + 'zena_narozeni_hodina':dataDict['pTOB'][0], + 'zena_narozeni_minuta':dataDict['pTOB'][1], + 'zena_narozeni_city':'', + 'zena_narozeni_mesto_hidden':'Manually+place%3A+%C2%B0%27N%2C+%C2%B0%27E', + 'zena_narozeni_stat_hidden':'XX', + 'zena_narozeni_podstat_kratky_hidden':'', + 'zena_narozeni_podstat_hidden':'', + 'zena_narozeni_podstat2_kratky_hidden':'', + 'zena_narozeni_podstat3_kratky_hidden':'', + 'zena_narozeni_input_hidden':'', + 'zena_narozeni_sirka_stupne':str(abs(dataDict['pCOB'][0])).split('.')[0], + 'zena_narozeni_sirka_minuty':str(float('0.'+str(dataDict['pCOB'][0]).split('.')[1])*60).split('.')[0], + 'zena_narozeni_sirka_smer': '1' if dataDict['pCOB'][0]<0 else '0', + 'zena_narozeni_delka_stupne':str(abs(dataDict['pCOB'][1])).split('.')[0], + 'zena_narozeni_delka_minuty':str(float('0.'+str(dataDict['pCOB'][1]).split('.')[1])*60).split('.')[0], + 'zena_narozeni_delka_smer': '1' if dataDict['pCOB'][1]<0 else '0', + 'zena_narozeni_timezone_form':'auto', + 'zena_narozeni_timezone_dst_form':'auto', + 'switch_interpretations':'0', + 'house_system':'placidus', + 'uhel_orbis':'#tabs_redraw'} + return R + + + + diff -r 73cf5cabef86 -r 3d5ca8e78f8f V2/runme.py --- a/V2/runme.py Mon Dec 11 13:23:54 2017 +0000 +++ b/V2/runme.py Mon Dec 18 17:41:16 2017 +0000 @@ -5,27 +5,31 @@ import time import csv import random +import pdb +import os +import pickle from HTMLParser import HTMLParser # from lxml import html from bs4 import BeautifulSoup def parsePage(resp): + # pdb.set_trace() + person = dict() soup = BeautifulSoup(resp.content, 'lxml') tcCell = soup.find_all('div', attrs={'class':'tc'}) for cell in tcCell: - person = dict() tableCell = cell.find_all('td'); if len(tableCell) > 2: C = tableCell[0].strong.contents[0].encode('utf-8') D = tableCell[2].strong.contents[0].encode('utf-8') - print (C,D) + # print (C,D) A = re.search("\/>(.*)
.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)",str(tableCell[1])) # A0 = A.group(1) # A1 = A.group(2).split('\xc2\xb0')[0] # A2 = A.group(2).split('\xc2\xb0')[1].split('\xe2')[0] - print (A.group(1),A.group(2),A.group(3)) + # print (A.group(1),A.group(2),A.group(3)) person[(C,D)] = (A.group(1),A.group(2),A.group(3)) - return person + return person def setURL(p): @@ -41,14 +45,8 @@ # url = "https://horoscopes.astro-seek.com/calculate-love-compatibility/?send_calculation=1&muz_narozeni_den=1&muz_narozeni_mesic=1&muz_narozeni_rok=1970&muz_narozeni_hodina=00&muz_narozeni_minuta=00&muz_narozeni_city=London%2C+United+Kingdom&muz_narozeni_mesto_hidden=London&muz_narozeni_stat_hidden=GB&muz_narozeni_podstat_kratky_hidden=England&muz_narozeni_podstat_hidden=England&muz_narozeni_podstat2_kratky_hidden=Greater+London&muz_narozeni_podstat3_kratky_hidden=undefined&muz_narozeni_input_hidden=&muz_narozeni_sirka_stupne=51&muz_narozeni_sirka_minuty=30&muz_narozeni_sirka_smer=0&muz_narozeni_delka_stupne=0&muz_narozeni_delka_minuty=8&muz_narozeni_delka_smer=1&muz_narozeni_timezone_form=auto&muz_narozeni_timezone_dst_form=auto&send_calculation=1&zena_narozeni_den=1&zena_narozeni_mesic=1&zena_narozeni_rok=1970&zena_narozeni_hodina=00&zena_narozeni_minuta=00&zena_narozeni_city=Berlin%2C+Germany&zena_narozeni_mesto_hidden=Berlin&zena_narozeni_stat_hidden=DE&zena_narozeni_podstat_kratky_hidden=Berlin&zena_narozeni_podstat_hidden=Berlin&zena_narozeni_podstat2_kratky_hidden=undefined&zena_narozeni_podstat3_kratky_hidden=undefined&zena_narozeni_input_hidden=&zena_narozeni_sirka_stupne=52&zena_narozeni_sirka_minuty=31&zena_narozeni_sirka_smer=0&zena_narozeni_delka_stupne=13&zena_narozeni_delka_minuty=24&zena_narozeni_delka_smer=0&zena_narozeni_timezone_form=auto&zena_narozeni_timezone_dst_form=auto&switch_interpretations=0&house_system=placidus&uhel_orbis=#tabs_redraw" # payload = {'muz_narozeni_den':'1','muz_narozeni_mesic':'1','muz_narozeni_rok':'1970'} url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/' - mDay = random.randint(1,29) - mMonth = random.randint(1,12) - mYear = random.randint(1,100)+1917 - fDay = random.randint(1,29) - fMonth = random.randint(1,12) - fYear = random.randint(1,100)+1917 - - payload = {'?send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear} + # payload = {'send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear} + payload = dp.makePayload(p) return (url,payload) def requestURL(url,payload): @@ -56,40 +54,81 @@ time.sleep(5) return r -# def parseCSV(filename): -# stream = csv.DictReader(open(filename,'rb')) -# dictList = [] -# for line in stream: -# dictList.append(regulateData(line)) +def makeURLPayload(url,payload): + url += '?' + for p in payload: + url += '&' + str(p) + url += '=' + str(payload[p]) + return url -# # dictList = headerParse(dictList) -# # dictList = validateData(dictList) -# return dictList - -def printToFile(filename,data): +def printToFile(filename,data,removeAdds): + if removeAdds == True: + del data['DOB'] + del data['TOB'] + del data['pDOB'] + del data['pTOB'] + del data['COB'] + del data['pCOB'] + del data['horiscope'] keys = data[0].keys() with open(filename,'w') as stream: dict_writer = csv.DictWriter(stream, keys) dict_writer.writeheader() dict_writer.writerows(data) +def loadPick(filename): + with open(filename, 'rb') as handle: + b = pickle.load(handle) + return b + +def savePick(filename,data): + with open(filename, 'wb') as handle: + pickle.dump(data,handle) + +def tempPF(fName,data): + f__ = open(fName,'w') + f__.write(data) + f__.close() + def testMain(): people = dp.parseCSV('individuals.csv') def _main(): + pickFile = 'outData.pick' # people = dict() - people = dp.parseCSV('individuals.csv') + if not os.path.exists(pickFile): + print 'reParse file' + people = dp.parseCSV('individuals.csv') + savePick(pickFile,people) + else: + print 'read in ' + pickFile + people = loadPick(pickFile) + horiscopeList = [] - # people = [1,2,3,4,5] for person in people: - print 'parsing person '+ person['ID'] - url,payload = setURL('') - resp = requestURL(url,payload) - - person['horiscope'] = parsePage(resp) - horiscopeList.append(person) + if person['pDOB'] is None or person['pDOB'] == '': + print 'SKIPPING person '+ person['ID'] + ' pDOB is None' + else: + print 'parsing person '+ person['ID'] + url,payload = setURL(person) + resp = requestURL(url,payload) + person['horiscope'] = parsePage(resp) + if not person['horiscope']: # debug if dict is empty + print 'attempt failed, try again' + url,payload = setURL(person) + resp = requestURL(url,payload) + person['horiscope'] = parsePage(resp) + if not person['horiscope']: + print 'attempt two failed' + # pdb.set_trace() + for d in person['horiscope'].keys(): + person[d] = person['horiscope'][d] + horiscopeList.append(person) + savePick(pickFile,person) print horiscopeList - printToFile('outputdata.csv',horiscopeList) + savePick(pickFile,person) + savePick('2'+pickFile,horiscopeList) + printToFile('final_'+outFile,horiscopeList) if __name__ == "__main__": - testMain() + _main() diff -r 73cf5cabef86 -r 3d5ca8e78f8f timesheet.xlsx Binary file timesheet.xlsx has changed