#!/usr/bin/env python
"""Scrape astro-seek love-compatibility charts for the people in a CSV file.

The people are read through ``dParse.parseCSV`` (cached in a pickle file),
one compatibility page is requested per person, the table cells of the
returned page are parsed, and the collected rows are written to a pickle
file and a CSV file.

This script targets Python 2 (``HTMLParser`` import, byte-string regex).
"""
import dParse as dp
import requests
import re
import time
import csv
import random
import pdb
import os
import pickle
from HTMLParser import HTMLParser
# from lxml import html
from bs4 import BeautifulSoup

# Applied to the raw HTML of the middle <td> of each result row.
# group(1): text following a "/>"; group(2) / group(3): the digits before and
# after the UTF-8 encoded degree sign; group(4): whatever trails the digits
# inside the parentheses.
# NOTE(review): the listing this file was restored from wraps in the middle
# of this pattern (between "(.*)" and ".*"); confirm that nothing, e.g. a
# newline escape, was lost at that point.
_POSITION_RE = re.compile("\/>(.*).*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)")

# Columns dropped from the CSV output when ``removeAdds`` is requested.
_ADDITIONAL_KEYS = ('DOB', 'TOB', 'pDOB', 'pTOB', 'COB', 'pCOB', 'horiscope')


def parsePage(resp):
    """Extract the table rows of a compatibility result page.

    resp: response object whose ``content`` attribute holds the page HTML.

    Returns a dict keyed by ``(C, D)`` - the UTF-8 encoded first child of the
    ``<strong>`` element in the first and third ``<td>`` of each
    ``<div class="tc">`` - mapping to the first three groups matched by
    ``_POSITION_RE`` in the second ``<td>``.  Cells that do not have that
    shape are skipped, so the dict is empty when nothing could be parsed
    (the caller uses that to decide on a retry).
    """
    person = {}
    soup = BeautifulSoup(resp.content, 'lxml')
    for cell in soup.find_all('div', attrs={'class': 'tc'}):
        tableCell = cell.find_all('td')
        if len(tableCell) <= 2:
            continue
        first = tableCell[0].strong
        last = tableCell[2].strong
        # A cell without a populated <strong> used to raise AttributeError /
        # IndexError and abort the whole run; skip it instead.
        if first is None or last is None:
            continue
        if not first.contents or not last.contents:
            continue
        match = _POSITION_RE.search(str(tableCell[1]))
        if match is None:
            # Middle cell does not look like "... (<deg><degree sign><min>...)".
            continue
        C = first.contents[0].encode('utf-8')
        D = last.contents[0].encode('utf-8')
        person[(C, D)] = (match.group(1), match.group(2), match.group(3))
    return person


def setURL(p):
    """Build the request URL and query payload for one person.

    Code impacting factors into URL.

    IMPACTING FACTORS
        Date of Birth
        Birth Time
        Country of birth
        City of birth (And state of birth)

    p: person record, passed unchanged to ``dParse.makePayload``.

    Returns a ``(url, payload)`` tuple.
    """
    ## For some reason we need to post men first then women.
    # Example of a complete request URL:
    # url = "https://horoscopes.astro-seek.com/calculate-love-compatibility/?send_calculation=1&muz_narozeni_den=1&muz_narozeni_mesic=1&muz_narozeni_rok=1970&muz_narozeni_hodina=00&muz_narozeni_minuta=00&muz_narozeni_city=London%2C+United+Kingdom&muz_narozeni_mesto_hidden=London&muz_narozeni_stat_hidden=GB&muz_narozeni_podstat_kratky_hidden=England&muz_narozeni_podstat_hidden=England&muz_narozeni_podstat2_kratky_hidden=Greater+London&muz_narozeni_podstat3_kratky_hidden=undefined&muz_narozeni_input_hidden=&muz_narozeni_sirka_stupne=51&muz_narozeni_sirka_minuty=30&muz_narozeni_sirka_smer=0&muz_narozeni_delka_stupne=0&muz_narozeni_delka_minuty=8&muz_narozeni_delka_smer=1&muz_narozeni_timezone_form=auto&muz_narozeni_timezone_dst_form=auto&send_calculation=1&zena_narozeni_den=1&zena_narozeni_mesic=1&zena_narozeni_rok=1970&zena_narozeni_hodina=00&zena_narozeni_minuta=00&zena_narozeni_city=Berlin%2C+Germany&zena_narozeni_mesto_hidden=Berlin&zena_narozeni_stat_hidden=DE&zena_narozeni_podstat_kratky_hidden=Berlin&zena_narozeni_podstat_hidden=Berlin&zena_narozeni_podstat2_kratky_hidden=undefined&zena_narozeni_podstat3_kratky_hidden=undefined&zena_narozeni_input_hidden=&zena_narozeni_sirka_stupne=52&zena_narozeni_sirka_minuty=31&zena_narozeni_sirka_smer=0&zena_narozeni_delka_stupne=13&zena_narozeni_delka_minuty=24&zena_narozeni_delka_smer=0&zena_narozeni_timezone_form=auto&zena_narozeni_timezone_dst_form=auto&switch_interpretations=0&house_system=placidus&uhel_orbis=#tabs_redraw"
    url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
    payload = dp.makePayload(p)
    return (url, payload)


def requestURL(url, payload, delay=5, timeout=None):
    """GET ``url`` with ``payload`` as query parameters, then pause.

    url: address to request.
    payload: mapping handed to ``requests.get`` as ``params``.
    delay: seconds to sleep after the request (default 5, as before) so
        consecutive requests are spaced out.
    timeout: forwarded to ``requests.get``; ``None`` keeps the previous
        behaviour of waiting indefinitely.

    Returns the ``requests`` response object.
    """
    r = requests.get(url, params=payload, timeout=timeout)
    time.sleep(delay)
    return r


def makeURLPayload(url, payload):
    """Return ``url`` with ``payload`` appended as a query string.

    Every item is appended as ``&key=value`` after a single ``?``; keys and
    values are converted with ``str`` and are NOT URL-encoded.
    """
    pairs = ['&' + str(key) + '=' + str(payload[key]) for key in payload]
    return url + '?' + ''.join(pairs)


def printToFile(filename, data, removeAdds=False):
    """Write a list of dict rows to ``filename`` as CSV.

    filename: path of the CSV file to create or overwrite.
    data: sequence of dicts, one per output row.  The rows are not modified.
    removeAdds: when true, the columns listed in ``_ADDITIONAL_KEYS`` are
        left out of the output.

    The header is the union of the keys of all rows in first-seen order, so
    rows may carry different keys; missing values are written as empty
    fields.  With no rows an empty file is produced.
    """
    dropped = _ADDITIONAL_KEYS if removeAdds else ()
    rows = [
        {key: value for key, value in row.items() if key not in dropped}
        for row in data
    ]

    keys = []
    seen = set()
    for row in rows:
        for key in row:
            if key not in seen:
                seen.add(key)
                keys.append(key)

    with open(filename, 'w') as stream:
        if not rows:
            return
        dict_writer = csv.DictWriter(stream, keys, restval='')
        dict_writer.writeheader()
        dict_writer.writerows(rows)


def loadPick(filename):
    """Return the object unpickled from ``filename``.

    NOTE: unpickling can execute arbitrary code; only load files this script
    wrote itself.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def savePick(filename, data):
    """Pickle ``data`` into ``filename``, replacing any existing file."""
    with open(filename, 'wb') as handle:
        pickle.dump(data, handle)


def tempPF(fName, data):
    """Write the string ``data`` to ``fName`` (debugging helper)."""
    with open(fName, 'w') as stream:
        stream.write(data)


def testMain():
    """Parse ``individuals.csv`` through ``dParse`` and return the result."""
    return dp.parseCSV('individuals.csv')


def _fetchHoriscope(person):
    """Request the result page for ``person`` and return its parsed rows."""
    url, payload = setURL(person)
    return parsePage(requestURL(url, payload))


def _main():
    """Scrape every person that has a ``pDOB`` and write the results.

    Files used (all relative to the working directory):
        individuals.csv     input, read when no cache exists yet
        outData.pick        cache of the people list, refreshed as we go
        2outData.pick       pickle of the rows that were scraped
        final_outData.csv   CSV of the rows that were scraped
    """
    pickFile = 'outData.pick'
    # NOTE(review): the original referenced an undefined ``outFile``; this
    # name mirrors ``pickFile`` - adjust if another file name was intended.
    outFile = 'outData.csv'

    if not os.path.exists(pickFile):
        print('reParse file')
        people = dp.parseCSV('individuals.csv')
        savePick(pickFile, people)
    else:
        print('read in ' + pickFile)
        people = loadPick(pickFile)

    horiscopeList = []
    for person in people:
        if person['pDOB'] is None or person['pDOB'] == '':
            print('SKIPPING person ' + person['ID'] + ' pDOB is None')
            continue

        print('parsing person ' + person['ID'])
        person['horiscope'] = _fetchHoriscope(person)
        if not person['horiscope']:  # empty dict: nothing could be parsed
            print('attempt failed, try again')
            person['horiscope'] = _fetchHoriscope(person)
            if not person['horiscope']:
                print('attempt two failed')

        # Flatten the parsed rows into the person record itself.
        person.update(person['horiscope'])
        horiscopeList.append(person)
        # Checkpoint the whole list.  Saving only ``person`` here replaced
        # the cache with a single dict and broke the next run.
        savePick(pickFile, people)

    print(horiscopeList)
    savePick(pickFile, people)
    savePick('2' + pickFile, horiscopeList)
    printToFile('final_' + outFile, horiscopeList)


if __name__ == "__main__":
    _main()