Mercurial > hg > horiscopes
comparison V2/runme.py @ 6:3d5ca8e78f8f
update to pDOB parsing, fixing some bugs
author | DaveM |
---|---|
date | Mon, 18 Dec 2017 17:41:16 +0000 |
parents | 99115e36316b |
children |
comparison
equal
deleted
inserted
replaced
5:73cf5cabef86 | 6:3d5ca8e78f8f |
---|---|
3 import requests | 3 import requests |
4 import re | 4 import re |
5 import time | 5 import time |
6 import csv | 6 import csv |
7 import random | 7 import random |
8 import pdb | |
9 import os | |
10 import pickle | |
8 from HTMLParser import HTMLParser | 11 from HTMLParser import HTMLParser |
9 # from lxml import html | 12 # from lxml import html |
10 from bs4 import BeautifulSoup | 13 from bs4 import BeautifulSoup |
11 | 14 |
12 def parsePage(resp): | 15 def parsePage(resp): |
16 # pdb.set_trace() | |
17 person = dict() | |
13 soup = BeautifulSoup(resp.content, 'lxml') | 18 soup = BeautifulSoup(resp.content, 'lxml') |
14 tcCell = soup.find_all('div', attrs={'class':'tc'}) | 19 tcCell = soup.find_all('div', attrs={'class':'tc'}) |
15 for cell in tcCell: | 20 for cell in tcCell: |
16 person = dict() | |
17 tableCell = cell.find_all('td'); | 21 tableCell = cell.find_all('td'); |
18 if len(tableCell) > 2: | 22 if len(tableCell) > 2: |
19 C = tableCell[0].strong.contents[0].encode('utf-8') | 23 C = tableCell[0].strong.contents[0].encode('utf-8') |
20 D = tableCell[2].strong.contents[0].encode('utf-8') | 24 D = tableCell[2].strong.contents[0].encode('utf-8') |
21 print (C,D) | 25 # print (C,D) |
22 A = re.search("\/>(.*)<br/>.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)",str(tableCell[1])) | 26 A = re.search("\/>(.*)<br/>.*\(([0-9]*)\\xc2\\xb0([0-9]*)(.*)\)",str(tableCell[1])) |
23 # A0 = A.group(1) | 27 # A0 = A.group(1) |
24 # A1 = A.group(2).split('\xc2\xb0')[0] | 28 # A1 = A.group(2).split('\xc2\xb0')[0] |
25 # A2 = A.group(2).split('\xc2\xb0')[1].split('\xe2')[0] | 29 # A2 = A.group(2).split('\xc2\xb0')[1].split('\xe2')[0] |
26 print (A.group(1),A.group(2),A.group(3)) | 30 # print (A.group(1),A.group(2),A.group(3)) |
27 person[(C,D)] = (A.group(1),A.group(2),A.group(3)) | 31 person[(C,D)] = (A.group(1),A.group(2),A.group(3)) |
28 return person | 32 return person |
29 | 33 |
30 | 34 |
31 def setURL(p): | 35 def setURL(p): |
32 """ | 36 """ |
33 Code impacting factors into URL | 37 Code impacting factors into URL |
39 """ | 43 """ |
40 ## For some reason we need to post men first then women. | 44 ## For some reason we need to post men first then women. |
41 # url = "https://horoscopes.astro-seek.com/calculate-love-compatibility/?send_calculation=1&muz_narozeni_den=1&muz_narozeni_mesic=1&muz_narozeni_rok=1970&muz_narozeni_hodina=00&muz_narozeni_minuta=00&muz_narozeni_city=London%2C+United+Kingdom&muz_narozeni_mesto_hidden=London&muz_narozeni_stat_hidden=GB&muz_narozeni_podstat_kratky_hidden=England&muz_narozeni_podstat_hidden=England&muz_narozeni_podstat2_kratky_hidden=Greater+London&muz_narozeni_podstat3_kratky_hidden=undefined&muz_narozeni_input_hidden=&muz_narozeni_sirka_stupne=51&muz_narozeni_sirka_minuty=30&muz_narozeni_sirka_smer=0&muz_narozeni_delka_stupne=0&muz_narozeni_delka_minuty=8&muz_narozeni_delka_smer=1&muz_narozeni_timezone_form=auto&muz_narozeni_timezone_dst_form=auto&send_calculation=1&zena_narozeni_den=1&zena_narozeni_mesic=1&zena_narozeni_rok=1970&zena_narozeni_hodina=00&zena_narozeni_minuta=00&zena_narozeni_city=Berlin%2C+Germany&zena_narozeni_mesto_hidden=Berlin&zena_narozeni_stat_hidden=DE&zena_narozeni_podstat_kratky_hidden=Berlin&zena_narozeni_podstat_hidden=Berlin&zena_narozeni_podstat2_kratky_hidden=undefined&zena_narozeni_podstat3_kratky_hidden=undefined&zena_narozeni_input_hidden=&zena_narozeni_sirka_stupne=52&zena_narozeni_sirka_minuty=31&zena_narozeni_sirka_smer=0&zena_narozeni_delka_stupne=13&zena_narozeni_delka_minuty=24&zena_narozeni_delka_smer=0&zena_narozeni_timezone_form=auto&zena_narozeni_timezone_dst_form=auto&switch_interpretations=0&house_system=placidus&uhel_orbis=#tabs_redraw" | 45 # url = "https://horoscopes.astro-seek.com/calculate-love-compatibility/?send_calculation=1&muz_narozeni_den=1&muz_narozeni_mesic=1&muz_narozeni_rok=1970&muz_narozeni_hodina=00&muz_narozeni_minuta=00&muz_narozeni_city=London%2C+United+Kingdom&muz_narozeni_mesto_hidden=London&muz_narozeni_stat_hidden=GB&muz_narozeni_podstat_kratky_hidden=England&muz_narozeni_podstat_hidden=England&muz_narozeni_podstat2_kratky_hidden=Greater+London&muz_narozeni_podstat3_kratky_hidden=undefined&muz_narozeni_input_hidden=&muz_narozeni_sirka_stupne=51&muz_narozeni_sirka_minuty=30&muz_narozeni_sirka_smer=0&muz_narozeni_delka_stupne=0&muz_narozeni_delka_minuty=8&muz_narozeni_delka_smer=1&muz_narozeni_timezone_form=auto&muz_narozeni_timezone_dst_form=auto&send_calculation=1&zena_narozeni_den=1&zena_narozeni_mesic=1&zena_narozeni_rok=1970&zena_narozeni_hodina=00&zena_narozeni_minuta=00&zena_narozeni_city=Berlin%2C+Germany&zena_narozeni_mesto_hidden=Berlin&zena_narozeni_stat_hidden=DE&zena_narozeni_podstat_kratky_hidden=Berlin&zena_narozeni_podstat_hidden=Berlin&zena_narozeni_podstat2_kratky_hidden=undefined&zena_narozeni_podstat3_kratky_hidden=undefined&zena_narozeni_input_hidden=&zena_narozeni_sirka_stupne=52&zena_narozeni_sirka_minuty=31&zena_narozeni_sirka_smer=0&zena_narozeni_delka_stupne=13&zena_narozeni_delka_minuty=24&zena_narozeni_delka_smer=0&zena_narozeni_timezone_form=auto&zena_narozeni_timezone_dst_form=auto&switch_interpretations=0&house_system=placidus&uhel_orbis=#tabs_redraw" |
42 # payload = {'muz_narozeni_den':'1','muz_narozeni_mesic':'1','muz_narozeni_rok':'1970'} | 46 # payload = {'muz_narozeni_den':'1','muz_narozeni_mesic':'1','muz_narozeni_rok':'1970'} |
43 url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/' | 47 url = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/' |
44 mDay = random.randint(1,29) | 48 # payload = {'send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear} |
45 mMonth = random.randint(1,12) | 49 payload = dp.makePayload(p) |
46 mYear = random.randint(1,100)+1917 | |
47 fDay = random.randint(1,29) | |
48 fMonth = random.randint(1,12) | |
49 fYear = random.randint(1,100)+1917 | |
50 | |
51 payload = {'?send_calculation':'1','muz_narozeni_den':mDay,'muz_narozeni_mesic':mMonth,'muz_narozeni_rok':mYear,'zena_narozeni_den':fDay,'zena_narozeni_mesic':fMonth,'zena_narozeni_rok':fYear} | |
52 return (url,payload) | 50 return (url,payload) |
53 | 51 |
54 def requestURL(url,payload): | 52 def requestURL(url,payload): |
55 r = requests.get(url, params=payload) | 53 r = requests.get(url, params=payload) |
56 time.sleep(5) | 54 time.sleep(5) |
57 return r | 55 return r |
58 | 56 |
59 # def parseCSV(filename): | 57 def makeURLPayload(url,payload): |
60 # stream = csv.DictReader(open(filename,'rb')) | 58 url += '?' |
61 # dictList = [] | 59 for p in payload: |
62 # for line in stream: | 60 url += '&' + str(p) |
63 # dictList.append(regulateData(line)) | 61 url += '=' + str(payload[p]) |
62 return url | |
64 | 63 |
65 # # dictList = headerParse(dictList) | 64 def printToFile(filename,data,removeAdds): |
66 # # dictList = validateData(dictList) | 65 if removeAdds == True: |
67 # return dictList | 66 del data['DOB'] |
68 | 67 del data['TOB'] |
69 def printToFile(filename,data): | 68 del data['pDOB'] |
69 del data['pTOB'] | |
70 del data['COB'] | |
71 del data['pCOB'] | |
72 del data['horiscope'] | |
70 keys = data[0].keys() | 73 keys = data[0].keys() |
71 with open(filename,'w') as stream: | 74 with open(filename,'w') as stream: |
72 dict_writer = csv.DictWriter(stream, keys) | 75 dict_writer = csv.DictWriter(stream, keys) |
73 dict_writer.writeheader() | 76 dict_writer.writeheader() |
74 dict_writer.writerows(data) | 77 dict_writer.writerows(data) |
75 | 78 |
79 def loadPick(filename): | |
80 with open(filename, 'rb') as handle: | |
81 b = pickle.load(handle) | |
82 return b | |
83 | |
84 def savePick(filename,data): | |
85 with open(filename, 'wb') as handle: | |
86 pickle.dump(data,handle) | |
87 | |
88 def tempPF(fName,data): | |
89 f__ = open(fName,'w') | |
90 f__.write(data) | |
91 f__.close() | |
92 | |
76 def testMain(): | 93 def testMain(): |
77 people = dp.parseCSV('individuals.csv') | 94 people = dp.parseCSV('individuals.csv') |
78 | 95 |
79 def _main(): | 96 def _main(): |
97 pickFile = 'outData.pick' | |
80 # people = dict() | 98 # people = dict() |
81 people = dp.parseCSV('individuals.csv') | 99 if not os.path.exists(pickFile): |
100 print 'reParse file' | |
101 people = dp.parseCSV('individuals.csv') | |
102 savePick(pickFile,people) | |
103 else: | |
104 print 'read in ' + pickFile | |
105 people = loadPick(pickFile) | |
106 | |
82 horiscopeList = [] | 107 horiscopeList = [] |
83 # people = [1,2,3,4,5] | |
84 for person in people: | 108 for person in people: |
85 print 'parsing person '+ person['ID'] | 109 if person['pDOB'] is None or person['pDOB'] == '': |
86 url,payload = setURL('') | 110 print 'SKIPPING person '+ person['ID'] + ' pDOB is None' |
87 resp = requestURL(url,payload) | 111 else: |
88 | 112 print 'parsing person '+ person['ID'] |
89 person['horiscope'] = parsePage(resp) | 113 url,payload = setURL(person) |
90 horiscopeList.append(person) | 114 resp = requestURL(url,payload) |
115 person['horiscope'] = parsePage(resp) | |
116 if not person['horiscope']: # debug if dict is empty | |
117 print 'attempt failed, try again' | |
118 url,payload = setURL(person) | |
119 resp = requestURL(url,payload) | |
120 person['horiscope'] = parsePage(resp) | |
121 if not person['horiscope']: | |
122 print 'attempt two failed' | |
123 # pdb.set_trace() | |
124 for d in person['horiscope'].keys(): | |
125 person[d] = person['horiscope'][d] | |
126 horiscopeList.append(person) | |
127 savePick(pickFile,person) | |
91 print horiscopeList | 128 print horiscopeList |
92 printToFile('outputdata.csv',horiscopeList) | 129 savePick(pickFile,person) |
130 savePick('2'+pickFile,horiscopeList) | |
131 printToFile('final_'+outFile,horiscopeList) | |
93 | 132 |
94 if __name__ == "__main__": | 133 if __name__ == "__main__": |
95 testMain() | 134 _main() |