DaveM@2
|
1 #!/usr/bin/env python
|
DaveM@3
|
2 import dParse as dp
|
DaveM@2
|
3 import requests
|
DaveM@2
|
4 import re
|
DaveM@2
|
5 import time
|
DaveM@2
|
6 import csv
|
DaveM@2
|
7 import random
|
DaveM@6
|
8 import pdb
|
DaveM@6
|
9 import os
|
DaveM@6
|
10 import pickle
|
DaveM@2
|
11 from HTMLParser import HTMLParser
|
DaveM@2
|
12 # from lxml import html
|
DaveM@2
|
13 from bs4 import BeautifulSoup
|
DaveM@2
|
14
|
DaveM@2
|
def parsePage(resp):
    """Extract the aspect table from a compatibility result page.

    Every ``<div class="tc">`` in the response is scanned for ``<td>`` cells.
    When a div holds more than two cells, the bold text of the first and
    third cell forms the key and three regex groups taken from the markup of
    the middle cell form the value.

    Args:
        resp: response object exposing the page body as ``resp.content``.

    Returns:
        dict mapping ``(C, D)`` (UTF-8 encoded contents of the first and
        third cells' ``<strong>`` tags) to a 3-tuple of the regex groups
        ``(text before <br/>, digits before the degree sign, digits after
        it)``. Cells that do not have the expected shape are skipped, so
        the dict is empty when nothing on the page matched.
    """
    # Same pattern the non-raw original compiled to; written raw so the
    # backslashes are unambiguous. "\xc2\xb0" is the UTF-8 byte pair of the
    # degree sign -- assumes str(tag) yields UTF-8 bytes (Python 2 bs4).
    pattern = re.compile(r"\/>(.*)<br/>.*\(([0-9]*)\xc2\xb0([0-9]*)(.*)\)")

    person = dict()
    soup = BeautifulSoup(resp.content, 'lxml')
    for cell in soup.find_all('div', attrs={'class': 'tc'}):
        tableCell = cell.find_all('td')
        if len(tableCell) <= 2:
            continue

        left = tableCell[0].strong
        right = tableCell[2].strong
        # A cell without bold text cannot provide a key; skip it instead of
        # raising AttributeError/IndexError part-way through the page.
        if left is None or right is None:
            continue
        if not left.contents or not right.contents:
            continue
        C = left.contents[0].encode('utf-8')
        D = right.contents[0].encode('utf-8')

        A = pattern.search(str(tableCell[1]))
        if A is None:
            # Middle cell is not in the expected "... <br/> ... (NN°NN...)"
            # form; the original crashed here on A.group().
            continue
        person[(C, D)] = (A.group(1), A.group(2), A.group(3))
    return person
|
DaveM@2
|
33
|
DaveM@2
|
34
|
DaveM@2
|
def setURL(p):
    """
    Build the request target for one person's love-compatibility lookup.

    IMPACTING FACTORS (expected to be encoded into the query parameters):
        Date of Birth
        Birth Time
        Country of birth
        City of birth (and state of birth)

    The query parameters themselves are produced by dp.makePayload(p); the
    site's form fields are named like muz_narozeni_den / zena_narozeni_den
    (day), ..._mesic (month), ..._rok (year), plus send_calculation=1.

    Returns a (url, payload) tuple suitable for requestURL().
    """
    # For some reason the site needs the man's fields posted first, then the
    # woman's -- presumably dp.makePayload takes care of that; verify there.
    endpoint = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
    query = dp.makePayload(p)
    return (endpoint, query)
|
DaveM@2
|
51
|
DaveM@2
|
def requestURL(url, payload, delay=5, timeout=None):
    """GET ``url`` with ``payload`` as query parameters, then pause.

    Args:
        url: address to request.
        payload: mapping passed to ``requests.get`` as ``params``.
        delay: seconds to sleep after the request so consecutive calls are
            throttled. Defaults to 5, the previously hard-coded value.
        timeout: forwarded to ``requests.get``. The default ``None`` keeps
            the previous behaviour (wait indefinitely); pass a number of
            seconds to make a stalled connection raise instead of hanging.

    Returns:
        The response object returned by ``requests.get``.
    """
    r = requests.get(url, params=payload, timeout=timeout)
    # Sleep after the request (not before) so the caller always waits
    # between two consecutive hits on the site.
    if delay and delay > 0:
        time.sleep(delay)
    return r
|
DaveM@2
|
56
|
DaveM@6
|
def makeURLPayload(url, payload):
    """Return ``url`` with ``payload`` appended as a literal query string.

    Each entry is rendered as ``&key=value`` using ``str()`` on both parts,
    and the whole run is placed after a ``?`` -- so the result looks like
    ``url?&a=1&b=2`` (note the ``&`` directly after the ``?``).

    NOTE: keys and values are concatenated as-is; nothing is URL-encoded.
    """
    pairs = ['&' + str(name) + '=' + str(payload[name]) for name in payload]
    return url + '?' + ''.join(pairs)
|
DaveM@3
|
63
|
DaveM@6
|
def printToFile(filename, data, removeAdds=False):
    """Write a list of row dicts to ``filename`` as CSV.

    Args:
        filename: path of the CSV file to (over)write.
        data: sequence of dicts, one per output row.
        removeAdds: when true, the columns 'DOB', 'TOB', 'pDOB', 'pTOB',
            'COB', 'pCOB' and 'horiscope' are left out of the output.
            Defaults to False so two-argument calls keep working.

    The header is the union of all row keys in first-seen order, so rows
    that carry extra keys no longer make the writer raise; missing values
    are written as empty strings. ``data`` itself is never modified. With
    no rows an empty file is produced.
    """
    dropped = ('DOB', 'TOB', 'pDOB', 'pTOB', 'COB', 'pCOB', 'horiscope')
    if removeAdds:
        # The original tried `del data['DOB']` on the list itself, which
        # raises TypeError; the keys live on the individual rows.
        rows = [
            dict((k, v) for k, v in row.items() if k not in dropped)
            for row in data
        ]
    else:
        rows = list(data)

    # Collect every column that appears in any row, keeping a stable order.
    keys = []
    seen = set()
    for row in rows:
        for k in row:
            if k not in seen:
                seen.add(k)
                keys.append(k)

    with open(filename, 'w') as stream:
        if not rows:
            return
        dict_writer = csv.DictWriter(stream, keys, restval='')
        dict_writer.writeheader()
        dict_writer.writerows(rows)
|
DaveM@2
|
78
|
DaveM@6
|
def loadPick(filename):
    """Unpickle and return the object stored in ``filename``.

    The file is opened in binary mode and read with ``pickle.load``.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)
|
DaveM@6
|
83
|
DaveM@6
|
def savePick(filename, data):
    """Pickle ``data`` into ``filename``, replacing any existing file.

    The file is opened in binary mode and written with ``pickle.dump``
    using the default protocol.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink)
|
DaveM@6
|
87
|
DaveM@6
|
def tempPF(fName, data):
    """Write the string ``data`` to the file ``fName``, overwriting it.

    Uses a context manager so the handle is closed even when the write
    raises (the original open/write/close sequence leaked it in that case).
    """
    with open(fName, 'w') as f__:
        f__.write(data)
|
DaveM@6
|
92
|
DaveM@3
|
def testMain():
    """Smoke-test entry point: run dp.parseCSV on 'individuals.csv'.

    The parsed result is discarded and nothing is returned.
    """
    dp.parseCSV('individuals.csv')
|
DaveM@3
|
95
|
DaveM@3
|
def _fetchHoroscope(person):
    """Request and parse the result page for ``person``, retrying once.

    Returns whatever parsePage() produced on the last attempt, which may be
    an empty dict when both attempts yielded nothing.
    """
    url, payload = setURL(person)
    horoscope = parsePage(requestURL(url, payload))
    if not horoscope:  # empty dict: nothing could be parsed from the page
        print('attempt failed, try again')
        url, payload = setURL(person)
        horoscope = parsePage(requestURL(url, payload))
        if not horoscope:
            print('attempt two failed')
    return horoscope


def _main():
    """Fetch horoscope data for everyone in the input and write the results.

    Steps:
      1. Load the people list from the pickle cache 'outData.pick', or parse
         'individuals.csv' with dp.parseCSV and create the cache.
      2. For each person with a non-empty 'pDOB', fetch and parse the result
         page, store it under 'horiscope' and also merge its entries into
         the person dict itself.
      3. Save the updated people list back to the cache, the processed
         people to '2outData.pick', and a CSV to 'final_outData.csv'.
    """
    pickFile = 'outData.pick'
    # The original referenced an undefined `outFile` (NameError at the very
    # end of the run); this name is a chosen replacement.
    outFile = 'outData.csv'

    if not os.path.exists(pickFile):
        print('reParse file')
        people = dp.parseCSV('individuals.csv')
        savePick(pickFile, people)
    else:
        print('read in ' + pickFile)
        people = loadPick(pickFile)

    horiscopeList = []
    for person in people:
        if person['pDOB'] is None or person['pDOB'] == '':
            print('SKIPPING person ' + person['ID'] + ' pDOB is None')
            continue

        print('parsing person ' + person['ID'])
        person['horiscope'] = _fetchHoroscope(person)
        # Flatten the parsed entries into the person record as well.
        for d in person['horiscope']:
            person[d] = person['horiscope'][d]
        horiscopeList.append(person)
        # Checkpoint the whole list. The original saved only the current
        # person here, replacing the cached list with a single dict and
        # breaking the next run's `for person in people` loop.
        savePick(pickFile, people)

    print(horiscopeList)
    savePick(pickFile, people)
    savePick('2' + pickFile, horiscopeList)
    # printToFile takes a third `removeAdds` argument; pass it explicitly.
    printToFile('final_' + outFile, horiscopeList, False)
|
DaveM@2
|
132
|
DaveM@2
|
# Run the scraper only when this file is executed as a script, not when it
# is imported.
if __name__ == '__main__':
    _main()
|