comparison V2/runme.py @ 6:3d5ca8e78f8f

update to pDOB parsing, fixing some bugs
author DaveM
date Mon, 18 Dec 2017 17:41:16 +0000
parents 99115e36316b
children
comparison
equal deleted inserted replaced
5:73cf5cabef86 6:3d5ca8e78f8f
3 import requests 3 import requests
4 import re 4 import re
5 import time 5 import time
6 import csv 6 import csv
7 import random 7 import random
8 import pdb
9 import os
10 import pickle
8 from HTMLParser import HTMLParser 11 from HTMLParser import HTMLParser
9 # from lxml import html 12 # from lxml import html
10 from bs4 import BeautifulSoup 13 from bs4 import BeautifulSoup
11 14
def parsePage(resp):
    """
    Extract the result table entries from a compatibility page response.

    Every <div class="tc"> in the page is inspected.  A div contributes one
    entry when it holds more than two <td> cells, the first and third cells
    each contain a <strong> element, and the second cell matches the
    position pattern below.  Divs that do not fit are skipped instead of
    raising, so the caller can treat an empty result as "page not usable".

    resp    -- response object; only resp.content (the page body) is read
    returns -- dict mapping (C, D) -> (g1, g2, g3) where C and D are the
               UTF-8 encoded first children of the <strong> elements in
               cells 0 and 2, and g1..g3 are the first three regex groups:
               the text between "/>" and "<br/>", the digits before the
               degree sign and the digits after it.
    """
    # NOTE(review): the pattern looks for the degree sign as the UTF-8 byte
    # pair \xc2\xb0, which presumably relies on Python 2 where str(tag) is a
    # byte string -- confirm before running under Python 3.
    pattern = r"\/>(.*)<br/>.*\(([0-9]*)\xc2\xb0([0-9]*)(.*)\)"

    person = dict()
    soup = BeautifulSoup(resp.content, 'lxml')
    for cell in soup.find_all('div', attrs={'class': 'tc'}):
        tableCell = cell.find_all('td')
        if len(tableCell) <= 2:
            continue

        first = tableCell[0].strong
        third = tableCell[2].strong
        if first is None or third is None:
            # Cell without a <strong> label: not a result row.
            continue
        if not first.contents or not third.contents:
            continue

        A = re.search(pattern, str(tableCell[1]))
        if A is None:
            # Middle cell does not carry a "(<deg>\xb0<min>...)" position.
            continue

        C = first.contents[0].encode('utf-8')
        D = third.contents[0].encode('utf-8')
        person[(C, D)] = (A.group(1), A.group(2), A.group(3))
    return person
29 33
30 34
def setURL(p):
    """
    Build the request target for one compatibility lookup.

    p       -- passed unchanged to dp.makePayload, which produces the
               query parameters
    returns -- (url, payload) tuple
    """
    ## For some reason we need to post men first then women.
    # An earlier hand-written request used query fields prefixed
    # muz_narozeni_* followed by zena_narozeni_* (day, month, year, ...).
    endpoint = 'https://horoscopes.astro-seek.com/calculate-love-compatibility/'
    query = dp.makePayload(p)
    return (endpoint, query)
53 51
def requestURL(url, payload, timeout=30, delay=5):
    """
    GET url with payload as query parameters, then pause before returning.

    url     -- address to request
    payload -- mapping handed to requests as ``params``
    timeout -- seconds to wait for the server before requests raises a
               timeout error; without it a stalled connection would block
               the whole run indefinitely
    delay   -- seconds to sleep after the request (default 5, as before)
    returns -- the response object from requests.get
    """
    r = requests.get(url, params=payload, timeout=timeout)
    # Pause between consecutive requests.
    time.sleep(delay)
    return r
58 56
def makeURLPayload(url, payload):
    """
    Return url with payload appended as a URL-encoded query string.

    Keys and values are converted with str() and percent/plus-encoded, so
    values such as 'London, United Kingdom' become
    'London%2C+United+Kingdom'.  Pairs are joined with '&' with no leading
    separator after the '?'.

    url     -- base address without a query string
    payload -- mapping of parameter name -> value
    returns -- url + '?' + encoded pairs ('?' alone when payload is empty)
    """
    # urlencode lives in different modules on Python 2 and Python 3.
    try:
        from urllib.parse import urlencode
    except ImportError:
        from urllib import urlencode

    return url + '?' + urlencode(payload)
64 63
def printToFile(filename, data, removeAdds=False):
    """
    Write a list of row dicts to filename as CSV with a header line.

    filename   -- path of the CSV file to (over)write
    data       -- sequence of dicts, one per output row; it is not modified
    removeAdds -- when true, the columns 'DOB', 'TOB', 'pDOB', 'pTOB',
                  'COB', 'pCOB' and 'horiscope' are left out of the output

    The header is the union of the keys of all rows in first-seen order;
    a row that lacks a column gets an empty field there.  When there are
    no rows (or no columns) the file is created empty.
    """
    dropped = ()
    if removeAdds:
        dropped = ('DOB', 'TOB', 'pDOB', 'pTOB', 'COB', 'pCOB', 'horiscope')

    # Filter per row: data is a list of dicts, so the columns have to be
    # removed from each row rather than from the list itself.
    rows = [dict((k, v) for k, v in row.items() if k not in dropped)
            for row in data]

    keys = []
    seen = set()
    for row in rows:
        for key in row:
            if key not in seen:
                seen.add(key)
                keys.append(key)

    with open(filename, 'w') as stream:
        if not keys:
            return
        dict_writer = csv.DictWriter(stream, keys)
        dict_writer.writeheader()
        dict_writer.writerows(rows)
75 78
def loadPick(filename):
    """
    Read filename in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code; only load files this
    script wrote itself.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)
83
def savePick(filename,data):
    """Pickle data and write it to filename, replacing any existing file."""
    with open(filename, 'wb') as handle:
        handle.write(pickle.dumps(data))
87
def tempPF(fName, data):
    """
    Write the string data to the file fName, replacing any existing content.

    The file is opened in a with-block so the handle is closed even when
    the write raises.
    """
    with open(fName, 'w') as f__:
        f__.write(data)
92
def testMain():
    """Run dp.parseCSV on 'individuals.csv' and discard the result."""
    dp.parseCSV('individuals.csv')
78 95
def _fetchHoriscope(person):
    """Request and parse the page for one person, retrying once if empty."""
    url, payload = setURL(person)
    horiscope = parsePage(requestURL(url, payload))
    if not horiscope:  # parsePage returns an empty dict when nothing matched
        print('attempt failed, try again')
        url, payload = setURL(person)
        horiscope = parsePage(requestURL(url, payload))
        if not horiscope:
            print('attempt two failed')
    return horiscope


def _main():
    """
    Look up every person from 'individuals.csv' and write the results out.

    The parsed CSV is cached in the pickle file 'outData.pick'; when that
    file exists it is loaded instead of re-parsing the CSV.  People whose
    'pDOB' is None or '' are skipped.  For everyone else the page is
    fetched (with one retry), stored under person['horiscope'], and each
    of its entries is also copied onto the person dict itself.

    Outputs: the people list is re-pickled to 'outData.pick', the list of
    processed people to '2outData.pick', and the same list is written as
    CSV to 'final_outputdata.csv'.
    """
    pickFile = 'outData.pick'
    # NOTE(review): outFile was referenced but never defined; the previous
    # revision wrote to 'outputdata.csv', so that name is assumed here.
    outFile = 'outputdata.csv'

    if not os.path.exists(pickFile):
        print('reParse file')
        people = dp.parseCSV('individuals.csv')
        savePick(pickFile, people)
    else:
        print('read in ' + pickFile)
        people = loadPick(pickFile)

    horiscopeList = []
    for person in people:
        if person['pDOB'] is None or person['pDOB'] == '':
            print('SKIPPING person ' + person['ID'] + ' pDOB is None')
            continue

        print('parsing person ' + person['ID'])
        person['horiscope'] = _fetchHoriscope(person)
        person.update(person['horiscope'])
        horiscopeList.append(person)
        # Checkpoint the whole list (not the single person) so that the
        # pickle keeps the shape the loading branch above iterates over.
        savePick(pickFile, people)

    print(horiscopeList)
    savePick(pickFile, people)
    savePick('2' + pickFile, horiscopeList)
    # removeAdds is passed explicitly because printToFile declares it as a
    # required parameter; False keeps every column.
    printToFile('final_' + outFile, horiscopeList, False)
93 132
# Script entry point: run the full lookup when executed directly.
if __name__ == '__main__':
    _main()