diff V5/dParse.py @ 23:11d4e438045e

make version 5
author DaveM
date Mon, 09 Apr 2018 15:07:21 +0100
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/V5/dParse.py	Mon Apr 09 15:07:21 2018 +0100
@@ -0,0 +1,235 @@
+#!/usr/bin/env python
+
+import csv
+import time
+import unicodedata
+from geopy.geocoders import Nominatim
+from geopy.exc import GeocoderTimedOut
+import random
+import pdb
+
+# Presumably the fallback time of birth; nothing in the visible code reads it.
+DEFAULT_TIME = None
+# DEFAULT_TIME_M = None
+# Value parseBirthTown switches to once a place name has no words left to drop.
+# NOTE(review): the name is misspelled ("DEAULT"); parseBirthTown refers to it
+# by this exact spelling, so any rename has to touch both places.
+DEAULT_LOCATION = None
+
+def regulateData(dataDict):
+	print("Parse %s"%(str(dataDict['ID'])))
+	p_DOBQ = "What is your partner's date of birth? Please use the format DD/MM/YYYY (for example, 29/03/1981)."
+	p_TOBQ = "At what exact time were your partner born? Please use the format HHMM (for example, 2204)."
+	DOB_DQ = "Which day (numeric) have you been born?"
+	DOB_MQ = "Which month have you been born?"
+	DOB_YQ = "Year Of Birth"
+	TOB_Q = "At what exact time were you born? Please use the format HHMM (for example, 2204)."
+	COB = "What is your place of birth? Please specify city and country (for example, San Francisco, USA)."
+	p_COB = "What is your partner's place of birth? Please specify city and country (for example, San Francisco, USA)."
+	dataDict['DOB'] = parseDOB(dataDict[DOB_DQ],dataDict[DOB_MQ],dataDict[DOB_YQ])
+	# print (dataDict[DOB_DQ],dataDict[DOB_MQ],dataDict[DOB_YQ])
+	# print dataDict['DOB']
+	dataDict['TOB'] = parseTOB(dataDict[TOB_Q])
+	dataDict['pDOB'] = parsePartnerDOB(dataDict[p_DOBQ])
+	dataDict['pTOB'] = parseTOB(dataDict[p_TOBQ])
+	# MAKE RANDOM PLACE
+	# dataDict['COB'] = (random.uniform(-90, 90),random.uniform(-90, 90))
+	# dataDict['pCOB'] = (random.uniform(-90, 90),random.uniform(-90, 90))
+	dataDict['COB'] = parseBirthTown(dataDict[COB])
+	dataDict['pCOB'] = parseBirthTown(dataDict[p_COB])
+	return dataDict
+
+
+def parseBirthTown(s):
+	try:
+		s = s.encode('ascii')
+	except UnicodeDecodeError:
+		# pdb.set_trace()
+		s = s.decode('latin-1')
+		# s = unicodedata.normalize('NFKD',s.decode('utf-8')).encode('ascii','ignore')
+	timeoutTime = 2
+	geolocator = Nominatim(timeout=timeoutTime)
+	while s is not [] and timeoutTime < 60:
+		try:
+			location = geolocator.geocode(s)
+			if location is not None:
+				# print(location.raw)
+				# print (location.latitude, location.longitude)
+				return (location.latitude, location.longitude, location.raw)
+			else:
+				s = s.split(' ',1)
+				if len(s) == 2:
+					s = s[1]
+					# print s
+				else:
+					s = DEAULT_LOCATION
+		except:
+			timeoutTime += 1
+			print("Error: geocode failed on input %s, incrementing timeout time to %d"%(s,timeoutTime))
+			time.sleep(5)
+			geolocator = Nominatim(timeout=timeoutTime)
+	# places = geograpy.get_place_context(text=s)
+
+def parsePartnerDOB(dob):
+	# print dob
+	# pdb.set_trace()
+	dob = dob.strip()
+	if(dob.count('-') == 2):
+		dob = dob.replace('-','/')
+	if(dob.count(' ') == 2):
+		dob = dob.replace(' ','/')
+	dob_ = dob.split('/')
+	if(len(dob_) != 3):
+		dob = dob.replace('/','').strip()
+		dob_ = []
+		# print dob
+		if len(dob) == 8: # ddmmyyyy
+			dob_.append(dob[:2])
+			dob_.append(dob[2:4])
+			dob_.append(dob[4:])
+		elif len(dob) == 7 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyyyy
+			dob_.append(dob[0])
+			dob_.append(dob[1:3])
+			dob_.append(dob[3:])
+		elif(len(dob) == 7):
+			if int(dob[:2]) > 31:# dmmyyyy
+				dob_.append(dob[0])
+				dob_.append(dob[1:3])
+				dob_.append(dob[3:])
+		elif len(dob) == 7: # ddmyyyy
+			dob_.append(dob[0:2])
+			dob_.append(dob[2])
+			dob_.append(dob[3:])
+		elif len(dob) == 6 and dob[3:4] != '19': # ddmmyy
+			dob_.append(dob[:2])
+			dob_.append(dob[2:4])
+			dob_.append(dob[4:])
+		elif len(dob) == 5 and dob[1] == '1' and (dob[2] == '0' or dob[2] == '1' or dob[2] == '2'): # dmmyy
+			dob_.append(dob[0])
+			dob_.append(dob[1:3])
+			dob_.append(dob[3:])
+		elif len(dob) == 5: # ddmyy
+			dob_.append(dob[:2])
+			dob_.append(dob[2])
+			dob_.append(dob[3:])
+		elif len(dob) == 4: # dmyy
+			dob_.append(dob[0])
+			dob_.append(dob[1])
+			dob_.append(dob[2:])
+		else:
+			if(len(dob) < 4):
+				return None
+			# print dob
+			# print filter(lambda x: x.isdigit(),dob)
+			print 'no / partnerDOB issue'
+		# deal with no /'s
+	try:
+		d = int(filter(lambda x: x.isdigit(),dob_[0]))
+		m = int(filter(lambda x: x.isdigit(),dob_[1]))
+		y = int(filter(lambda x: x.isdigit(),dob_[2]))
+		if y < 100:
+			y = y + 1900
+		if (m > 12 and d <= 12):
+			temp = d
+			d = m
+			m = temp 
+		if(d > 31 or d < 1 or m > 12 or m < 1 or y > 2017 or y < 1900):
+			print 'error with DOB '+d+'/'+m+'/'+y
+			pdb.set_trace()
+	except TypeError:
+		return None
+	# print  (d,m,y)
+	return (d,m,y)
+
+def monthStringToNum(s):
+	# print 'inMonthStringToNum'
+	m = {'jan':1,'feb':2,'mar':3,
+	'apr':4,'may':5,'jun':6,'jul':7,'aug':8,
+	'sep':9,'oct':10,'nov':11,'dec':12}
+	s_ = s.strip()[:3].lower()
+	try:
+		out = m[s_]
+		return out
+	except:
+		raise ValueError('Not a month')
+
+def checkMonthDay(d,m):
+	if d > 31: # take first two digits of day
+		d = int(str(d)[:2])
+		if d > 31:
+			d = int(str(d)[1])
+	if m > 12 and d < 12: # Day and month wrong way round - American
+		temp = m
+		m = d
+		d = temp
+	if(m == 2):
+		if d <= 29:
+			return (True,d,m)
+		else:
+			return (False,d,m)
+	elif m in [4,6,9,11]:
+		if d <= 30:
+			return (True,d,m)
+		else:
+			return (False,d,m)
+	elif m <= 12 and d <= 31:
+		return (True,d,m)
+	else:
+		return (False,d,m)
+
+def parseDOB(d,m,y):
+	d = int(filter(lambda x: x.isdigit(),d))
+	y = int(filter(lambda x: x.isdigit(),y))
+	try:
+		# print m
+		m = monthStringToNum(m.strip())
+	except ValueError:
+		m = int(m.strip())
+	if(y < 100):
+		y = y + 1900
+	(r,d,m) = checkMonthDay(d,m)
+	if not r:
+		print 'error with day month'
+		print (r,d,m)
+	return (d,m,y)
+
+def parseTOB(T):
+	timeFlag = None
+	T_ = T.replace('.','').lower().strip()
+	if 'am' in T_:
+		timeFlag = 0
+		T = T_.replace('am','')
+	if 'pm' in T_:
+		timeFlag = 1
+		T = T_.replace('pm','')
+	T = T.strip()
+	if T.count('.') == 1:
+		T = T.replace('.',':')
+	try:
+		if ':' in T:
+			T_ = T.split(':')
+			
+			H = int(T_[0])
+			M = int(T_[1])
+		else:
+			if len(T) == 4:
+				H = int(T[:2])
+				M = int(T[2:])
+			elif int(T) <= 24 :
+				H = int(T)
+				M = 0
+			elif int(T) > 100:
+				H = int(T)/100
+				M = int(T)%100
+			else:
+				return None
+		if timeFlag is not None:
+			if timeFlag == 0:
+				H = H%12
+			else:
+				H = H%12 + 12
+	except ValueError:
+		return None
+	if H > 24 or M > 60:
+		return None
+	else:
+		return (H,M)
+	
+
+