marco@16
|
1 #!/usr/bin/env python
|
marco@16
|
2 # -*- coding: utf-8 -*-
|
marco@16
|
3
|
marco@16
|
4 """
|
marco@16
|
5 Utility methods used within the module
|
marco@16
|
6 """
|
marco@16
|
7
|
marco@16
|
8 from sword2_logging import logging
|
marco@16
|
9 utils_l = logging.getLogger(__name__)
|
marco@16
|
10
|
marco@16
|
11 from time import time
|
marco@16
|
12 from datetime import datetime
|
marco@16
|
13
|
marco@16
|
14 from base64 import b64encode
|
marco@16
|
15
|
marco@16
|
16 try:
|
marco@16
|
17 from hashlib import md5
|
marco@16
|
18 except ImportError:
|
marco@16
|
19 import md5
|
marco@16
|
20
|
marco@16
|
21 import mimetypes
|
marco@16
|
22
|
marco@16
|
# XML namespace templates in ElementTree "{uri}localname" form; fill the
# trailing %s with the local tag name, e.g. NS['atom'] % 'entry'.
NS = {
    'dcterms': "{http://purl.org/dc/terms/}%s",
    'sword': "{http://purl.org/net/sword/terms/}%s",
    'atom': "{http://www.w3.org/2005/Atom}%s",
    'app': "{http://www.w3.org/2007/app}%s",
}
|
marco@16
|
28
|
marco@16
|
def get_text(parent, tag, plural=False):
    """Collect the text of every child of `parent` that matches `tag`.

    Takes an `etree.Element` and a tag name to search for. Element attributes
    are ignored; only each matching element's `.text` is read.

    Args:
        parent: element exposing `findall(tag)`.
        tag: tag name handed unchanged to `parent.findall`.
        plural: when true, always return a list, even for a single match.

    Returns:
        `None` when nothing matches; the text of the only match when exactly
        one element matches and `plural` is false (this may itself be `None`
        for an element without text); otherwise a list of the texts in the
        order `findall` yields them.
    """
    texts = [item.text for item in parent.findall(tag)]
    if not texts:
        return None
    if plural or len(texts) > 1:
        # Decide on the number of matches, not on the truthiness of the text
        # seen so far, so a leading element with no text is not dropped.
        return texts
    return texts[0]
|
marco@16
|
48
|
marco@16
|
def get_md5(data):
    """Return a `(md5sum, filesize)` tuple for a string or a file-like object.

    A file-like object (anything with both `read` and `seek`) is consumed in
    1Mb chunks from its current position, so large files are never held in
    memory at once; afterwards it is rewound with `seek(0)`.

    Text (unicode) input, or text chunks returned by `read`, are encoded as
    UTF-8 before hashing, and the reported size is the number of bytes hashed.

    Args:
        data: a byte string, a text string, or a file-like object supporting
            `read()` and `seek()`.

    Returns:
        Tuple of (hex digest string, number of bytes hashed).
    """
    chunk_size = 1024 * 1024  # 1Mb
    digest = md5()
    size = 0
    if hasattr(data, "read") and hasattr(data, "seek"):
        chunk = data.read(chunk_size)
        while chunk:
            if not isinstance(chunk, bytes):
                # Stream opened in text mode: hash its UTF-8 representation.
                chunk = chunk.encode("utf-8")
            size += len(chunk)
            digest.update(chunk)
            chunk = data.read(chunk_size)
        # Leave the stream ready to be read again by the caller.
        data.seek(0)
    else:
        if not isinstance(data, bytes):
            data = data.encode("utf-8")
        size = len(data)
        digest.update(data)
    return digest.hexdigest(), size
|
marco@16
|
69
|
marco@16
|
70
|
marco@16
|
class Timer(object):
    """Simple timer, providing a 'stopwatch' mechanism.

    Instance attributes:

    * `counts`     - maps a timer name to the `time()` value stored by `start`
    * `duration`   - maps a timer name to the list of elapsed times recorded
                     by successive `time_since_start` calls
    * `stop_times` - maps a timer name to the `time()` value stored by `stop`

    Usage example:

    >>> from sword2.utils import Timer
    >>> from time import sleep
    >>> t = Timer()
    >>> t.get_timestamp()
    datetime.datetime(2011, 6, 7, 7, 40, 53, 87248)
    >>> t.get_loggable_timestamp()
    '2011-06-07T07:40:53.087516'

    >>> # Start a few timers
    ... t.start("kaylee", "river", "inara")
    >>> sleep(3) # wait a little while
    >>> t.time_since_start("kaylee")
    (0, 3.0048139095306396)

    # tuple -> (index of the logged .duration, time since the .start method was called)
    # eg 't.duration['kaylee'][0]' would equal 3.00481....

    >>> sleep(2)
    >>> t.time_since_start("kaylee", "inara")
    [(1, 5.00858998298645), (0, 5.00858998298645)]
    >>> sleep(5)
    >>> t.time_since_start("kaylee", "river")
    [(2, 10.015379905700684), (0, 10.015379905700684)]
    >>> sleep(4)
    >>> t.time_since_start("kaylee", "inara", "river")
    [(3, 14.021538972854614), (1, 14.021538972854614), (1, 14.021538972854614)]

    # The order of the response is the same as the order of the names in the method call.

    >>> # report back
    ... t.duration['kaylee']
    [3.0048139095306396, 5.00858998298645, 10.015379905700684, 14.021538972854614]
    >>> t.duration['inara']
    [5.00858998298645, 14.021538972854614]
    >>> t.duration['river']
    [10.015379905700684, 14.021538972854614]
    >>>
    """

    def __init__(self):
        self.reset_all()

    def reset_all(self):
        """Discard every start time, recorded duration and stop time."""
        self.counts = {}
        self.duration = {}
        # Deliberately not called `stop`: an instance attribute of that name
        # would shadow the `stop()` method and make it uncallable.
        self.stop_times = {}

    def reset(self, name):
        """Set the start time of an existing timer `name` to 0; unknown names are ignored."""
        if name in self.counts:
            self.counts[name] = 0

    def read_raw(self, name):
        """Return the stored start time of `name` as a raw number, or `None` if unknown."""
        return self.counts.get(name, None)

    def read(self, name):
        """Return the stored start time of `name` as a `datetime`, or `None` if unknown."""
        if name in self.counts:
            return datetime.fromtimestamp(self.counts[name])
        return None

    def start(self, *args):
        """Start (or restart) every named timer, all with the same `time()` reading."""
        st_time = time()
        for arg in args:
            self.counts[arg] = st_time

    def stop(self, *args):
        """Record the current `time()` reading as the stop time of every named timer."""
        st_time = time()
        for arg in args:
            self.stop_times[arg] = st_time

    def get_timestamp(self):
        """Convenience function returning `datetime.now()`."""
        return datetime.now()

    def get_loggable_timestamp(self):
        """Human-readable by intent"""
        return datetime.now().isoformat()

    def time_since_start(self, *args):
        """Record and report the time elapsed since each named timer was started.

        For every known name the elapsed time is appended to
        `self.duration[name]` and reported as `(index, elapsed)`, where
        `index` is the position of that reading in `self.duration[name]`.
        Names that were never started are reported as `(0, 0)` and nothing is
        recorded for them.

        Returns:
            A single tuple when exactly one name is given, otherwise a list of
            tuples in the same order as the names (empty for no names).
        """
        now = time()
        results = []
        for name in args:
            if name in self.counts:
                elapsed = now - self.counts[name]
                history = self.duration.setdefault(name, [])
                history.append(elapsed)
                results.append((len(history) - 1, elapsed))
            else:
                results.append((0, 0))
        if len(results) == 1:
            return results[0]
        return results
|
marco@16
|
169
|
marco@16
|
170
|
marco@16
|
def get_content_type(filename):
    """Guess a MIME type from `filename`'s extension.

    Does a simple .ext -> mimetype mapping via `mimetypes.guess_type`.
    Generally better to specify the mimetype upfront.

    Args:
        filename: file name or path; may be `None` or empty.

    Returns:
        The guessed MIME type, or 'application/octet-stream' when there is no
        filename or its type cannot be guessed.
    """
    if not filename:
        # mimetypes.guess_type() raises on None, so short-circuit here.
        return 'application/octet-stream'
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'
|
marco@16
|
175
|
marco@16
|
def _to_bytes(value):
    """Return `value` as a byte string, UTF-8 encoding it if it is text."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def create_multipart_related(payloads):
    """Build the body and Content-Type header value of a multipart/related request.

    Expected: list of dicts with keys 'key', 'type'='content type','filename'=optional,'data'=payload, 'headers'={}

    TODO: More mem-efficient to spool this to disc rather than hold in RAM, but until Httplib2 bug gets fixed (issue 151)
    this might be in vain.

    Can handle more than just two files.

    SWORD2 multipart POST/PUT expects two attachments - key = 'atom' w/ Atom Entry (metadata)
                                                        key = 'payload' (file)

    The part whose key is 'payload' is base64 encoded; every other part's data
    is written as-is. Text values are UTF-8 encoded so that the whole body can
    be assembled as one byte string. When a part has no 'type', the type is
    guessed from its 'filename', falling back to 'application/octet-stream'
    when there is no filename either.

    Args:
        payloads: list of part dicts as described above; parts are written in
            list order.

    Returns:
        Tuple of (content type header value including the boundary, body as a
        byte string).
    """
    # Generate a boundary code from a hash of the current timestamp.
    # TODO check that it does not occur in the payload data
    stamp = datetime.now().isoformat().encode('ascii')  # md5 needs bytes
    bhash = md5(stamp).hexdigest()  # eg 'd8bb3ea6f4e0a4b4682be0cfb4e0a24e'
    BOUNDARY = '===========%s_$' % bhash
    CRLF = b'\r\n'  # As some servers might barf without this.
    body = []
    for payload in payloads:  # predictable ordering...
        body.append('--' + BOUNDARY)

        filename = payload.get('filename', None)
        part_type = payload.get('type', None)
        if not part_type:
            # Only guess when there is a name to guess from.
            if filename:
                part_type = get_content_type(filename)
            else:
                part_type = 'application/octet-stream'
        body.append('Content-Type: %s' % part_type)

        if filename:
            body.append('Content-Disposition: attachment; name="%(key)s"; filename="%(filename)s"' % payload)
        else:
            body.append('Content-Disposition: attachment; name="%(key)s"' % payload)

        for f, v in (payload.get('headers') or {}).items():
            body.append("%s: %s" % (f, v))  # TODO force ASCII?

        body.append('MIME-Version: 1.0')
        if payload['key'] == 'payload':
            body.append('Content-Transfer-Encoding: base64')
            body.append('')
            body.append(b64encode(_to_bytes(payload['data'])))
        else:
            body.append('')
            body.append(payload['data'])
    body.append('--' + BOUNDARY + '--')
    body.append('')
    # Header lines are text while payload data may be binary; normalise every
    # piece to bytes before joining.
    body_bytes = CRLF.join(_to_bytes(line) for line in body)
    content_type = 'multipart/related; boundary="%s"' % BOUNDARY
    return content_type, body_bytes
|