marco@16: #!/usr/bin/env python
marco@16: # -*- coding: utf-8 -*-
marco@16: 
marco@16: """
marco@16: Utility methods used within the module
marco@16: """
marco@16: 
marco@16: from sword2_logging import logging
marco@16: utils_l = logging.getLogger(__name__)
marco@16: 
marco@16: from time import time
marco@16: from datetime import datetime
marco@16: 
marco@16: from base64 import b64encode
marco@16: 
marco@16: try:
marco@16:     from hashlib import md5
marco@16: except ImportError:
marco@16:     import md5
marco@16: 
marco@16: import mimetypes
marco@16: 
# Clark-notation templates for the XML namespaces used by SWORD2 documents.
# Interpolate a local tag name to get the qualified name etree expects,
# e.g. NS['atom'] % 'entry' -> '{http://www.w3.org/2005/Atom}entry'
NS = {
    'dcterms': "{http://purl.org/dc/terms/}%s",
    'sword': "{http://purl.org/net/sword/terms/}%s",
    'atom': "{http://www.w3.org/2005/Atom}%s",
    'app': "{http://www.w3.org/2007/app}%s",
}
marco@16: 
def get_text(parent, tag, plural = False):
    """Collect the text of the direct children of `parent` that match `tag`.

    `parent` is an `etree.Element` (anything with a compatible `findall`
    method); `tag` is the tag name/path handed to `parent.findall`.
    Element attributes are ignored, only the `.text` of each match is used.

    Returns:
      * `None` if no child matches (regardless of `plural`),
      * the text of the single match if exactly one child matches and
        `plural` is false,
      * a list of texts, in document order, if several children match or
        if `plural` is true.

    A matching element without text contributes `None` to the list, so the
    list always has one entry per matching element.
    """
    # Gather everything first: deciding "have we seen a match yet?" from the
    # truthiness of the accumulated text misfires when the first match has
    # empty/None text, letting a later match silently overwrite it.
    texts = [item.text for item in parent.findall(tag)]
    if not texts:
        return None
    if plural or len(texts) > 1:
        return texts
    return texts[0]
marco@16: 
def get_md5(data):
    """Takes either a string or a file-like object and passes back a tuple containing (md5sum, filesize)

    A file-like object is anything with both `read()` and `seek()`. It is read
    from its current position in 1Mb chunks, so large files are not held in
    memory, and is rewound with `seek(0)` before returning.

    Anything else is hashed in one go as a string.

    Text (unicode) input, or text chunks read from a file-like object, is
    encoded as UTF-8 before hashing, so the returned size is always a count of
    the bytes that were hashed.

    Returns `(hexdigest, size_in_bytes)`.
    """
    text_type = type(u"")   # `unicode` on Python 2, `str` on Python 3
    digest = md5()
    if hasattr(data, "read") and hasattr(data, 'seek'):
        chunk_size = 1024 * 1024   # 1Mb
        f_size = 0
        chunk = data.read(chunk_size)
        while chunk:
            if isinstance(chunk, text_type):
                # md5 only accepts bytes; a text-mode stream hands back text
                chunk = chunk.encode("utf-8")
            f_size += len(chunk)
            digest.update(chunk)
            chunk = data.read(chunk_size)
        # Leave the stream ready to be read again by the caller
        data.seek(0)
        return digest.hexdigest(), f_size

    # Plain string
    if isinstance(data, text_type):
        data = data.encode("utf-8")
    digest.update(data)
    return digest.hexdigest(), len(data)
marco@16:         
marco@16: 
class Timer(object):
    """Simple timer, providing a 'stopwatch' mechanism.

    Any number of named timers can be started; the time elapsed since a timer
    was started can then be sampled repeatedly, and every sample is logged in
    `duration[name]`.

    Attributes:
        counts      -- name -> `time()` value recorded by `start()`
        duration    -- name -> list of elapsed times logged by `time_since_start()`
        stop_times  -- name -> `time()` value recorded by `stop()`

    Usage example:

    >>> from sword2.utils import Timer
    >>> from time import sleep
    >>> t = Timer()
    >>> t.get_timestamp()
    datetime.datetime(2011, 6, 7, 7, 40, 53, 87248)
    >>> t.get_loggable_timestamp()
    '2011-06-07T07:40:53.087516'

    >>> # Start a few timers
    ... t.start("kaylee", "river", "inara")
    >>> sleep(3)   # wait a little while
    >>> t.time_since_start("kaylee")
    (0, 3.0048139095306396)

    # tuple -> (index of the logged .duration, time since the .start method was called)
    # eg 't.duration['kaylee'][0]' would equal 3.00481....

    >>> sleep(2)
    >>> t.time_since_start("kaylee", "inara")
    [(1, 5.00858998298645), (0, 5.00858998298645)]
    >>> sleep(5)
    >>> t.time_since_start("kaylee", "river")
    [(2, 10.015379905700684), (0, 10.015379905700684)]
    >>> sleep(4)
    >>> t.time_since_start("kaylee", "inara", "river")
    [(3, 14.021538972854614), (1, 14.021538972854614), (1, 14.021538972854614)]

    # The order of the response is the same as the order of the names in the method call.

    >>> # report back
    ... t.duration['kaylee']
    [3.0048139095306396, 5.00858998298645, 10.015379905700684, 14.021538972854614]
    >>> t.duration['inara']
    [5.00858998298645, 14.021538972854614]
    >>> t.duration['river']
    [10.015379905700684, 14.021538972854614]
    >>>
    """
    def __init__(self):
        self.reset_all()

    def reset_all(self):
        """Forget all start times, logged durations and stop times."""
        self.counts = {}
        self.duration = {}
        # Deliberately NOT called `self.stop`: an instance attribute of that
        # name shadows the `stop()` method and makes it uncallable.
        self.stop_times = {}

    def reset(self, name):
        """Zero the start time of the timer `name`; unknown names are ignored."""
        if name in self.counts:
            self.counts[name] = 0

    def read_raw(self, name):
        """Return the raw start time (seconds since the epoch) of `name`, or None if unknown."""
        return self.counts.get(name, None)

    def read(self, name):
        """Return the start time of `name` as a `datetime`, or None if unknown."""
        if name in self.counts:
            return datetime.fromtimestamp(self.counts[name])
        return None

    def start(self, *args):
        """(Re)start every named timer, all with the same start time."""
        st_time = time()
        for arg in args:
            self.counts[arg] = st_time

    def stop(self, *args):
        """Record the current time as the stop time of every named timer."""
        st_time = time()
        for arg in args:
            self.stop_times[arg] = st_time

    def get_timestamp(self):
        """Convenience function: the current local time as a `datetime`."""
        return datetime.now()

    def get_loggable_timestamp(self):
        """Human-readable by intent"""
        return datetime.now().isoformat()

    def time_since_start(self, *args):
        """Log and return the time elapsed since each named timer was started.

        For every name a tuple `(index, elapsed)` is produced, where `elapsed`
        is the number of seconds since `start()` was called for that name and
        `index` is the position at which it was appended to
        `self.duration[name]`. Names that were never started yield `(0, 0)`
        and log nothing.

        Returns the single tuple when exactly one name is given, otherwise a
        list of tuples in the same order as the names (empty for no names).
        """
        r = []
        # One sample of "now" so every timer in this call is measured consistently
        st_time = time()
        for name in args:
            if name in self.counts:
                elapsed = st_time - self.counts[name]
                logged = self.duration.setdefault(name, [])
                logged.append(elapsed)
                r.append((len(logged) - 1, elapsed))
            else:
                r.append((0, 0))
        if len(r) == 1:
            return r.pop()
        return r
marco@16:             
marco@16: 
def get_content_type(filename):
    """Guess a MIME type from the extension of `filename`.

    Does a simple .ext -> mimetype mapping via `mimetypes.guess_type`.
    Generally better to specify the mimetype upfront.

    Returns 'application/octet-stream' when the type cannot be guessed, or
    when no filename is given at all (`None` or an empty string).
    """
    default = 'application/octet-stream'
    if not filename:
        # mimetypes.guess_type() raises on None rather than returning "unknown"
        return default
    return mimetypes.guess_type(filename)[0] or default
marco@16: 
def _to_bytes(value):
    """Return `value` unchanged if it is already bytes, else UTF-8 encode the text."""
    if isinstance(value, type(u"")):
        return value.encode("utf-8")
    return value


def create_multipart_related(payloads):
    """Build a multipart/related request body from a list of payload dicts.

    Expected: list of dicts with keys 'key', 'type'='content type','filename'=optional,'data'=payload, 'headers'={}

    For each payload:
      * 'key' (required) is used as the `name` of the Content-Disposition header,
      * 'type' is the Content-Type; if absent it is guessed from 'filename',
        falling back to 'application/octet-stream',
      * 'filename' (optional) is added to the Content-Disposition header,
      * 'headers' (optional) is a dict of extra headers for that part,
      * 'data' (required) is the content. The part whose key is 'payload' is
        base64 encoded; every other part is included verbatim.

    TODO: More mem-efficient to spool this to disc rather than hold in RAM, but until Httplib2 bug gets fixed (issue 151)
    this might be in vain.

    Can handle more than just two files.

    SWORD2 multipart POST/PUT expects two attachments - key = 'atom' w/ Atom Entry (metadata)
                                                        key = 'payload' (file)

    Returns a tuple `(content_type, body)` where `content_type` is the value
    for the request's Content-Type header (including the boundary) and `body`
    is the assembled message as a byte string. Text parts are UTF-8 encoded.
    """
    # Generate a boundary code derived from the current time
    # TODO check that it does not occur in the payload data
    bhash = md5(datetime.now().isoformat().encode("ascii")).hexdigest()    # eg 'd8bb3ea6f4e0a4b4682be0cfb4e0a24e'
    BOUNDARY = '===========%s_$' % bhash
    CRLF = b'\r\n'   # As some servers might barf without this.
    body = []
    for payload in payloads:   # predictable ordering...
        body.append('--' + BOUNDARY)

        filename = payload.get('filename', None)
        part_type = payload.get('type', None)
        if not part_type:
            # Only guess when there is a filename to guess from
            if filename:
                part_type = get_content_type(filename)
            else:
                part_type = 'application/octet-stream'
        body.append('Content-Type: %s' % part_type)

        if filename:
            body.append('Content-Disposition: attachment; name="%(key)s"; filename="%(filename)s"' % (payload))
        else:
            body.append('Content-Disposition: attachment; name="%(key)s"' % (payload))

        if "headers" in payload:
            for f, v in payload['headers'].items():
                body.append("%s: %s" % (f, v))     # TODO force ASCII?

        body.append('MIME-Version: 1.0')
        if payload['key'] == 'payload':
            body.append('Content-Transfer-Encoding: base64')
            body.append('')
            # b64encode needs bytes in and gives bytes out
            body.append(b64encode(_to_bytes(payload['data'])))
        else:
            body.append('')
            body.append(payload['data'])
    body.append('--' + BOUNDARY + '--')
    body.append('')
    # Parts are a mix of text (headers) and bytes (encoded data): normalise
    # everything to bytes so the join cannot fail on mixed types.
    body_bytes = CRLF.join([_to_bytes(part) for part in body])
    content_type = 'multipart/related; boundary="%s"' % BOUNDARY
    return content_type, body_bytes