marco@16
|
1 #!/usr/bin/env python
|
marco@16
|
2 # -*- coding: utf-8 -*-
|
marco@16
|
3
|
marco@16
|
4 """ Collection classes
|
marco@16
|
5
|
marco@16
|
6 These classes are used in their documented manner but most collect or group various other items
|
marco@16
|
7 to make them suitable for use.
|
marco@16
|
8
|
marco@16
|
9 The key class is `SDCollection`, which presents a simple read-only object which represents the
|
marco@16
|
10 information held within a collection element in a SWORD2 document such as the Service Document.
|
marco@16
|
11
|
marco@16
|
12 Two other classes, `Collection_Feed` and `Sword_Statement` are works in progress for now, with limited support
|
marco@16
|
13 for the things they logically handle.
|
marco@16
|
14
|
marco@16
|
15 """
|
marco@16
|
16
|
marco@16
|
17 from sword2_logging import logging
|
marco@16
|
18 from implementation_info import __version__
|
marco@16
|
19 coll_l = logging.getLogger(__name__)
|
marco@16
|
20
|
marco@16
|
21 from compatible_libs import etree
|
marco@16
|
22 from utils import NS, get_text
|
marco@16
|
23
|
marco@16
|
24 from deposit_receipt import Deposit_Receipt
|
marco@16
|
25
|
marco@16
|
26 from atom_objects import Category
|
marco@16
|
27
|
marco@16
|
28 from datetime import datetime
|
marco@16
|
29
|
marco@16
|
30
|
marco@16
|
class SDCollection(object):
    """
    `SDCollection` - holds, parses and presents simple attributes with information taken from a
    collection entry within a SWORD2 Service Document.

    This will be instantiated by a `sword2.Service_Document` and as such, is unlikely to be called explicitly.

    Usage:

    >>> from sword2 import SDCollection
    >>> c = SDCollection()

    .... pull an `etree.SubElement` from a service document into `collection_node`

    >>> c.load_from_etree(collection_node)
    >>> c.collectionPolicy
    "This collection has the following policy for deposits"
    >>> c.title
    "Thesis Deposit"
    """
    def __init__(self, title=None,
                 href=None,
                 accept=None,
                 accept_multipart=None,
                 categories=None,
                 collectionPolicy=None,
                 description=None,
                 mediation=None,
                 treatment=None,
                 acceptPackaging=None,
                 service=None,
                 dom=None):
        """
        Creates an `SDCollection` object - as used by `sword2.Service_Document`

        #BETASWORD2URL
        See http://sword-app.svn.sourceforge.net/viewvc/sword-app/spec/trunk/SWORDProfile.html?revision=HEAD#protocoloperations_retreivingservicedocument
        for more details about the SWORD2 Service Document.

        Usage:

        Read useful information from the attributes of this object once loaded.

        Every list-valued argument defaults to a fresh empty list for each instance
        when it is omitted (or passed as `None`).

        If `dom` is given it is passed to `load_from_etree`, which blanks the instance
        first: values parsed from the XML therefore replace the other constructor arguments.

        Attributes::

            title            -- <atom:title> - Title of collection, (`str`)
            href             -- <collection href=... > - Collection IRI (`str`)
            accept           -- <accept>*</accept> - the formats which this collection can take in (`list` of `str`)
            accept_multipart -- <accept alternate="multipart-related">*</accept> - the formats which this collection can take
                                in via multipart-related (`list` of `str`)
            categories       -- <atom:category> - Collection category (`list` of `sword2.Category`'s)
            collectionPolicy -- <sword:collectionPolicy> - Collection policy (`str`)
            description      -- <dcterms:abstract> - Collection descriptive text (`str`)
            mediation        -- <sword:mediation> - Support for mediated deposit (`True` or `False`)
            treatment        -- <sword:treatment> - from the SWORD2 specifications:
                                ".. either a human-readable statement describing treatment the deposited resource
                                has received or a IRI that dereferences to such a description."
            acceptPackaging  -- <sword:acceptPackaging> - Accepted package types (`list` of `str`)
                                from the SWORD2 specifications: "The value SHOULD be a IRI for a known packaging format"
            service          -- <sword:service> - References to nested service descriptions (`list` of `str`)

        Example XML fragment that is expected: (xmlns="http://www.w3.org/2007/app")

        ...

        <collection href="http://swordapp.org/col-iri/43">
            <atom:title>Collection 43</atom:title>
            <accept>*/*</accept>
            <accept alternate="multipart-related">*/*</accept>
            <sword:collectionPolicy>Collection Policy</sword:collectionPolicy>
            <dcterms:abstract>Collection Description</dcterms:abstract>
            <sword:mediation>false</sword:mediation>
            <sword:treatment>Treatment description</sword:treatment>
            <sword:acceptPackaging>http://purl.org/net/sword/package/SimpleZip</sword:acceptPackaging>
            <sword:acceptPackaging>http://purl.org/net/sword/package/METSDSpaceSIP</sword:acceptPackaging>
            <sword:service>http://swordapp.org/sd-iri/e4</sword:service>
        </collection>
        ...

        Parsing this fragment:

        Again, this step is done by the `sword2.Service_Document`, but if the above XML was in the `doc` variable:

            # Get an etree-compatible library, such as from `lxml.etree`, `xml.etree` or `elementtree.ElementTree`
            >>> from sword2.compatible_libs import etree
            >>> from sword2 import SDCollection
            >>> dom = etree.fromstring(doc)

            # create an `SDCollection` instance from this XML document
            >>> c = SDCollection(dom = dom)

            # query it
            >>> c.treatment
            "Treatment description"
            # Non-unique elements, for example:
            >>> c.service
            ["http://swordapp.org/sd-iri/e4"]
            >>> c.accept
            ["*/*"]

        """
        # APP/Atom
        self.title = title
        self.href = href
        # A `None` default stands in for "no list supplied": a literal `[]` default
        # would be one shared list object mutated by every instance.
        self.accept = [] if accept is None else accept
        self.accept_multipart = [] if accept_multipart is None else accept_multipart
        # SWORD
        self.mediation = mediation
        self.description = description
        self.treatment = treatment
        self.collectionPolicy = collectionPolicy
        self.acceptPackaging = [] if acceptPackaging is None else acceptPackaging
        self.service = [] if service is None else service
        self.categories = [] if categories is None else categories
        if dom is not None:
            # NB `load_from_etree` calls `_reset()` before parsing, so the values set
            # above are discarded and replaced by whatever the XML element holds.
            self.load_from_etree(dom)

    def _reset(self):
        """Blank this instance of `SDCollection`"""
        self.title = None
        self.href = None
        self.accept = []
        self.accept_multipart = []
        # SWORD
        self.mediation = None
        self.description = None
        self.treatment = None
        self.collectionPolicy = None
        self.acceptPackaging = []
        # Kept as a list, matching the constructor default and the documented type.
        self.service = []
        self.categories = []

    def load_from_etree(self, collection):
        """
        Parse an `etree.SubElement` into attributes in this object.

        The instance is blanked first (see `_reset`), so nothing from a previous
        load or from the constructor arguments survives.

        Also, caches the most recently used DOM object it is passed in
        `self.dom`
        """
        self._reset()
        self.dom = collection
        self.title = get_text(collection, NS['atom'] % 'title')
        # MUST have href attribute
        self.href = collection.attrib.get('href', None)
        # Accept and Accept multipart
        for accept in collection.findall(NS['app'] % 'accept'):
            if accept.attrib.get("alternate", None) == "multipart-related":
                self.accept_multipart.append(accept.text)
            else:
                self.accept.append(accept.text)
        # Categories
        for category_element in collection.findall(NS['atom'] % 'category'):
            self.categories.append(Category(dom=category_element))
        # SWORD extensions:
        self.collectionPolicy = get_text(collection, NS['sword'] % 'collectionPolicy')

        # Mediation: True/False
        # NOTE(review): `get_text` presumably returns None when the element is absent;
        # a missing <sword:mediation> is then treated as "not supported" instead of
        # raising AttributeError on `None.lower()`.
        mediation = get_text(collection, NS['sword'] % 'mediation')
        self.mediation = mediation is not None and mediation.lower() == "true"

        self.treatment = get_text(collection, NS['sword'] % 'treatment')
        self.description = get_text(collection, NS['dcterms'] % 'abstract')
        self.service = get_text(collection, NS['sword'] % 'service', plural = True)
        self.acceptPackaging = get_text(collection, NS['sword'] % 'acceptPackaging', plural = True)

        # Log collection details:
        coll_l.debug(str(self))

    def __str__(self):
        """Provides a simple display of the pertinent information in this object suitable for CLI logging."""
        _s = ["Collection: '%s' @ '%s'. Accept:%s" % (self.title, self.href, self.accept)]
        # Only attributes holding a truthy value get a line of their own.
        if self.description:
            _s.append("SWORD: Description - '%s'" % self.description)
        if self.collectionPolicy:
            _s.append("SWORD: Collection Policy - '%s'" % self.collectionPolicy)
        if self.mediation:
            _s.append("SWORD: Mediation? - '%s'" % self.mediation)
        if self.treatment:
            _s.append("SWORD: Treatment - '%s'" % self.treatment)
        if self.acceptPackaging:
            _s.append("SWORD: Accept Packaging: '%s'" % self.acceptPackaging)
        if self.service:
            _s.append("SWORD: Nested Service Documents - '%s'" % self.service)
        for c in self.categories:
            _s.append(str(c))
        return "\n".join(_s)

    def __repr__(self):
        """Provides the atom.title of the collection as part of the repr reply"""
        return "<sword2.SDCollection - title: %s>" % self.title

    def to_json(self):
        """Provides a simple means to turn the important parsed information into a simple JSON-encoded form.

        NB this uses the attributes of the object, not the cached DOM object, so information can be altered/added
        on the fly.

        Returns the JSON text, or `None` (after logging an error) when `compatible_libs`
        exposes a falsy `json`."""
        from compatible_libs import json
        if json:
            # NOTE(review): `categories` holds `Category` objects - confirm that the
            # json library in use can serialise them when the list is non-empty.
            _j = {'title':self.title,
                  'href':self.href,
                  'description':self.description,
                  'accept':self.accept,
                  'accept_multipart':self.accept_multipart,
                  'mediation':self.mediation,
                  'treatment':self.treatment,
                  'collectionPolicy':self.collectionPolicy,
                  'acceptPackaging':self.acceptPackaging,
                  'service':self.service,
                  'categories':self.categories}
            return json.dumps(_j)
        else:
            coll_l.error("Could not return information about Collection '%s' as JSON" % self.title)
            return
|
marco@16
|
246
|
marco@16
|
class Collection_Feed(object):
    """Placeholder for collection feed support - not implemented yet.

    For now this only records what it is constructed with:

        feed_iri -- stored as `self.feed_iri`
        feed_xml -- stored as `self.feed_xml`
        h        -- the `http_client` argument
        _cached  -- an empty list, created fresh for every instance
    """
    def __init__(self, feed_iri=None, http_client=None, feed_xml=None):
        # Where the feed is, and how it would be reached
        self.feed_iri = feed_iri
        self.h = http_client
        # The feed text as supplied, plus an (initially empty) cache
        self.feed_xml = feed_xml
        self._cached = list()
|
marco@16
|
254
|
marco@16
|
class Sword_Statement(object):
    """Beginning SWORD2 Sword Statement support.

    The aim is for the sword statements to be available through attributes on this object.

    In the meantime, please use the low-level `self.feed` for access to an etree.Element containing the
    parsed form of the `xml_document` it is passed.

    NB if `self.parsed` is not `True`, then there has been a problem parsing the xml document so check the original text,
    cached in `self.xml_document`. In that case `self.feed` is `None` and `self.categories` / `self.entries` stay empty.
    """
    def __init__(self, xml_document):
        """Parse `xml_document` and, if that succeeds, collect its categories and entries.

        A parsing failure is logged rather than raised; callers are expected to
        check `self.parsed`.
        """
        self.xml_document = xml_document
        self.parsed = False
        # Defined up front so the attribute exists even when parsing fails.
        self.feed = None
        self.first = None
        self.next = None
        self.previous = None
        self.last = None
        self.categories = []
        self.entries = []
        try:
            coll_l.info("Attempting to parse the Feed XML document")
            self.feed = etree.fromstring(xml_document)
            self.parsed = True
        except Exception as e:
            # Deliberately broad: `etree` comes from `compatible_libs`, so the
            # concrete parse-error class depends on which library was picked.
            coll_l.error("Failed to parse document - %s" % e)
            # Guard the slice so a `None`/empty document cannot raise from inside
            # the error handler itself.
            snippet = xml_document[:300] if xml_document else ""
            coll_l.error("XML document begins:\n %s" % snippet)
        if self.parsed:
            # Without a parsed feed there is nothing to walk.
            self.enumerate_feed()

    def enumerate_feed(self):
        """Fill `self.categories` and `self.entries` from the parsed feed.

        Does nothing when no feed has been parsed (`self.feed` is `None`).
        """
        if self.feed is None:
            return
        # Handle Categories
        for cate in self.feed.findall(NS['atom'] % 'category'):
            self.categories.append(Category(dom = cate))
        # handle entries - each one is compatible with a Deposit receipt, so using that
        for entry in self.feed.findall(NS['atom'] % 'entry'):
            self.entries.append(Deposit_Receipt(dom=entry))
        # TODO handle multipage first/last pagination
|
marco@16
|
292
|
marco@16
|
293
|