marco@0
|
1 #!/usr/bin/env python
|
marco@1
|
2
|
marco@12
|
3 """
|
marco@12
|
4
|
marco@15
|
5 SWORD2 DSpace bulk uploader - v0.6
|
marco@1
|
6
|
marco@1
|
7 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
|
marco@15
|
8 Built on the SWORD2 python client library: https://github.com/swordapp/python-client-sword2.
|
marco@1
|
9
|
marco@1
|
10 Dependencies:
|
marco@1
|
11
|
marco@1
|
12 - python 2.X
|
marco@1
|
13
|
marco@14
|
14 - sword2 library: https://github.com/swordapp/python-client-sword2
|
marco@1
|
15
|
marco@1
|
16 -----------------------------------
|
marco@15
|
17 Updates log:
|
marco@15
|
18
|
marco@15
|
19 v0.6: - now uploading a directory will also maintain the path structure
|
marco@15
|
20 - introduced a file where to specify the server (server.cfg)
|
marco@15
|
21 v0.5: changed the default server to C4DM live server
|
marco@15
|
22
|
marco@15
|
23 -----------------------------------
|
marco@11
|
24 Centre for Digital Music, Queen Mary, University of London
|
marco@11
|
25 Copyright (c) 2012 Marco Fabiani
|
marco@11
|
26
|
marco@11
|
27 Permission is hereby granted, free of charge, to any person
|
marco@11
|
28 obtaining a copy of this software and associated documentation
|
marco@11
|
29 files (the "Software"), to deal in the Software without
|
marco@11
|
30 restriction, including without limitation the rights to use, copy,
|
marco@11
|
31 modify, merge, publish, distribute, sublicense, and/or sell copies
|
marco@11
|
32 of the Software, and to permit persons to whom the Software is
|
marco@11
|
33 furnished to do so, subject to the following conditions:
|
marco@11
|
34
|
marco@11
|
35 The above copyright notice and this permission notice shall be
|
marco@11
|
36 included in all copies or substantial portions of the Software.
|
stephen@20
|
37
|
marco@11
|
38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
marco@11
|
39 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
marco@11
|
40 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
marco@11
|
41 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
marco@11
|
42 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
marco@11
|
43 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
marco@11
|
44 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
marco@11
|
45 OTHER DEALINGS IN THE SOFTWARE.
|
marco@1
|
46 -----------------------------------
|
marco@12
|
47
|
marco@11
|
48 A copy of this License can also be found in the COPYING file distributed with the source code.
|
marco@1
|
49 """
|
marco@0
|
50
|
stephen@20
|
51 import argparse, getpass, zipfile, os, sword2.http_layer
|
marco@0
|
52 from sword2 import *
|
marco@0
|
53
|
marco@0
|
# Parse arguments
parser = argparse.ArgumentParser(description="Bulk upload to DSpace using SWORD v2.",epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
parser.add_argument("data", type=str, nargs=1,
                    help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument("--username", dest="user_name", type=str, nargs=1, help="DSpace username.")
parser.add_argument("--password", dest="password", type=str, nargs=1, help="DSpace password.")
parser.add_argument("--timeout", dest="timeout", type=float, nargs=1, default=[30.0], help="Timeout for response for connections. Make sure this is long enough to allow files to be uploaded.")
parser.add_argument("--title", dest="title", type=str,nargs=1, help="Title (ignored for METS packages).")
parser.add_argument("--author", dest="author", type=str, nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument("--date", dest="date", type=str, nargs=1, help="Date of creation (string) (ignored for METS packages).")
parser.add_argument("--zip", action="store_true", dest="zip", default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
parser.add_argument("--servicedoc", dest="sd", type=str, nargs=1, help="Url of the SWORD v2 service document (default: use server.cfg if available, otherwise http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument")

args = parser.parse_args()
data = args.data[0]
timeout = args.timeout[0]
# action="store_true" already yields a bool, so no branching is needed.
storeZip = args.zip

# Service document used when neither --servicedoc nor server.cfg provides one.
DEFAULT_SERVICE_DOCUMENT = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"

if args.sd is None:
    # Only the first line of server.cfg is used. It is stripped so that the
    # trailing newline does not end up inside the URL.
    try:
        with open("server.cfg", "r") as f:
            sd = f.readline().strip()
    except IOError:
        # Missing or unreadable configuration file: use the default server.
        sd = ""
    if sd:
        print("server.cfg:  %s" % (sd,))
    else:
        sd = DEFAULT_SERVICE_DOCUMENT
else:
    sd = args.sd[0]
|
marco@0
|
84
|
stephen@20
|
class swordConnection(object):
    """Authenticated session with a SWORD v2 server.

    Attributes:
        serverConnection: the underlying sword2 ``Connection`` (None until
            ``connect`` has been called).
        connected: True once a service document has been retrieved.
        name: first element of the first workspace entry, set by ``connect``.
    """

    def __init__(self):
        self.serverConnection = None
        self.connected = False
        self.name = ""

    def connect(self, timeout=30.0):
        """Log in and fetch the service document, allowing three attempts.

        Credentials come from the module-level ``args`` when given on the
        command line, otherwise they are asked for interactively. The
        service document URL is the module-level ``sd``.

        Args:
            timeout: HTTP timeout, in seconds, passed to the HTTP layer.

        Raises:
            Exception: if the request for the service document fails, or if
                no service document was obtained after three attempts.
        """
        self.serverConnection = None
        self.connected = False
        httpImp = sword2.http_layer.HttpLib2Layer(".cache", timeout=timeout)
        print("Connection timeout is  %s seconds." % (timeout,))
        # Connect to SWORD server: it will always try to authenticate
        # (no anonymous submissions!)
        attempts = 3  # Number of attempts left to connect to server
        while attempts > 0 and not self.connected:
            print("Connecting to SWORD server. Remaining attempts:  %s" % (attempts,))
            # Get username and password
            if args.user_name is None:
                user_name = raw_input("Username: ")
            else:
                user_name = args.user_name[0]
                print("Username:  %s" % (user_name,))

            if args.password is None:
                user_pass = getpass.getpass("Password:")
            else:
                user_pass = args.password[0]

            # Connect to the server
            self.serverConnection = Connection(sd, user_name=user_name, user_pass=user_pass,
                                               keep_history=False, http_impl=httpImp)

            # Get service document
            try:
                self.serverConnection.get_service_document()
            except Exception:
                # Server error. KeyboardInterrupt/SystemExit are deliberately
                # not caught here so that a user abort is not reported as an
                # unreachable server.
                print("Server unreachable!")
                break

            if self.serverConnection.sd is not None:
                self.connected = True
            else:
                attempts -= 1
                print("Incorrect username and/or password")

        if not self.connected:
            # Failed to connect to SWORD v2 Server
            print("Couldn't connect to the server.")
            if attempts == 0:
                raise Exception("Invalid credentials entered 3 times.")
            raise Exception("Unable to connect to server")

        self.name = self.serverConnection.workspaces[0][0]

    def selectCollection(self):
        """List the collections of the first workspace and ask for one.

        Returns:
            A ``swordCollection`` wrapping the collection the user selected.

        Raises:
            Exception: if the workspace lists no collections (the selection
                prompt could otherwise never be satisfied).
        """
        collections = self.serverConnection.workspaces[0][1]
        numColl = len(collections)

        # List available collections
        print("Available Collections: ")
        for index, coll in enumerate(collections):
            print("%s : %s" % (index + 1, coll.title))

        if numColl == 0:
            raise Exception("No collections available for submission.")

        # Select a collection to deposit into: keep asking until the answer
        # is a number between 1 and numColl.
        sel = "0"
        while (not sel.isdigit()) or int(sel) <= 0 or int(sel) > numColl:
            sel = raw_input("Select a Collection to submit your files into: ")
        return swordCollection(self, collections[int(sel) - 1])
|
stephen@20
|
153
|
stephen@20
|
154
|
stephen@20
|
class swordCollection(object):
    """A collection on the server that new items can be deposited into."""

    def __init__(self, connection, collection):
        # connection: the swordConnection this collection was listed by.
        # collection: the collection object taken from the service document.
        self.connection = connection
        self.serverCollection = collection

    def title(self):
        """Return the title of the wrapped server collection."""
        return self.serverCollection.title

    def createItem(self, metadata_entry, in_progress=True):
        """Create an item from a metadata entry only (no payload).

        Returns:
            A ``swordItem`` holding the receipt returned by the server.
        """
        creationReceipt = self.connection.serverConnection.create(
            col_iri=self.serverCollection.href,
            metadata_entry=metadata_entry,
            in_progress=in_progress)
        return swordItem(self.connection, self, creationReceipt)

    def createItemFromFile(self, file, metadata_entry, in_progress=True):
        """Create an item by depositing a packaged file.

        Args:
            file: object exposing ``path``, ``filename``, ``mimetype`` and
                ``packaging`` (see ``swordFile``).
            metadata_entry: accepted for interface compatibility; it is not
                sent with the deposit.
            in_progress: forwarded to the server's ``create`` call.

        Returns:
            A ``swordItem``; its ``receipt`` is None when the deposit failed.
        """
        # Last segment of the packaging URI, e.g. "SimpleZip" or
        # "METSDSpaceSIP", used only for the messages below.
        packageName = str(file.packaging or "").rsplit("/", 1)[-1]

        depositReceipt = None
        payload = open(file.path, "rb")
        try:
            depositReceipt = self.connection.serverConnection.create(
                col_iri=self.serverCollection.href,
                payload=payload,
                filename=file.filename,
                mimetype=file.mimetype,
                packaging=file.packaging,
                in_progress=in_progress)
            print("%s  submission successful." % (packageName,))
        except Exception:
            print("Error! Couldn't submit the file!")
            if packageName == "METSDSpaceSIP":  # Just guessing: not sure this is the problem...
                print("To submit a METS package, the collection MUST have a workflow!")
        finally:
            payload.close()

        return swordItem(self.connection, self, depositReceipt)
|
stephen@20
|
185
|
stephen@20
|
class swordItem(object):
    """An item on the server, identified by the receipt of its creation."""

    def __init__(self, connection, collection, receipt):
        # connection: the swordConnection used to talk to the server.
        # collection: the swordCollection the item was created in.
        # receipt: receipt returned when the item was created (may be None).
        self.connection = connection
        self.serverCollection = collection
        self.receipt = receipt

    def addFile(self, file):
        """Upload one file to this item and store the receipt on *file*.

        Args:
            file: object exposing ``path``, ``filename``, ``mimetype`` and
                ``packaging``; its ``deposit_receipt`` attribute is set to
                the value returned by the server call.
        """
        import sys  # local import: only used for the same-line progress text

        payload = open(file.path, "rb")
        try:
            # Progress message is completed by "[uploaded]" on the same line.
            sys.stdout.write("Uploading file  %s " % (file.filename,))
            sys.stdout.flush()
            file.deposit_receipt = self.connection.serverConnection.add_file_to_resource(
                self.receipt.edit_media,
                payload=payload,
                filename=file.filename,
                mimetype=file.mimetype,
                packaging=file.packaging)
        finally:
            # Always release the file handle, even when the upload fails.
            payload.close()
        print("[uploaded]")

    def updateMetadata(self, metadataEntry, in_progress=True):
        """Send *metadataEntry* as the new metadata of this item.

        Any error raised by the update call is reported and re-raised.
        """
        try:
            self.connection.serverConnection.update(dr=self.receipt,
                                                    metadata_entry=metadataEntry,
                                                    in_progress=in_progress)
            print("Metadata update successful.")
        except Exception:
            print("Server error")
            raise
|
stephen@20
|
212
|
stephen@20
|
# Class to encapsulate a SWORD2 payload file
class swordFile(object):
    """Description of one file to upload.

    Attributes:
        path: location of the file on disk.
        filename: name sent to the server; defaults to the base name of
            ``path``.
        mimetype: MIME type sent with the upload (generic binary by default).
        packaging: SWORD packaging URI (plain binary by default).
        deposit_receipt: set once the file has been uploaded, None before.
    """

    def __init__(self, path, filename=None):
        self.path = path
        self.deposit_receipt = None
        self.filename = os.path.basename(path) if filename is None else filename
        # Default to a basic binary file. The registered MIME type uses a
        # hyphen ("octet-stream"), not a plus sign.
        self.mimetype = "application/octet-stream"
        self.packaging = 'http://purl.org/net/sword/package/Binary'

    def __str__(self):
        return "path:%s, filename:%s, mimetype:%s, packaging:%s" % (
            self.path, self.filename, self.mimetype, self.packaging)
|
stephen@20
|
228
|
stephen@20
|
def getSubmissionData(args, data):
    """Work out what is being submitted and which files have to be sent.

    Args:
        args: parsed command-line options; ``args.zip`` and ``args.title``
            are read.
        data: path of a directory, a zip archive or a single file.

    Returns:
        A dict with the keys:
            "files": list of paths to upload,
            "packaging": SWORD packaging URI, or None for plain files,
            "type": one of "SimpleZip", "multiple files", "METS", "BAGIT",
                "single file",
            "isTemporaryFile": True when the only entry of "files" is a zip
                archive created by this function.

    Raises:
        Exception: if *data* is neither a directory nor a file.
    """
    filesList = []
    temp = False  # Delete temp files
    packaging = None

    if os.path.isdir(data):
        if args.zip:
            # Zip all the files and maintain the structure, starting from
            # the base directory only.
            if args.title is not None:
                zipName = args.title[0]
            else:
                zipName = os.path.basename(os.path.normpath(data))
            zipFile = zipName.replace(" ", "_") + ".zip"
            zipFileAbs = os.path.abspath(zipFile)

            myZip = zipfile.ZipFile(zipFile, "w")
            try:
                print("Creating a zip archive for submission...")
                for root, dirs, files in os.walk(data):
                    for name in files:
                        if name.startswith('.'):
                            continue  # Do not upload hidden files, OSX/linux
                        fullPath = os.path.join(root, name)
                        if os.path.abspath(fullPath) == zipFileAbs:
                            continue  # never add the archive to itself
                        # Remove spaces and square brackets
                        arcName = (os.path.relpath(fullPath, data)
                                   .replace(" ", "_").replace("[", "(").replace("]", ")"))
                        myZip.write(fullPath, arcName)
            finally:
                myZip.close()
            filesList.append(zipFile)
            packaging = "http://purl.org/net/sword/package/SimpleZip"
            submissionType = "SimpleZip"
            temp = True
        else:
            # Create a list of files to upload
            for root, dirs, files in os.walk(data):
                for name in files:
                    if not name.startswith('.'):
                        filesList.append(os.path.join(root, name))
            submissionType = "multiple files"
    elif zipfile.is_zipfile(data):
        # This is a zip file
        filesList.append(data)
        myZip = zipfile.ZipFile(data)
        try:
            names = myZip.namelist()
        finally:
            myZip.close()

        if "mets.xml" in names:
            # This is a METS package
            packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
            submissionType = "METS"
        elif any(name.rsplit("/", 1)[-1] == "bagit.txt" for name in names):
            # This is a BagIt package: bagit.txt may sit in any folder of
            # the archive, but the member must be named exactly "bagit.txt".
            packaging = "http://purl.org/net/sword/package/BagIt"
            submissionType = "BAGIT"
        else:
            # This is a simple zip file
            packaging = "http://purl.org/net/sword/package/SimpleZip"
            submissionType = "SimpleZip"
    elif os.path.isfile(data):  # This is a single file
        filesList.append(data)
        submissionType = "single file"
    else:
        raise Exception("Couldn't find the data.")

    return {"files": filesList, "packaging": packaging,
            "type": submissionType, "isTemporaryFile": temp}
|
stephen@20
|
289
|
stephen@20
|
def setupMetadataEntry(args):
    """Build the metadata entry described by the command-line options.

    Args:
        args: parsed options; ``args.title``, ``args.author`` and
            ``args.date`` are read.

    Returns:
        An ``Entry`` carrying the given title, creators and creation date,
        or None when none of the three options was supplied.
    """
    title, authors, created = args.title, args.author, args.date
    if title is None and authors is None and created is None:
        return None

    entry = Entry()
    if title is not None:
        entry.add_fields(dcterms_title=title[0])
    # One dcterms_creator field per author given on the command line.
    for creator in (authors if authors is not None else ()):
        entry.add_fields(dcterms_creator=creator)
    if created is not None:
        entry.add_fields(dcterms_created=created[0])
    return entry
|
stephen@20
|
304
|
stephen@20
|
def _commonDirectory(paths):
    """Return the deepest directory shared by all *paths*.

    os.path.commonprefix compares character by character, so its result may
    end in the middle of a file name (or be the file itself when there is a
    single path); taking the dirname cuts it back to a real directory.
    """
    return os.path.dirname(os.path.commonprefix(paths)) or os.curdir


# Main script: connect, pick a collection, then upload the data either file
# by file or as a single package.
try:
    serverConnection = swordConnection()
    serverConnection.connect(timeout)
    print("------------------------")
    print("Welcome to the %s repository" % (serverConnection.name,))

    collectionForItem = serverConnection.selectCollection()
    print("Selected Collection: %s" % (collectionForItem.title(),))

    submissionData = getSubmissionData(args, data)
    submissionType = submissionData["type"]

    print("------------------------")
    print("This is a %s submission" % (submissionType,))

    metadataEntry = setupMetadataEntry(args)

    # Select what to do
    if submissionType in ("single file", "multiple files"):  # Use the single file upload procedure
        try:
            # Create the metadata entry with ATOM
            print("------------------------")
            print("Creating the %s item... " % (submissionType,))
            if metadataEntry is None:
                metadataEntry = Entry(dcterms_title=(os.path.basename(data)))
            collectionItem = collectionForItem.createItem(metadata_entry=metadataEntry, in_progress=True)
            print("Item created")

            # Create a list of files to upload
            if submissionType == "single file":
                payLoadList = [swordFile(submissionData["files"][0])]
            else:
                # Send each file name relative to the deepest common
                # directory in order to keep the structure.
                common = _commonDirectory(submissionData["files"])
                payLoadList = [swordFile(f, os.path.relpath(f, common))
                               for f in submissionData["files"]]

            # Upload the files
            for payload in payLoadList:
                collectionItem.addFile(payload)
        except HTTPResponseError:
            print("Bad request")
    else:
        # Send the zip file and let the ingester do its job.
        # SimpleZip and BagIt submissions stay open; other packages (METS)
        # are closed on deposit.
        # FIXME: we don't want to write silly things in dc.description!
        in_progress = submissionType in ("SimpleZip", "BAGIT")

        payload = swordFile(submissionData["files"][0])
        payload.mimetype = "application/zip"
        payload.packaging = submissionData["packaging"]
        try:
            item = collectionForItem.createItemFromFile(payload, metadataEntry, in_progress=in_progress)

            # If some of the additional arguments for author, title, date
            # etc. have been specified, update the metadata (only SimpleZip).
            # Without a receipt there is no item on the server to update.
            if submissionType == "SimpleZip" and item.receipt is not None:
                if metadataEntry is None:
                    metadataEntry = Entry(dcterms_title=(os.path.basename(submissionData["files"][0])))

                # in_progress is True: we don't want to close the submission
                item.updateMetadata(metadataEntry, in_progress=True)

                # If we want to store the zip file along with the individual files (Only SimpleZip)
                if storeZip:
                    try:
                        zipPayload = swordFile(submissionData["files"][0], os.path.basename(submissionData["files"][0]).replace(" ", "_"))
                        zipPayload.mimetype = "application/zip"
                        zipPayload.packaging = 'http://purl.org/net/sword/package/Binary'
                        item.addFile(zipPayload)
                        print("Zip file successfully added to the bitstreams.")
                    except Exception:
                        print("Server error: could not add the zip file to the resources")
        finally:
            # Remove the archive created for this submission, also when the
            # upload failed or was interrupted.
            if submissionData["isTemporaryFile"]:
                os.remove(submissionData["files"][0])

    print("------------------------")
    print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

except KeyboardInterrupt:
    print("------------------------")
    print("\nSubmission aborted by user.")