annotate sworduploader.py @ 22:d1752c7031e4 timeouts tip

Updated .hgignore to ignore sword2_logging.conf and anything in .cache
author Steve Welburn <stephen.welburn@eecs.qmul.ac.uk>
date Tue, 22 Jan 2013 14:43:42 +0000
parents 3fb1ac952fb2
children
rev   line source
marco@0 1 #!/usr/bin/env python
marco@1 2
marco@12 3 """
marco@12 4
marco@15 5 SWORD2 DSpace bulk uploader - v0.6
marco@1 6
marco@1 7 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
marco@15 8 Built on the SWORD2 python client library: https://github.com/swordapp/python-client-sword2.
marco@1 9
marco@1 10 Dependencies:
marco@1 11
marco@1 12 - python 2.X
marco@1 13
marco@14 14 - sword2 library: https://github.com/swordapp/python-client-sword2
marco@1 15
marco@1 16 -----------------------------------
marco@15 17 Updates log:
marco@15 18
marco@15 19 v0.6: - now uploading a directory will also maintain the path structure
marco@15 20 - introduced a file where to specify the server (server.cfg)
marco@15 21 v0.5: changed the default server to C4DM live server
marco@15 22
marco@15 23 -----------------------------------
marco@11 24 Centre for Digital Music, Queen Mary, University of London
marco@11 25 Copyright (c) 2012 Marco Fabiani
marco@11 26
marco@11 27 Permission is hereby granted, free of charge, to any person
marco@11 28 obtaining a copy of this software and associated documentation
marco@11 29 files (the "Software"), to deal in the Software without
marco@11 30 restriction, including without limitation the rights to use, copy,
marco@11 31 modify, merge, publish, distribute, sublicense, and/or sell copies
marco@11 32 of the Software, and to permit persons to whom the Software is
marco@11 33 furnished to do so, subject to the following conditions:
marco@11 34
marco@11 35 The above copyright notice and this permission notice shall be
marco@11 36 included in all copies or substantial portions of the Software.
stephen@20 37
marco@11 38 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
marco@11 39 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
marco@11 40 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
marco@11 41 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
marco@11 42 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
marco@11 43 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
marco@11 44 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
marco@11 45 OTHER DEALINGS IN THE SOFTWARE.
marco@1 46 -----------------------------------
marco@12 47
marco@11 48 A copy of this License can also be found in the COPYING file distributed with the source code.
marco@1 49 """
marco@0 50
stephen@20 51 import argparse, getpass, zipfile, os, sword2.http_layer
marco@0 52 from sword2 import *
marco@0 53
# Parse the command line. Every option declared with nargs=1 is delivered by
# argparse as a one-element list, hence the [0] indexing further down.
parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORD v2.",
    epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
parser.add_argument("data", type=str, nargs=1,
                    help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument("--username", dest="user_name", type=str, nargs=1, help="DSpace username.")
parser.add_argument("--password", dest="password", type=str, nargs=1, help="DSpace password.")
parser.add_argument("--timeout", dest="timeout", type=float, nargs=1, default=[30.0], help="Timeout for response for connections. Make sure this is long enough to allow files to be uploaded.")
parser.add_argument("--title", dest="title", type=str, nargs=1, help="Title (ignored for METS packages).")
parser.add_argument("--author", dest="author", type=str, nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument("--date", dest="date", type=str, nargs=1, help="Date of creation (string) (ignored for METS packages).")
parser.add_argument("--zip", action="store_true", dest="zip", default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
parser.add_argument("--servicedoc", dest="sd", type=str, nargs=1, help="Url of the SWORD v2 service document (default: use server.cfg if available, otherwise http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument")

args = parser.parse_args()
data = args.data[0]
timeout = args.timeout[0]
# --zip is a store_true flag, so it is already a bool.
storeZip = args.zip

# Service document URL, in order of precedence:
#   1. --servicedoc on the command line
#   2. first line of server.cfg in the current directory
#   3. the built-in C4DM default
_DEFAULT_SERVICE_DOCUMENT = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"
if args.sd is None:
    sd = _DEFAULT_SERVICE_DOCUMENT
    try:
        # 'with' closes the file; strip() drops the trailing newline that
        # readline() keeps and that would otherwise end up inside the URL.
        with open("server.cfg", "r") as f:
            configured = f.readline().strip()
    except IOError:
        # A missing or unreadable server.cfg simply means "use the default".
        configured = ""
    if configured:
        sd = configured
        print("server.cfg:  %s" % sd)
else:
    sd = args.sd[0]
marco@0 84
class swordConnection(object):
    """Authenticated session with a SWORD v2 server.

    Attributes:
        serverConnection: the underlying sword2 ``Connection`` (None until
            connect() has been called).
        connected: True once a service document has been retrieved.
        name: first element of the first workspace entry, shown to the user
            as the repository name.
    """

    def __init__(self):
        self.serverConnection = None
        self.connected = False
        self.name = ""

    def connect(self, timeout=30.0):
        """Log in and fetch the service document, allowing up to 3 attempts.

        Credentials come from the module-level ``args`` (--username and
        --password) when given, otherwise the user is prompted. The service
        document URL is the module-level ``sd``.

        Args:
            timeout: value handed to the HTTP layer as its timeout; the
                progress message reports it in seconds.

        Raises:
            Exception: if the server could not be contacted, or if no service
                document was obtained after three attempts (treated as
                invalid credentials).
        """
        self.serverConnection = None
        self.connected = False
        httpImp = sword2.http_layer.HttpLib2Layer(".cache", timeout=timeout)
        print("Connection timeout is  %s seconds." % timeout)
        # The client always authenticates (no anonymous submissions).
        attempts = 3  # Number of attempts left to connect to server
        while attempts > 0 and not self.connected:
            print("Connecting to SWORD server. Remaining attempts:  %s" % attempts)
            # Get username and password
            if args.user_name is None:
                user_name = raw_input("Username: ")
            else:
                user_name = args.user_name[0]
                print("Username:  %s" % user_name)

            if args.password is None:
                user_pass = getpass.getpass("Password:")
            else:
                user_pass = args.password[0]

            # Connect to the server
            self.serverConnection = Connection(sd, user_name=user_name, user_pass=user_pass,
                                               keep_history=False, http_impl=httpImp)

            # Get service document
            try:
                self.serverConnection.get_service_document()
            except Exception:
                # Deliberately not a bare except: Ctrl-C must reach the
                # script's KeyboardInterrupt handler instead of being
                # reported as an unreachable server.
                print("Server unreachable!")
                break

            if self.serverConnection.sd is not None:
                self.connected = True
            else:
                # No service document came back: treated as a login failure.
                attempts -= 1
                print("Incorrect username and/or password")

        if not self.connected:
            # Failed to connect to SWORD v2 Server
            print("Couldn't connect to the server.")
            if attempts == 0:
                raise Exception("Invalid credentials entered 3 times.")
            raise Exception("Unable to connect to server")

        self.name = self.serverConnection.workspaces[0][0]

    def selectCollection(self):
        """List the collections of the first workspace and let the user pick one.

        Returns:
            A swordCollection wrapping the chosen server collection.

        Raises:
            Exception: if the workspace exposes no collections (otherwise the
                selection prompt could never be satisfied).
        """
        collections = self.serverConnection.workspaces[0][1]
        numColl = len(collections)
        # List available collections
        print("Available Collections: ")
        for number, coll in enumerate(collections, 1):
            print("%s : %s" % (number, coll.title))
        if numColl == 0:
            raise Exception("No collections available for deposit.")

        # Keep asking until the answer is an integer in 1..numColl.
        sel = "0"
        while (not sel.isdigit()) or int(sel) <= 0 or int(sel) > numColl:
            sel = raw_input("Select a Collection to submit your files into: ")
        return swordCollection(self, collections[int(sel) - 1])
stephen@20 153
stephen@20 154
class swordCollection(object):
    """A deposit target: one collection reachable through a swordConnection.

    Attributes:
        connection: the swordConnection this collection was obtained from.
        serverCollection: the collection object from the service document;
            its ``title`` and ``href`` attributes are used here.
    """

    def __init__(self, connection, collection):
        self.connection = connection
        self.serverCollection = collection

    def title(self):
        """Return the title of the underlying server collection."""
        return self.serverCollection.title

    def createItem(self, metadata_entry, in_progress=True):
        """Create a metadata-only item in this collection.

        Args:
            metadata_entry: entry passed to the server as ``metadata_entry``.
            in_progress: forwarded to the server's ``create`` call.

        Returns:
            A swordItem wrapping the creation receipt.
        """
        creationReceipt = self.connection.serverConnection.create(
            col_iri=self.serverCollection.href,
            metadata_entry=metadata_entry,
            in_progress=in_progress)
        return swordItem(self.connection, self, creationReceipt)

    def createItemFromFile(self, file, metadata_entry, in_progress=True):
        """Create an item by depositing a package file.

        Args:
            file: object exposing ``path``, ``filename``, ``mimetype`` and
                ``packaging`` (e.g. a swordFile).
            metadata_entry: accepted for interface compatibility; it is not
                sent with the deposit.
            in_progress: forwarded to the server's ``create`` call.

        Returns:
            A swordItem wrapping the deposit receipt. The receipt is None
            when the deposit failed; the failure is reported on stdout.
        """
        deposit_receipt = None
        # Last path segment of the packaging URI, e.g. "SimpleZip",
        # "METSDSpaceSIP" or "BagIt"; used only for user-facing messages.
        package_name = (file.packaging or "").rsplit("/", 1)[-1]
        payload = open(file.path, "rb")
        try:
            deposit_receipt = self.connection.serverConnection.create(
                col_iri=self.serverCollection.href,
                payload=payload,
                filename=file.filename,
                mimetype=file.mimetype,
                packaging=file.packaging,
                in_progress=in_progress)
            print("%s  submission successful." % package_name)
        except Exception:
            print("Error! Couldn't submit the file!")
            if package_name == "METSDSpaceSIP":  # Just guessing: not sure this is the problem...
                print("To submit a METS package, the collection MUST have a workflow!")
        finally:
            payload.close()

        # Hand back the receipt that was actually obtained from the server.
        return swordItem(self.connection, self, deposit_receipt)
stephen@20 185
class swordItem(object):
    """An item on the server, identified by the receipt returned at creation.

    Attributes:
        connection: the swordConnection used to talk to the server.
        serverCollection: the collection object the item was created through
            (as passed to the constructor).
        receipt: receipt returned by the server when the item was created.
    """

    def __init__(self, connection, collection, receipt):
        self.connection = connection
        self.serverCollection = collection
        self.receipt = receipt

    def addFile(self, file):
        """Upload one file to this item.

        Args:
            file: object exposing ``path``, ``filename``, ``mimetype`` and
                ``packaging``; its ``deposit_receipt`` attribute is set to
                the server's response.

        Any exception raised by the upload propagates to the caller; the
        local file handle is closed either way.
        """
        import sys

        # Progress message and "[uploaded]" share one output line.
        sys.stdout.write("Uploading file  %s " % file.filename)
        sys.stdout.flush()
        with open(file.path, "rb") as payload:
            file.deposit_receipt = self.connection.serverConnection.add_file_to_resource(
                self.receipt.edit_media,
                payload=payload,
                filename=file.filename,
                mimetype=file.mimetype,
                packaging=file.packaging)
        print("[uploaded]")

    def updateMetadata(self, metadataEntry, in_progress=True):
        """Replace the item's metadata on the server.

        Args:
            metadataEntry: entry passed to the server as ``metadata_entry``.
            in_progress: forwarded to the server's ``update`` call.

        Raises:
            Whatever the underlying ``update`` call raises, after printing
            "Server error".
        """
        try:
            self.connection.serverConnection.update(dr=self.receipt,
                                                    metadata_entry=metadataEntry,
                                                    in_progress=in_progress)
        except Exception:
            print("Server error")
            raise
        else:
            print("Metadata update successful.")
stephen@20 212
# Class to encapsulate a SWORD2 payload file
class swordFile(object):
    """Description of one local file to be sent to the server.

    Attributes:
        path: location of the file on disk.
        filename: name to report to the server; defaults to the base name
            of ``path``.
        mimetype: MIME type sent with the file (generic binary by default).
        packaging: SWORD packaging URI (plain binary by default).
        deposit_receipt: set by the uploader once the file has been sent.
    """

    def __init__(self, path, filename=None):
        self.path = path
        self.deposit_receipt = None
        if filename is None:
            self.filename = os.path.basename(path)
        else:
            self.filename = filename
        # Default to a basic binary file. The registered media type is
        # "application/octet-stream" (hyphen, not "+").
        self.mimetype = "application/octet-stream"
        self.packaging = 'http://purl.org/net/sword/package/Binary'

    def __str__(self):
        return ("path:" + str(self.path)
                + ", filename:" + str(self.filename)
                + ", mimetype:" + str(self.mimetype)
                + ", packaging:" + str(self.packaging))
stephen@20 228
def getSubmissionData(args, data):
    """Work out what kind of submission ``data`` is and which files to send.

    Args:
        args: parsed command-line namespace; ``args.zip`` and ``args.title``
            are read.
        data: path to a directory, a zip archive or a single file.

    Returns:
        A dict with the keys:
            "files": list of file paths to upload,
            "packaging": SWORD packaging URI, or None for plain files,
            "type": one of "SimpleZip", "multiple files", "METS", "BAGIT",
                "single file",
            "isTemporaryFile": True when files[0] is a zip created here.

    Raises:
        Exception: if ``data`` is neither a directory nor a file.
    """
    filesList = []
    temp = False  # True when a zip was created here and should be deleted later
    packaging = None

    if os.path.isdir(data):
        if args.zip:
            # Zip all the files, keeping the structure relative to ``data``.
            dataName = os.path.basename(os.path.normpath(data))
            if args.title is not None:
                zipFile = args.title[0].replace(" ", "_") + ".zip"
            else:
                zipFile = dataName.replace(" ", "_") + ".zip"
            zipPath = os.path.abspath(zipFile)
            print("Creating a zip archive for submission...")
            with zipfile.ZipFile(zipFile, "w") as myZip:
                for root, _dirs, files in os.walk(data):
                    for name in files:
                        if name.startswith('.'):
                            # Do not upload hidden files, OSX/linux
                            continue
                        source = os.path.join(root, name)
                        if os.path.abspath(source) == zipPath:
                            # The archive is written to the current directory;
                            # never add it to itself.
                            continue
                        # Remove spaces and square brackets from member names
                        arcname = (os.path.relpath(source, data)
                                   .replace(" ", "_")
                                   .replace("[", "(")
                                   .replace("]", ")"))
                        myZip.write(source, arcname)
            filesList.append(zipFile)
            packaging = "http://purl.org/net/sword/package/SimpleZip"
            submissionType = "SimpleZip"
            temp = True
        else:
            # Create a list of files to upload
            for root, _dirs, files in os.walk(data):
                for name in files:
                    if not name.startswith('.'):
                        filesList.append(os.path.join(root, name))
            submissionType = "multiple files"
    elif zipfile.is_zipfile(data):
        # This is a zip file
        filesList.append(data)
        with zipfile.ZipFile(data) as myZip:
            names = myZip.namelist()
        if "mets.xml" in names:
            # This is a METS package
            packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
            submissionType = "METS"
        elif any(member.split("/")[-1] == "bagit.txt" for member in names):
            # This is a BagIt package: some member is actually named
            # bagit.txt (at any depth). Matching on whole member names avoids
            # false positives such as "notbagit.txt".
            packaging = "http://purl.org/net/sword/package/BagIt"
            submissionType = "BAGIT"
        else:
            # This is a simple zip file
            packaging = "http://purl.org/net/sword/package/SimpleZip"
            submissionType = "SimpleZip"
    elif os.path.isfile(data):  # This is a single file
        filesList.append(data)
        submissionType = "single file"
    else:
        raise Exception("Couldn't find the data.")

    return {"files": filesList, "packaging": packaging,
            "type": submissionType, "isTemporaryFile": temp}
stephen@20 289
def setupMetadataEntry(args):
    """Build a metadata Entry from --title / --author / --date.

    Args:
        args: parsed command-line namespace; ``title``, ``author`` and
            ``date`` are read (each is None when the option was not given).

    Returns:
        An Entry carrying the supplied fields, or None when none of the
        three options was given.
    """
    nothing_given = args.title is None and args.author is None and args.date is None
    if nothing_given:
        return None

    entry = Entry()
    if args.title is not None:
        entry.add_fields(dcterms_title=args.title[0])
    # One dcterms_creator field per author, in the order given.
    authors = () if args.author is None else args.author
    for creator in authors:
        entry.add_fields(dcterms_creator=creator)
    if args.date is not None:
        entry.add_fields(dcterms_created=args.date[0])
    return entry
stephen@20 304
# Main script: connect, pick a collection, then either upload plain files one
# by one or deposit a package (zip) and let the server-side ingester unpack it.
try:
    serverConnection = swordConnection()
    serverConnection.connect(timeout)
    print("------------------------")
    print("Welcome to the %s repository" % serverConnection.name)

    collectionForItem = serverConnection.selectCollection()
    print("Selected Collection: %s" % collectionForItem.title())

    submissionData = getSubmissionData(args, data)
    # Use the type computed for this submission. A bare ``type`` at module
    # level is the Python builtin and never equals any of these strings.
    submissionType = submissionData["type"]

    print("------------------------")
    print("This is a %s submission" % submissionType)

    metadataEntry = setupMetadataEntry(args)

    # Select what to do
    if submissionType in ("single file", "multiple files"):
        # Use the single file upload procedure
        try:
            # Create the metadata entry with ATOM
            print("------------------------")
            print("Creating the %s item... " % submissionType)
            if metadataEntry is None:
                metadataEntry = Entry(dcterms_title=(os.path.basename(data)))
            collectionItem = collectionForItem.createItem(metadata_entry=metadataEntry, in_progress=True)
            print("Item created")

            # Create a list of files to upload
            if submissionType == "single file":
                payLoadList = [swordFile(submissionData["files"][0])]
            else:
                # Send each file's path relative to the submitted directory so
                # the structure is kept. os.path.commonprefix is not used: it
                # compares character by character, so it can stop in the
                # middle of a directory name and equals the file itself when
                # there is only one file.
                payLoadList = [swordFile(f, os.path.relpath(f, data))
                               for f in submissionData["files"]]

            # Upload the files
            for payload in payLoadList:
                collectionItem.addFile(payload)
        except HTTPResponseError:
            print("Bad request")
    else:
        # Send the zip file and let the ingester do its job.
        # SimpleZip and BagIt deposits stay open; METS deposits are closed.
        # FIXME: we don't want to write silly things in dc.description!
        in_progress = submissionType in ("SimpleZip", "BAGIT")

        payload = swordFile(submissionData["files"][0])
        payload.mimetype = "application/zip"
        payload.packaging = submissionData["packaging"]
        # in_progress must land in the in_progress parameter, not in the
        # metadata_entry slot that precedes it.
        item = collectionForItem.createItemFromFile(payload, metadataEntry, in_progress)

        # If some of the additional arguments for author, title, date etc.
        # have been specified, update the metadata (only SimpleZip). Skipped
        # when the deposit produced no receipt; that failure has already been
        # reported by createItemFromFile.
        if submissionType == "SimpleZip" and item.receipt is not None:
            if metadataEntry is None:
                metadataEntry = Entry(dcterms_title=(os.path.basename(submissionData["files"][0])))

            # in_progress is True: we don't want to close the submission
            item.updateMetadata(metadataEntry, in_progress=True)

            # If we want to store the zip file along with the individual files (Only SimpleZip)
            if storeZip:
                try:
                    zipPayload = swordFile(submissionData["files"][0],
                                           os.path.basename(submissionData["files"][0]).replace(" ", "_"))
                    zipPayload.mimetype = "application/zip"
                    zipPayload.packaging = 'http://purl.org/net/sword/package/Binary'
                    item.addFile(zipPayload)
                    print("Zip file successfully added to the bitstreams.")
                except Exception:
                    # Best effort: the submission itself already exists.
                    print("Server error: could not add the zip file to the resources")

        # Remove the archive created by getSubmissionData, if any.
        if submissionData["isTemporaryFile"]:
            os.remove(submissionData["files"][0])

    print("------------------------")
    print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

except KeyboardInterrupt:
    print("------------------------")
    print("\nSubmission aborted by user.")