view sworduploader.py @ 20:8b9e7f2f80e2 timeouts

Updated to: (i) allow timeout and password as parameters (ii) use connection/collection/item/file objects
author Steve Welburn <stephen.welburn@eecs.qmul.ac.uk>
date Tue, 22 Jan 2013 13:41:24 +0000
parents e24aea2d14a4
children 3fb1ac952fb2
line wrap: on
line source
#!/usr/bin/env python

"""

SWORD2 DSpace bulk uploader - v0.6

A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
Built on the SWORD2 python client library: https://github.com/swordapp/python-client-sword2.

Dependencies:

- python 2.X

- sword2 library: https://github.com/swordapp/python-client-sword2 

-----------------------------------
Updates log:

v0.6: 	- now uploading a directory will also maintain the path structure
		- introduced a configuration file (server.cfg) in which to specify the server
v0.5: changed the default server to C4DM live server

-----------------------------------
  Centre for Digital Music, Queen Mary, University of London
  Copyright (c) 2012 Marco Fabiani

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation
  files (the "Software"), to deal in the Software without
  restriction, including without limitation the rights to use, copy,
  modify, merge, publish, distribute, sublicense, and/or sell copies
  of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.

  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  OTHER DEALINGS IN THE SOFTWARE.
-----------------------------------

A copy of this License can also be found in the COPYING file distributed with the source code.
"""

import argparse, getpass, zipfile, os, sword2.http_layer
from sword2 import *

# Parse the command line. Every option declared with nargs=1 is stored by
# argparse as a one-element list, which is why values are read with [0] below.
parser = argparse.ArgumentParser(
	description="Bulk upload to DSpace using SWORD v2.",
	epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
parser.add_argument("data", type=str, nargs=1,
	help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument("--username", dest="user_name", type=str, nargs=1, help="DSpace username.")
parser.add_argument("--password", dest="password", type=str, nargs=1, help="DSpace password.")
# The default must be a list as well: argparse stores it untouched, and the
# value is read back with [0] exactly like a user-supplied one.
parser.add_argument("--timeout", dest="timeout", type=float, nargs=1, default=[30.0], help="Timeout for response for connections. Make sure this is long enough to allow files to be uploaded.")
parser.add_argument("--title", dest="title", type=str, nargs=1, help="Title (ignored for METS packages).")
parser.add_argument("--author", dest="author", type=str, nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument("--date", dest="date", type=str, nargs=1, help="Date of creation (string) (ignored for METS packages).")
parser.add_argument("--zip", action="store_true", dest="zip", default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
parser.add_argument("--servicedoc", dest="sd", type=str, nargs=1, help="Url of the SWORD v2 service document (default: use server.cfg if available, otherwise http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument")

args = parser.parse_args()
data = args.data[0]
timeout = args.timeout[0]
storeZip = bool(args.zip)

# Service document URL, in order of precedence: --servicedoc, the first line
# of server.cfg in the current directory, then the built-in default.
if args.sd is None:
	sd = ""
	try:
		with open("server.cfg", "r") as cfgFile:
			# Strip the line terminator, otherwise it ends up inside the URL.
			sd = cfgFile.readline().strip()
		print("server.cfg:  %s" % (sd,))
	except IOError:
		# No readable server.cfg: fall through to the default below.
		pass
	if not sd:
		sd = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"
else:
	sd = args.sd[0]

class swordConnection(object):
	"""Login session with a SWORD2 server.

	Wraps a sword2 ``Connection``. The methods read the module-level ``args``
	(parsed command line) and ``sd`` (service document URL).
	"""

	def __init__(self):
		self.serverConnection = None  # sword2 Connection, created by connect()
		self.connected = False  # True once a service document has been retrieved
		self.name = ""  # workspaces[0][0] of the service document, set by connect()

	def connect(self, timeout=30.0):
		"""Authenticate against the server and fetch the service document.

		The user name and password come from the command line when given,
		otherwise they are asked for interactively. Up to three attempts are
		made while the server returns no service document.

		timeout -- timeout in seconds handed to the HTTP layer.

		Raises Exception if the server cannot be reached or if all three
		attempts fail.
		"""
		self.serverConnection = None
		self.connected = False
		httpImp = sword2.http_layer.HttpLib2Layer(".cache", timeout=timeout)
		print("Connection timeout is  %s seconds." % (timeout,))
		# Connect to SWORD server: it will always try to authenticate (no anonymous submissions!)
		attempts = 3  # Number of attempts left to connect to server
		while attempts > 0 and not self.connected:
			print("Connecting to SWORD server. Remaining attempts:  %s" % (attempts,))
			# Get username and password
			if args.user_name is None:
				user_name = raw_input("Username: ")
			else:
				user_name = args.user_name[0]
				print("Username:  %s" % (user_name,))

			if args.password is None:
				user_pass = getpass.getpass("Password:")
			else:
				user_pass = args.password[0]

			self.serverConnection = Connection(sd, user_name=user_name, user_pass=user_pass, keep_history=False, http_impl=httpImp)

			# Get service document
			try:
				self.serverConnection.get_service_document()
			except Exception:
				# Exception (not a bare except) so that Ctrl-C still aborts the script.
				print("Server unreachable!")
				break

			if self.serverConnection.sd is not None:
				self.connected = True
			else:
				# No service document came back: treated as a failed login.
				attempts -= 1
				print("Incorrect username and/or password")

		if not self.connected:
			# Failed to connect to SWORD v2 Server
			print("Couldn't connect to the server.")
			if attempts == 0:
				raise Exception("Invalid credentials entered 3 times.")
			raise Exception("Unable to connect to server")

		self.name = self.serverConnection.workspaces[0][0]

	def selectCollection(self):
		"""List the collections of the first workspace and let the user pick one.

		Keeps prompting until a valid 1-based number is entered and returns the
		choice wrapped in a swordCollection.

		Raises Exception if the workspace offers no collection, since no input
		could ever satisfy the prompt in that case.
		"""
		collections = self.serverConnection.workspaces[0][1]
		# List available collections
		print("Available Collections: ")
		if not collections:
			raise Exception("No collections available for deposit.")
		for number, coll in enumerate(collections, 1):
			print("%s : %s" % (number, coll.title))
		# Select a collection to deposit into
		sel = "0"
		while (not sel.isdigit()) or int(sel) <= 0 or int(sel) > len(collections):
			sel = raw_input("Select a Collection to submit your files into: ")
		return swordCollection(self, collections[int(sel) - 1])


class swordCollection(object):
	"""A collection on the server into which items can be deposited."""

	def __init__(self, connection, collection):
		self.connection = connection  # swordConnection used for all requests
		self.serverCollection = collection  # collection object taken from the service document

	def title(self):
		"""Return the title of the underlying server collection."""
		return self.serverCollection.title

	def createItem(self, metadata_entry, in_progress=True):
		"""Create an item from metadata only and return it as a swordItem.

		metadata_entry -- entry describing the item.
		in_progress    -- passed on to the server's create call.
		"""
		creationReceipt = self.connection.serverConnection.create(
			col_iri=self.serverCollection.href,
			metadata_entry=metadata_entry,
			in_progress=in_progress)
		return swordItem(self.connection, self, creationReceipt)

	def createItemFromFile(self, file, metadata_entry=None, in_progress=True):
		"""Deposit a packaged file as a new item and return it as a swordItem.

		file           -- object with path, filename, mimetype and packaging
		                  attributes (e.g. a swordFile).
		metadata_entry -- accepted for interface compatibility; this method
		                  does not send it to the server.
		in_progress    -- passed on to the server's create call.

		A failed deposit is reported on stdout rather than raised; the returned
		item then carries ``None`` as its receipt. Failure to open the file
		still raises.
		"""
		depositReceipt = None
		# Short label for messages, derived from the last segment of the packaging URI.
		packagingName = str(file.packaging).rstrip("/").rsplit("/", 1)[-1]
		label = {"METSDSpaceSIP": "METS", "BagIt": "BAGIT"}.get(packagingName, packagingName)
		payload = open(file.path, "rb")
		try:
			depositReceipt = self.connection.serverConnection.create(
				col_iri=self.serverCollection.href,
				payload=payload,
				filename=file.filename,
				mimetype=file.mimetype,
				packaging=file.packaging,
				in_progress=in_progress)
			print("%s  submission successful." % (label,))
		except Exception:
			print("Error! Couldn't submit the file!")
			if label == "METS":  # Just guessing: not sure this is the problem...
				print("To submit a METS package, the collection MUST have a workflow!")
		finally:
			payload.close()

		return swordItem(self.connection, self, depositReceipt)

class swordItem(object):
	"""An item on the server, addressed through its deposit receipt."""

	def __init__(self, connection, collection, receipt):
		self.connection = connection  # swordConnection used for all requests
		self.serverCollection = collection  # swordCollection the item was created in
		self.receipt = receipt  # receipt returned when the item was created

	def addFile(self, file):
		"""Upload one file into this item.

		file -- object with path, filename, mimetype and packaging attributes
		        (e.g. a swordFile); its deposit_receipt attribute is set to the
		        receipt returned by the server.

		Errors from opening the file or from the upload propagate to the
		caller; the file handle is closed either way.
		"""
		import sys

		payload = open(file.path, "rb")
		try:
			# Progress is written without a newline so that "[uploaded]" ends the same line.
			sys.stdout.write("Uploading file  %s " % (file.filename,))
			sys.stdout.flush()
			file.deposit_receipt = self.connection.serverConnection.add_file_to_resource(
				self.receipt.edit_media,
				payload=payload,
				filename=file.filename,
				mimetype=file.mimetype,
				packaging=file.packaging)
		finally:
			payload.close()
		print("[uploaded]")

	def updateMetadata(self, metadataEntry, in_progress=True):
		"""Replace the item's metadata with metadataEntry.

		Prints "Server error" and re-raises if the update fails.
		"""
		try:
			self.connection.serverConnection.update(dr=self.receipt, metadata_entry=metadataEntry, in_progress=in_progress)
		except Exception:
			print("Server error")
			raise
		print("Metadata update successful.")

# Class to encapsulate a SWORD2 payload file
class swordFile(object):
	"""Description of one file to be sent to the server.

	path     -- location of the file on disk.
	filename -- name to send to the server; defaults to the base name of path.

	mimetype and packaging start out describing a plain binary file and may be
	overwritten by the caller for packaged deposits.
	"""

	def __init__(self, path, filename=None):
		self.path = path
		self.deposit_receipt = None  # filled in once the file has been uploaded
		self.filename = os.path.basename(path) if filename is None else filename
		# Default to a basic binary file. The registered media type is spelled
		# with a hyphen ("octet-stream"), not a plus sign.
		self.mimetype = "application/octet-stream"
		self.packaging = 'http://purl.org/net/sword/package/Binary'

	def __str__(self):
		fields = (
			("path", self.path),
			("filename", self.filename),
			("mimetype", self.mimetype),
			("packaging", self.packaging),
		)
		return ", ".join("%s:%s" % (key, str(value)) for key, value in fields)

def _visibleFiles(directory):
	"""Return the paths of all files below directory whose name does not start with a dot."""
	found = []
	for root, dirs, files in os.walk(directory):
		for name in files:
			if not name.startswith('.'):  # Do not upload hidden files, OSX/linux
				found.append(os.path.join(root, name))
	return found


def _zipDirectory(directory, zipPath):
	"""Write the visible files of directory into a new archive at zipPath, keeping relative paths."""
	archivePath = os.path.abspath(zipPath)
	myZip = zipfile.ZipFile(zipPath, "w")
	try:
		for path in _visibleFiles(directory):
			if os.path.abspath(path) == archivePath:
				# The archive itself may live inside the directory being zipped.
				continue
			# Remove spaces and square brackets from the name stored in the archive
			arcname = os.path.relpath(path, directory)
			arcname = arcname.replace(" ", "_").replace("[", "(").replace("]", ")")
			myZip.write(path, arcname)
	finally:
		myZip.close()


def _classifyZip(path):
	"""Return (packaging URI, type label) for the zip archive at path."""
	myZip = zipfile.ZipFile(path)
	try:
		names = myZip.namelist()
	finally:
		myZip.close()
	if "mets.xml" in names:
		# This is a METS package
		return "http://purl.org/net/sword/package/METSDSpaceSIP", "METS"
	# Match the member's base name, so that e.g. "notes/mybagit.txt" does not
	# count as a bag declaration; archive members always use "/" separators.
	if any(name.rsplit("/", 1)[-1] == "bagit.txt" for name in names):
		# This is a BagIt package
		return "http://purl.org/net/sword/package/BagIt", "BAGIT"
	# This is a simple zip file
	return "http://purl.org/net/sword/package/SimpleZip", "SimpleZip"


def getSubmissionData(args, data):
	"""Work out what is being submitted and which files have to be sent.

	args -- parsed command line; the zip and title attributes are read.
	data -- path to a directory, a zip archive or a single file.

	Returns a dict with the keys:
	  "files"           -- list of paths to upload
	  "packaging"       -- SWORD packaging URI, or None for plain files
	  "type"            -- "multiple files", "single file", "SimpleZip",
	                       "METS" or "BAGIT"
	  "isTemporaryFile" -- True if files[0] is an archive created here, in the
	                       current directory, that the caller should delete

	Raises Exception if data is neither a directory nor a file.
	"""
	filesList = []
	temp = False  # Delete temp files
	packaging = None
	if os.path.isdir(data):
		if args.zip:
			# Zip all the files and maintain the structure, starting from the base only
			if args.title is not None:
				baseName = args.title[0]
			else:
				baseName = os.path.basename(os.path.normpath(data))
			zipFile = baseName.replace(" ", "_") + ".zip"
			print("Creating a zip archive for submission...")
			_zipDirectory(data, zipFile)
			filesList.append(zipFile)
			packaging = "http://purl.org/net/sword/package/SimpleZip"
			submissionType = "SimpleZip"
			temp = True
		else:
			# Create a list of files to upload
			filesList.extend(_visibleFiles(data))
			submissionType = "multiple files"
	elif zipfile.is_zipfile(data):
		filesList.append(data)
		packaging, submissionType = _classifyZip(data)
	elif os.path.isfile(data):  # This is a single file
		filesList.append(data)
		submissionType = "single file"
	else:
		raise Exception("Couldn't find the data.")

	return {"files": filesList, "packaging": packaging, "type": submissionType, "isTemporaryFile": temp}

def setupMetadataEntry(args):
	"""Build a metadata Entry from the title/author/date command line options.

	args -- parsed command line; the title, author and date attributes are read.

	Returns None when none of the three options was given, otherwise an Entry
	holding dcterms_title, one dcterms_creator per author and dcterms_created
	for whichever options are present.
	"""
	title = args.title
	authors = args.author
	created = args.date

	# Nothing to describe: let the caller decide on a fallback.
	if title is None and authors is None and created is None:
		return None

	entry = Entry()
	if title is not None:
		entry.add_fields(dcterms_title=title[0])
	if authors is not None:
		for creator in authors:
			entry.add_fields(dcterms_creator=creator)
	if created is not None:
		entry.add_fields(dcterms_created=created[0])
	return entry

# Main procedure: connect, pick a collection, work out what is being
# submitted, then either upload the files one by one or deposit a package.
try:
	serverConnection = swordConnection()
	serverConnection.connect(timeout)
	print("------------------------")
	print("Welcome to the %s repository" % (serverConnection.name,))

	collectionForItem = serverConnection.selectCollection()
	print("Selected Collection: %s" % (collectionForItem.title(),))

	submissionData = getSubmissionData(args, data)
	# Kept in its own variable: the bare name "type" is the Python builtin and
	# never compares equal to any of the labels below.
	submissionType = submissionData["type"]

	print("------------------------")
	print("This is a %s submission" % (submissionType,))

	metadataEntry = setupMetadataEntry(args)

	# Select what to do
	if submissionType in ("single file", "multiple files"):  # Use the single file upload procedure
		try:
			# Create the metadata entry with ATOM
			print("------------------------")
			print("Creating the %s item... " % (submissionType,))
			if metadataEntry is None:
				# normpath drops a trailing separator, which would otherwise give an empty title
				metadataEntry = Entry(dcterms_title=(os.path.basename(os.path.normpath(data))))
			collectionItem = collectionForItem.createItem(metadata_entry=metadataEntry, in_progress=True)
			print("Item created")

			# Create a list of files to upload
			if submissionType == "single file":
				payLoadList = [swordFile(submissionData["files"][0])]
			else:
				# Send names relative to the longest common directory to keep the
				# structure. commonprefix compares character by character, so its
				# result can end in the middle of a name; dirname trims it back to
				# a whole directory.
				commonDir = os.path.dirname(os.path.commonprefix(submissionData["files"])) or os.curdir
				payLoadList = [swordFile(path, os.path.relpath(path, commonDir)) for path in submissionData["files"]]

			# Upload the files
			for payload in payLoadList:
				collectionItem.addFile(payload)
		except HTTPResponseError:
			print("Bad request")
	else:
		try:
			# Send the zip file and let the ingester do its job.
			# SimpleZip and BagIt deposits stay open; anything else (METS) is closed.
			# FIXME: we don't want to write silly things in dc.description!
			in_progress = submissionType in ("SimpleZip", "BAGIT")

			payload = swordFile(submissionData["files"][0])
			payload.mimetype = "application/zip"
			payload.packaging = submissionData["packaging"]
			# in_progress goes by keyword: the second positional parameter is the metadata entry.
			item = collectionForItem.createItemFromFile(payload, metadataEntry, in_progress=in_progress)

			# If some of the additional arguments for author, title, date etc. have been specified,
			# update the metadata (only SimpleZip). Skipped when the deposit produced no receipt.
			if submissionType == "SimpleZip" and item.receipt is not None:
				if metadataEntry is None:
					metadataEntry = Entry(dcterms_title=(os.path.basename(submissionData["files"][0])))

				# in_progress is True: we don't want to close the submission
				item.updateMetadata(metadataEntry, in_progress=True)

				# If we want to store the zip file along with the individual files (Only SimpleZip)
				if storeZip:
					try:
						zipPayload = swordFile(submissionData["files"][0], os.path.basename(submissionData["files"][0]).replace(" ", "_"))
						zipPayload.mimetype = "application/zip"
						zipPayload.packaging = 'http://purl.org/net/sword/package/Binary'
						item.addFile(zipPayload)
						print("Zip file successfully added to the bitstreams.")
					except Exception:
						print("Server error: could not add the zip file to the resources")
		finally:
			# Remove the archive created for this submission even if the deposit failed.
			if submissionData["isTemporaryFile"]:
				os.remove(submissionData["files"][0])

	print("------------------------")
	print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

except KeyboardInterrupt:
	print("------------------------")
	print("\nSubmission aborted by user.")