view sworduploader.py @ 14:273fc80d9623

v0.5 -> New default service document, uses the GIT sword2 client code
author Marco Fabiani <marco.fabiani@eecs.qmul.ac.uk>
date Fri, 25 May 2012 13:14:05 +0100
parents cc6c7235d08a
children e24aea2d14a4
line wrap: on
line source
#!/usr/bin/env python

"""

SWORD2 DSpace bulk uploader - v0.5

A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview with modifications.

Dependencies:

- python 2.X

- sword2 library: https://github.com/swordapp/python-client-sword2 

-----------------------------------
  Centre for Digital Music, Queen Mary, University of London
  Copyright (c) 2012 Marco Fabiani

  Permission is hereby granted, free of charge, to any person
  obtaining a copy of this software and associated documentation
  files (the "Software"), to deal in the Software without
  restriction, including without limitation the rights to use, copy,
  modify, merge, publish, distribute, sublicense, and/or sell copies
  of the Software, and to permit persons to whom the Software is
  furnished to do so, subject to the following conditions:

  The above copyright notice and this permission notice shall be
  included in all copies or substantial portions of the Software.
  
  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  OTHER DEALINGS IN THE SOFTWARE.
-----------------------------------

A copy of this License can also be found in the COPYING file distributed with the source code.
"""

import argparse, getpass, zipfile, os, sys
from sword2 import *

# Parse arguments
# The default service document URL is kept in one place so that the help text
# and the value actually used can never disagree.
DEFAULT_SERVICE_DOCUMENT = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"

parser = argparse.ArgumentParser(description="Bulk upload to DSpace using SWORDv2.",epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
parser.add_argument("data", type=str, nargs=1,
                   help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument("--username", dest="user_name", type=str,nargs=1, help="DSpace username.")
parser.add_argument("--title", dest="title", type=str,nargs=1, help="Title (ignored for METS packages).")
parser.add_argument("--author", dest="author", type=str,nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument("--date", dest="date", type=str,nargs=1, help="Date of creation (string) (ignored for METS packages).")
parser.add_argument("--zip", action="store_true",dest="zip",default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
parser.add_argument("--servicedoc", dest="sd", type=str,nargs=1, help="Url of the SWORDv2 service document (default: " + DEFAULT_SERVICE_DOCUMENT + ")")

args = parser.parse_args()

# Every option declared with nargs=1 is delivered as a one-element list
# (or None when the option was not given), hence the [0] indexing.
data = args.data[0]

# --zip is a store_true flag, so it is already a plain boolean.
storezip = args.zip

# Fall back to the default service document when --servicedoc is not given.
if args.sd is None:
	sd = DEFAULT_SERVICE_DOCUMENT
else:
	sd = args.sd[0]

try:
	# Connect to SWORD server: it will always try to authenticate (no anonymous submissions!
	attempts = 3 #  Number of attempts left to connect to server
	connected = False
	while attempts>0 and not connected:
		print "Connecting to SWORD server. Remaining attempts: ", attempts
		# Try to login, get service document
		# Get username and password
		if args.user_name == None:
			user_name = raw_input("Username: ")
		else:
			user_name = args.user_name[0]
			print "Username: ",user_name
		user_pass = getpass.getpass("Password:")
		# Connect to the server
		c = Connection(sd, user_name=user_name, user_pass=user_pass,keep_history=False)
		
		# Get service document
		try:
			c.get_service_document()
		except: # Server error
			print "Server unreachable!"
			break
		if c.sd != None:
			connected = True
		else:
			attempts-=1
			print "Incorrect username and/or password"
			
			
	if connected:
		# List available collections
		print "------------------------"
		print "Welcome to the ",c.workspaces[0][0], "repository"
		print "Available Collections: "
		numColl = len(c.workspaces[0][1])
		for ctr in range(numColl):
			coll = c.workspaces[0][1][ctr]
			print ctr+1,":",coll.title
		# Select a collection to deposit into
		sel = "0"
		while (not sel.isdigit() or int(sel)<=0 or int(sel)>numColl):
			sel = raw_input("Select a Collection to submit your files into: ")
		sel = int(sel)
		collection = c.workspaces[0][1][sel-1]
		print "Selected Collection: ",collection.title
		
		# Create a submission
		fileslist = []
		temp = False # Delete temp files
		# If folder
		if os.path.isdir(data):
			if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
				dataname = os.path.basename(os.path.normpath(data))
				if args.title != None:
					zipf = args.title[0].replace(" ","_")+".zip"
				else:
					zipf = dataname.replace(" ","_")+".zip"
				myzip = zipfile.ZipFile(zipf, "w")
				# get the directory structure
				print "Creating a zip archive for submission..."
				for root, dirs, files in os.walk(data):
					for name in files:
						if not name.startswith('.'): # Do not upload hidden files, OSX/linux
							myzip.write(os.path.join(root,name),
								os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
				fileslist.append(zipf)
				myzip.close()
				packaging = "http://purl.org/net/sword/package/SimpleZip"
				type = "SimpleZip"
				temp = True
			else: #create a list of files to upload
				for root, dirs, files in os.walk(data):
					for name in files:
						if not name.startswith('.'):
							fileslist.append(os.path.join(root,name))
				type = "multiple files"
		elif zipfile.is_zipfile(data): #This is a zip file
			fileslist.append(data)
			myzip = zipfile.ZipFile(data)
			if "mets.xml" in myzip.namelist(): #This is a METS package
				packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
				type = "METS"
				in_progress = False
			elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package
				packaging = "http://purl.org/net/sword/package/BagIt"
				type = "BAGIT"
			else:#THis is a simple zip file
				packaging = "http://purl.org/net/sword/package/SimpleZip"
				type = "SimpleZip"
			myzip.close()
		elif os.path.isfile(data): # This is a single file
			fileslist.append(data)
			type = "single file"
		else:
			print "Couldn't find the data."
			sys.exit()
		
		print "------------------------"
		print "This is a ",type," submission"
		
		# Create a metadata entry
		if (args.title != None) or (args.author != None) or (args.date != None):
			entry = Entry()	
			if  args.title != None:
				entry.add_fields(dcterms_title = args.title[0])
			if args.author != None:
				for creator in args.author:
					entry.add_fields(dcterms_creator=creator)
			if args.date != None:
				entry.add_fields(dcterms_created = args.date[0])
		else:
			entry = None
		# Select what to do
		if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
			try:
				# Create the metadata entry with ATOM
				print "------------------------"
				print "Creating the item..."
				if entry is None:
					entry = Entry(dcterms_title=(os.path.basename(data)))
				creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
				
				# Add the files
				for f in fileslist:
					print "Uploading file ",os.path.basename(f)
					payload = open(f,"rb")
					deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
						payload = payload,
						filename = os.path.basename(f),
						mimetype = 'application/zip',
						packaging = 'http://purl.org/net/sword/package/Binary')
					payload.close()
			except HTTPResponseError:
				print "Bad request"
		else:
			# Send the zip file and let the ingester do its job
			payload = open(fileslist[0], "rb")
			if (type == "SimpleZip") or (type=="BAGIT"):
				in_progress = True
				# FIXME: we don't want to write silly things in dc.description!
			else:
				in_progress = False
			try:
				deposit_receipt = c.create(col_iri = collection.href,
					payload = payload,
					filename = fileslist[0],
					mimetype = "application/zip",
					packaging = packaging,
					in_progress = in_progress)
				print type, " submission successful."
			except:
				print "Error! Couldn't submit the file!"						
				if type == "METS": # Just guessing: not sure this is the problem...
					print "To submit a METS package, the collection MUST have a workflow!"
			payload.close()
			
			# If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
			if type == "SimpleZip":
				if entry is None:
					entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
				try:
					update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
					print "Metadata update successfull."
				except:
					print "Server error"
				# If we want to store the zip file along with the individual files (Only SimpleZip)
				if storezip:
					try:
						payload = open(fileslist[0],"rb")
						zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
							payload = payload,
							filename = os.path.basename(fileslist[0]).replace(" ","_"),
							mimetype = 'application/zip',
							packaging = 'http://purl.org/net/sword/package/Binary')
						payload.close()
						print "Zip file successfully added to the bitstreams."
					except:
						print "Server error: could not add the zip file to the resources"
			if temp:
				os.remove(fileslist[0])

		print "------------------------"
		print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."			

		
	else: # Failed to connect to SWORDv2 Server
		print "Couldn't connect to the server."
		if attempts == 0:
			print "Invalid credentials entered 3 times."
	
except KeyboardInterrupt:
	print "------------------------"
	print "\nSubmission aborted by user."