annotate sworduploader.py @ 8:ff51b8204ad4

If using zip files, the zip is stored in DSpace as well Small changes (in_progress explicitly)
author Marco Fabiani <marco.fabiani@eecs.qmul.ac.uk>
date Tue, 03 Apr 2012 16:22:18 +0100
parents 9d9d5a1b1d3c
children 394b4d094767
rev   line source
marco@0 1 #!/usr/bin/env python
marco@1 2
marco@6 3 """ SWORD2 DSpace bulk uploader - v0.3
marco@1 4
marco@1 5 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
marco@1 6 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview
marco@1 7
marco@1 8 Dependencies:
marco@1 9
marco@1 10 - python 2.X
marco@1 11
marco@4 12 - sword2 library, with modifications:
marco@4 13 (original) https://bitbucket.org/beno/python-sword2/src
marco@4 14 (modified) https://code.soundsoftware.ac.uk/hg/sworduploader
marco@1 15
marco@1 16 -----------------------------------
marco@1 17 Copyright 2012 Marco Fabiani
marco@1 18 Copyright 2012 Queen Mary, University of London
marco@1 19 -----------------------------------
marco@1 20 """
marco@0 21
marco@4 22 import argparse, getpass, zipfile, os, sys
marco@0 23 from sword2 import *
marco@0 24
# Command-line interface.
#
# After this section the rest of the script can rely on:
#   args      - the parsed argparse namespace (single-valued options are
#               one-element lists because they are declared with nargs=1)
#   data      - path of the file/directory/package to submit
#   storezip  - whether the zip itself is also stored next to the files
#               unpacked from it (used for SimpleZip submissions)
#   dspaceurl - URL of the SWORDv2 service document to connect to

# Service document used when --servicedoc is not given.  Kept in one place so
# that the help text and the actual fallback cannot drift apart.
DEFAULT_SERVICEDOC_URL = "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"

# Parse arguments
parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog="If the submission is created successfully, it will remain open "
           "to be completed with the necessary metadata and licenses, using "
           "the DSpace web interface. The submission can be found in the "
           "\"My Account -> Submissions\" section of the user's area.")
parser.add_argument(
    "data", type=str, nargs=1,
    help="Accepts: METSDSpaceSIP packages, zip files, directories, single "
         "files. NOTE: METSDSpaceSIP packages are only accepted by "
         "Collections with a workflow!")
parser.add_argument(
    "--username", dest="user_name", type=str, nargs=1,
    help="DSpace username.")
parser.add_argument(
    "--zip", action="store_true", dest="zip", default=False,
    help="If \"data\" is a directory, send it as a single zip archive to "
         "preserve its structure. The zip file will be saved along with the "
         "individual files.")
parser.add_argument(
    "--title", dest="title", type=str, nargs=1,
    help="Title (ignored for METS packages).")
parser.add_argument(
    "--author", dest="author", type=str, nargs="+",
    help="Author(s) (ignored for METS packages). Accepts multiple entries in "
         "the format \"Surname, Name\"")
parser.add_argument(
    "--date", dest="date", type=str, nargs=1,
    help="Date of creation (string) (ignored for METS packages).")
parser.add_argument(
    "--servicedoc", dest="dspaceurl", type=str, nargs=1,
    # The original help text was missing its closing parenthesis.
    help="Url of the SWORDv2 service document (default: "
         + DEFAULT_SERVICEDOC_URL + ")")

args = parser.parse_args()
data = args.data[0]
# Always keep a copy of the zip alongside the files extracted from it.
storezip = True

# Fall back to the default service document when none was supplied.
if args.dspaceurl is None:
    dspaceurl = DEFAULT_SERVICEDOC_URL
else:
    dspaceurl = args.dspaceurl[0]
marco@0 45
# Main submission workflow.
#
# Steps: authenticate against the SWORDv2 server (up to three attempts), let
# the user pick a collection, work out what kind of submission "data" is
# (directory, zip, METS package or single file) and deposit it.
#
# Relies on the module-level names args, data, dspaceurl and storezip set up
# by the argument-parsing section above.
#
# NOTE: this script targets Python 2 (raw_input, sword2 library).  Every
# print below takes one parenthesised string, which a Python 2 print
# statement handles exactly like the unparenthesised form; the double spaces
# in some messages reproduce the output of the original comma-separated
# print statements.

# SWORD packaging identifiers used for the different submission kinds.
SIMPLE_ZIP_PACKAGING = "http://purl.org/net/sword/package/SimpleZip"
METS_PACKAGING = "http://purl.org/net/sword/package/METSDSpaceSIP"
BINARY_PACKAGING = "http://purl.org/net/sword/package/Binary"

try:
    # Connect to SWORD server
    attempts = 3  # Number of attempts left to connect to server
    connected = False
    while attempts > 0 and not connected:
        print("Connecting to SWORD server. Remaining attempts:  %s" % attempts)
        # Get username and password
        if args.user_name is None:
            user_name = raw_input("DSpace Username: ")
        else:
            user_name = args.user_name[0]
            print("DSpace Username:  %s" % user_name)
        user_pass = getpass.getpass("DSpace password:")
        # Connect to the server
        c = Connection(dspaceurl, user_name=user_name, user_pass=user_pass,
                       keep_history=False)

        # Get service document.  "except Exception" (not a bare except) so
        # that Ctrl-C still reaches the KeyboardInterrupt handler below.
        try:
            c.get_service_document()
        except Exception:  # Server error
            print("Server unreachable!")
            break
        if c.sd is not None:
            connected = True
        else:
            attempts -= 1
            print("Incorrect username and/or password")

    if connected:
        # List available collections
        print("------------------------")
        print("Welcome to the  %s repository" % (c.workspaces[0][0],))
        print("Available Collections: ")
        collections = c.workspaces[0][1]
        numColl = len(collections)
        if numColl == 0:
            # Without this guard the selection prompt below never terminates.
            print("No collections available for deposit.")
            sys.exit()
        for ctr, coll in enumerate(collections):
            print("%s : %s" % (ctr + 1, coll.title))

        # Select a collection to deposit into.  raw_input + int() replaces
        # Python 2's input(), which eval()s whatever the user types and
        # crashes on anything that is not a valid expression.
        sel = -1
        while sel <= 0 or sel > numColl:
            answer = raw_input("Select a Collection to submit your files into: ")
            try:
                sel = int(answer)
            except ValueError:
                sel = -1  # not a number: ask again
        collection = collections[sel - 1]
        print("Selected Collection:  %s" % (collection.title,))

        # Path of a zip archive created by this script (removed at the end),
        # or None when no temporary archive exists.
        temp_zip = None
        try:
            # Create a submission
            fileslist = []
            packaging = None
            if os.path.isdir(data):
                if args.zip:
                    # Zip all the files, keeping the structure relative to
                    # the base directory.
                    dataname = os.path.basename(os.path.normpath(data))
                    zipf = dataname + ".zip"
                    temp_zip = zipf
                    myzip = zipfile.ZipFile(zipf, "w")
                    try:
                        print("Creating a zip archive for submission...")
                        zip_abspath = os.path.abspath(zipf)
                        for root, dirs, files in os.walk(data):
                            for name in files:
                                # Do not upload hidden files, OSX/linux
                                if name.startswith('.'):
                                    continue
                                full_path = os.path.join(root, name)
                                # The archive lives in the current directory;
                                # never add it to itself if that directory is
                                # inside "data".
                                if os.path.abspath(full_path) == zip_abspath:
                                    continue
                                myzip.write(full_path,
                                            os.path.relpath(full_path, data))
                    finally:
                        myzip.close()
                    fileslist.append(zipf)
                    packaging = SIMPLE_ZIP_PACKAGING
                    submission_type = "SimpleZip"
                else:
                    # Create a list of files to upload
                    for root, dirs, files in os.walk(data):
                        for name in files:
                            fileslist.append(os.path.join(root, name))
                    submission_type = "multiple files"
            elif zipfile.is_zipfile(data):  # This is a zip file
                fileslist.append(data)
                myzip = zipfile.ZipFile(data)
                try:
                    is_mets = "mets.xml" in myzip.namelist()
                finally:
                    myzip.close()
                if is_mets:  # This is a METS package
                    packaging = METS_PACKAGING
                    submission_type = "METS"
                else:  # This is a simple zip file
                    packaging = SIMPLE_ZIP_PACKAGING
                    submission_type = "SimpleZip"
            elif os.path.isfile(data):  # This is a single file
                fileslist.append(data)
                submission_type = "single file"
            else:
                print("Couldn't find the data.")
                sys.exit()

            print("------------------------")
            print("This is a  %s  submission" % submission_type)

            # Create a metadata entry from whichever of title/author/date
            # were given on the command line.
            if (args.title is not None or args.author is not None
                    or args.date is not None):
                entry = Entry()
                if args.title is not None:
                    entry.add_fields(dcterms_title=args.title[0])
                if args.author is not None:
                    for creator in args.author:
                        entry.add_fields(dcterms_creator=creator)
                if args.date is not None:
                    entry.add_fields(dcterms_created=args.date[0])
            else:
                entry = None

            # Select what to do.  Compared with "in"/"==": the original used
            # "is", which only works when the interpreter interns the strings.
            if submission_type in ("single file", "multiple files"):
                # Use the single file upload procedure
                try:
                    # Create the metadata entry with ATOM
                    print("------------------------")
                    print("Creating the item...")
                    if entry is None:
                        # normpath so a trailing slash does not produce an
                        # empty title.
                        entry = Entry(dcterms_title=os.path.basename(
                            os.path.normpath(data)))
                    creation_receipt = c.create(col_iri=collection.href,
                                                metadata_entry=entry,
                                                in_progress=True)

                    # Add the files
                    for f in fileslist:
                        print("Uploading file  %s" % os.path.basename(f))
                        # "with" closes the file even if the upload fails.
                        with open(f, "rb") as payload:
                            # NOTE(review): every file is sent as
                            # 'application/zip' regardless of its real type;
                            # kept as in the original - confirm this is
                            # intended for the Binary packaging.
                            c.add_file_to_resource(
                                edit_media_iri=creation_receipt.edit_media,
                                payload=payload,
                                filename=os.path.basename(f),
                                mimetype='application/zip',
                                packaging=BINARY_PACKAGING)
                except HTTPResponseError:
                    print("Bad request")
            else:
                # Send the zip file and let the ingester do its job.
                # SimpleZip submissions stay open (in progress); METS
                # packages are submitted as complete.
                in_progress = (submission_type == "SimpleZip")
                deposit_receipt = None
                with open(fileslist[0], "rb") as payload:
                    try:
                        deposit_receipt = c.create(col_iri=collection.href,
                                                   payload=payload,
                                                   filename=fileslist[0],
                                                   mimetype="application/zip",
                                                   packaging=packaging,
                                                   in_progress=in_progress)
                        print("%s  submission successful." % submission_type)
                    except Exception:
                        print("Error! Couldn't submit the file!")
                        if submission_type == "METS":
                            # Just guessing: not sure this is the problem...
                            print("To submit a METS package, the collection MUST have a workflow!")

                # Follow-up steps need the deposit receipt, so they only run
                # when the deposit above actually succeeded (previously a
                # failed deposit led to a NameError reported as "Server
                # error").
                if submission_type == "SimpleZip" and deposit_receipt is not None:
                    # Update the metadata with title/author/date, or with a
                    # default title taken from the file name.
                    if entry is None:
                        entry = Entry(
                            dcterms_title=os.path.basename(fileslist[0]))
                    try:
                        # in_progress is True: we don't want to close the
                        # submission
                        c.update(dr=deposit_receipt, metadata_entry=entry,
                                 in_progress=True)
                        print("Metadata update successfull.")
                    except Exception:
                        print("Server error")
                    # Store the zip file along with the individual files
                    if storezip:
                        try:
                            with open(fileslist[0], "rb") as payload:
                                c.add_file_to_resource(
                                    edit_media_iri=deposit_receipt.edit_media,
                                    payload=payload,
                                    filename=os.path.basename(
                                        fileslist[0]).replace(" ", "_"),
                                    mimetype='application/zip',
                                    packaging=BINARY_PACKAGING)
                            print("Zip file successfully added to the bitstreams.")
                        except Exception:
                            print("Server error: could not add the zip file to the resources")
        finally:
            # Delete the temporary archive even if the submission was
            # interrupted or failed part-way through.
            if temp_zip is not None and os.path.exists(temp_zip):
                os.remove(temp_zip)

        print("------------------------")
        print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

    else:  # Failed to connect to SWORDv2 Server
        print("Couldn't connect to the server.")
        if attempts == 0:
            print("Invalid credentials entered 3 times.")

except KeyboardInterrupt:
    print("------------------------")
    print("\nSubmission aborted by user.")