annotate sworduploader.py @ 9:394b4d094767

Give better names to zip file. Remove square brackets from file names.
author Marco Fabiani <marco.fabiani@eecs.qmul.ac.uk>
date Wed, 04 Apr 2012 17:59:06 +0100
parents ff51b8204ad4
children af2a645f63a2
rev   line source
marco@0 1 #!/usr/bin/env python
marco@1 2
marco@6 3 """ SWORD2 DSpace bulk uploader - v0.3
marco@1 4
marco@1 5 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
marco@1 6 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview
marco@1 7
marco@1 8 Dependencies:
marco@1 9
marco@1 10 - python 2.X
marco@1 11
marco@4 12 - sword2 library, with modifications:
marco@4 13 (original) https://bitbucket.org/beno/python-sword2/src
marco@4 14 (modified) https://code.soundsoftware.ac.uk/hg/sworduploader
marco@1 15
marco@1 16 -----------------------------------
marco@1 17 Copyright 2012 Marco Fabiani
marco@1 18 Copyright 2012 Queen Mary, University of London
marco@1 19 -----------------------------------
marco@1 20 """
marco@0 21
marco@4 22 import argparse, getpass, zipfile, os, sys
marco@0 23 from sword2 import *
marco@0 24
# Command-line interface.
#
# Every option declared with nargs=1 is delivered by argparse as a
# one-element list, which is why the values are read back with [0].
parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog=("If the submission is created successfully, it will remain open to be "
            "completed with the necessary metadata and licenses, using the DSpace "
            "web interface. The submission can be found in the "
            "\"My Account -> Submissions\" section of the user's area."))
parser.add_argument(
    "data",
    type=str,
    nargs=1,
    help=("Accepts: METSDSpaceSIP packages, zip files, directories, single files. "
          "NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!"))
parser.add_argument(
    "--username",
    dest="user_name",
    type=str,
    nargs=1,
    help="DSpace username.")
parser.add_argument(
    "--zip",
    action="store_true",
    dest="zip",
    default=False,
    help=("If \"data\" is a directory, send it as a single zip archive to preserve "
          "its structure. The zip file will be saved along with the individual files."))
parser.add_argument(
    "--title",
    dest="title",
    type=str,
    nargs=1,
    help="Title (ignored for METS packages).")
parser.add_argument(
    "--author",
    dest="author",
    type=str,
    nargs="+",
    help=("Author(s) (ignored for METS packages). Accepts multiple entries in "
          "the format \"Surname, Name\""))
parser.add_argument(
    "--date",
    dest="date",
    type=str,
    nargs=1,
    help="Date of creation (string) (ignored for METS packages).")
parser.add_argument(
    "--servicedoc",
    dest="dspaceurl",
    type=str,
    nargs=1,
    help=("Url of the SWORDv2 service document "
          "(default: http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"))

args = parser.parse_args()

# Path of the thing to upload (file, zip archive or directory).
data = args.data[0]

# When True, a SimpleZip submission also stores the zip itself as a bitstream.
storezip = True

# Service document URL: the command-line value if given, else the test server.
dspaceurl = (
    "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"
    if args.dspaceurl is None
    else args.dspaceurl[0]
)
marco@0 44
marco@0 45
marco@0 46 try:
marco@0 47 # Connect to SWORD server
marco@0 48 attempts = 3 # Number of attempts left to connect to server
marco@0 49 connected = False
marco@0 50 while attempts>0 and not connected:
marco@0 51 print "Connecting to SWORD server. Remaining attempts: ", attempts
marco@0 52 # Try to login, get service document
marco@0 53 # Get username and password
marco@0 54 if args.user_name == None:
marco@0 55 user_name = raw_input("DSpace Username: ")
marco@0 56 else:
marco@0 57 user_name = args.user_name[0]
marco@0 58 print "DSpace Username: ",user_name
marco@0 59 user_pass = getpass.getpass("DSpace password:")
marco@0 60 # Connect to the server
marco@0 61 c = Connection(dspaceurl, user_name=user_name, user_pass=user_pass,keep_history=False)
marco@3 62
marco@0 63 # Get service document
marco@0 64 try:
marco@0 65 c.get_service_document()
marco@3 66 except: # Server error
marco@3 67 print "Server unreachable!"
marco@3 68 break
marco@3 69 if c.sd != None:
marco@3 70 connected = True
marco@3 71 else:
marco@0 72 attempts-=1
marco@0 73 print "Incorrect username and/or password"
marco@3 74
marco@0 75
marco@0 76 if connected:
marco@0 77 # List available collections
marco@0 78 print "------------------------"
marco@0 79 print "Welcome to the ",c.workspaces[0][0], "repository"
marco@0 80 print "Available Collections: "
marco@0 81 numColl = len(c.workspaces[0][1])
marco@0 82 for ctr in range(numColl):
marco@0 83 coll = c.workspaces[0][1][ctr]
marco@0 84 print ctr+1,":",coll.title
marco@0 85 # Select a collection to deposit into
marco@0 86 sel = -1
marco@0 87 while (sel<=0 or sel>numColl):
marco@0 88 sel = input("Select a Collection to submit your files into: ")
marco@0 89 collection = c.workspaces[0][1][sel-1]
marco@0 90 print "Selected Collection: ",collection.title
marco@0 91
marco@4 92 # Create a submission
marco@4 93 fileslist = []
marco@4 94 temp = False # Delete temp files
marco@4 95 # If folder
marco@4 96 if os.path.isdir(data):
marco@4 97 if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
marco@4 98 dataname = os.path.basename(os.path.normpath(data))
marco@9 99 if args.title != None:
marco@9 100 zipf = args.title[0].replace(" ","_")+".zip"
marco@9 101 else:
marco@9 102 zipf = dataname.replace(" ","_")+".zip"
marco@4 103 myzip = zipfile.ZipFile(zipf, "w")
marco@4 104 # get the directory structure
marco@4 105 print "Creating a zip archive for submission..."
marco@4 106 for root, dirs, files in os.walk(data):
marco@4 107 for name in files:
marco@8 108 if not name.startswith('.'): # Do not upload hidden files, OSX/linux
marco@8 109 myzip.write(os.path.join(root,name),
marco@9 110 os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
marco@4 111 fileslist.append(zipf)
marco@5 112 myzip.close()
marco@4 113 packaging = "http://purl.org/net/sword/package/SimpleZip"
marco@4 114 type = "SimpleZip"
marco@4 115 temp = True
marco@4 116 else: #create a list of files to upload
marco@4 117 for root, dirs, files in os.walk(data):
marco@4 118 for name in files:
marco@9 119 if not name.startswith('.'):
marco@9 120 fileslist.append(os.path.join(root,name))
marco@4 121 type = "multiple files"
marco@4 122 elif zipfile.is_zipfile(data): #This is a zip file
marco@4 123 fileslist.append(data)
marco@4 124 myzip = zipfile.ZipFile(data)
marco@4 125 if "mets.xml" in myzip.namelist(): #This is a METS package
marco@4 126 packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
marco@4 127 type = "METS"
marco@8 128 in_progress = False
marco@4 129 else: #THis is a simple zip file
marco@4 130 packaging = "http://purl.org/net/sword/package/SimpleZip"
marco@4 131 type = "SimpleZip"
marco@4 132 myzip.close()
marco@0 133 elif os.path.isfile(data): # This is a single file
marco@4 134 fileslist.append(data)
marco@4 135 type = "single file"
marco@0 136 else:
marco@0 137 print "Couldn't find the data."
marco@0 138 sys.exit()
marco@0 139
marco@0 140 print "------------------------"
marco@0 141 print "This is a ",type," submission"
marco@4 142
marco@4 143 # Create a metadata entry
marco@4 144 if (args.title != None) or (args.author != None) or (args.date != None):
marco@4 145 entry = Entry()
marco@4 146 if args.title != None:
marco@4 147 entry.add_fields(dcterms_title = args.title[0])
marco@4 148 if args.author != None:
marco@4 149 for creator in args.author:
marco@4 150 entry.add_fields(dcterms_creator=creator)
marco@4 151 if args.date != None:
marco@4 152 entry.add_fields(dcterms_created = args.date[0])
marco@4 153 else:
marco@4 154 entry = None
marco@4 155 # Select what to do
marco@8 156 if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
marco@4 157 try:
marco@4 158 # Create the metadata entry with ATOM
marco@4 159 print "------------------------"
marco@4 160 print "Creating the item..."
marco@4 161 if entry is None:
marco@4 162 entry = Entry(dcterms_title=(os.path.basename(data)))
marco@8 163 creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
marco@4 164
marco@4 165 # Add the files
marco@4 166 for f in fileslist:
marco@4 167 print "Uploading file ",os.path.basename(f)
marco@4 168 payload = open(f,"rb")
marco@4 169 deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
marco@4 170 payload = payload,
marco@4 171 filename = os.path.basename(f),
marco@4 172 mimetype = 'application/zip',
marco@4 173 packaging = 'http://purl.org/net/sword/package/Binary')
marco@4 174 payload.close()
marco@4 175 except HTTPResponseError:
marco@4 176 print "Bad request"
marco@4 177 else:
marco@8 178 # Send the zip file and let the ingester do its job
marco@4 179 payload = open(fileslist[0], "rb")
marco@8 180 if type == "SimpleZip":
marco@8 181 in_progress = True
marco@9 182 # FIXME: we don't want to write silly things in dc.description!
marco@8 183 else:
marco@8 184 in_progress = False
marco@4 185 try:
marco@4 186 deposit_receipt = c.create(col_iri = collection.href,
marco@4 187 payload = payload,
marco@4 188 filename = fileslist[0],
marco@4 189 mimetype = "application/zip",
marco@8 190 packaging = packaging,
marco@8 191 in_progress = in_progress)
marco@4 192 print type, " submission successful."
marco@4 193 except:
marco@4 194 print "Error! Couldn't submit the file!"
marco@4 195 if type == "METS": # Just guessing: not sure this is the problem...
marco@4 196 print "To submit a METS package, the collection MUST have a workflow!"
marco@4 197 payload.close()
marco@4 198
marco@8 199 # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
marco@7 200 if type == "SimpleZip":
marco@7 201 if entry is None:
marco@7 202 entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
marco@4 203 try:
marco@8 204 update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
marco@7 205 print "Metadata update successfull."
marco@4 206 except:
marco@4 207 print "Server error"
marco@8 208 # If we want to store the zip file along with the individual files (Only SimpleZip)
marco@8 209 if storezip:
marco@8 210 try:
marco@8 211 payload = open(fileslist[0],"rb")
marco@8 212 zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
marco@8 213 payload = payload,
marco@8 214 filename = os.path.basename(fileslist[0]).replace(" ","_"),
marco@8 215 mimetype = 'application/zip',
marco@8 216 packaging = 'http://purl.org/net/sword/package/Binary')
marco@8 217 payload.close()
marco@8 218 print "Zip file successfully added to the bitstreams."
marco@8 219 except:
marco@8 220 print "Server error: could not add the zip file to the resources"
marco@4 221 if temp:
marco@4 222 os.remove(fileslist[0])
marco@0 223
marco@4 224 print "------------------------"
marco@4 225 print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."
marco@4 226
marco@0 227
marco@0 228 else: # Failed to connect to SWORDv2 Server
marco@0 229 print "Couldn't connect to the server."
marco@0 230 if attempts == 0:
marco@0 231 print "Invalid credentials entered 3 times."
marco@0 232
marco@0 233 except KeyboardInterrupt:
marco@0 234 print "------------------------"
marco@3 235 print "\nSubmission aborted by user."