marco@0: #!usr/bin/env/ python marco@1: marco@12: """ marco@12: marco@15: SWORD2 DSpace bulk uploader - v0.6 marco@1: marco@1: A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x. marco@15: Built on the SWORD2 python client library: https://github.com/swordapp/python-client-sword2. marco@1: marco@1: Dependencies: marco@1: marco@1: - python 2.X marco@1: marco@14: - sword2 library: https://github.com/swordapp/python-client-sword2 marco@1: marco@1: ----------------------------------- marco@15: Updates log: marco@15: marco@15: v0.6: - now uploading a directory will also maintain the path structure marco@15: - introduced a file where to specify the server (server.cfg) marco@15: v0.5: changed the default server to C4DM live server marco@15: marco@15: ----------------------------------- marco@11: Centre for Digital Music, Queen Mary, University of London marco@11: Copyright (c) 2012 Marco Fabiani marco@11: marco@11: Permission is hereby granted, free of charge, to any person marco@11: obtaining a copy of this software and associated documentation marco@11: files (the "Software"), to deal in the Software without marco@11: restriction, including without limitation the rights to use, copy, marco@11: modify, merge, publish, distribute, sublicense, and/or sell copies marco@11: of the Software, and to permit persons to whom the Software is marco@11: furnished to do so, subject to the following conditions: marco@11: marco@11: The above copyright notice and this permission notice shall be marco@11: included in all copies or substantial portions of the Software. marco@11: marco@11: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, marco@11: EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES marco@11: OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND marco@11: NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT marco@11: HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, marco@11: WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING marco@11: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR marco@11: OTHER DEALINGS IN THE SOFTWARE. marco@1: ----------------------------------- marco@12: marco@11: A copy of this License can also be found in the COPYING file distributed with the source code. marco@1: """ marco@0: marco@4: import argparse, getpass, zipfile, os, sys marco@0: from sword2 import * marco@0: marco@0: # Parse arguments marco@0: parser = argparse.ArgumentParser(description="Bulk upload to DSpace using SWORDv2.",epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.") marco@0: parser.add_argument("data", type=str, nargs=1, marco@12: help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!") marco@0: parser.add_argument("--username", dest="user_name", type=str,nargs=1, help="DSpace username.") marco@0: parser.add_argument("--title", dest="title", type=str,nargs=1, help="Title (ignored for METS packages).") marco@0: parser.add_argument("--author", dest="author", type=str,nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"") marco@0: parser.add_argument("--date", dest="date", type=str,nargs=1, help="Date of creation (string) (ignored for METS packages).") marco@14: parser.add_argument("--zip", action="store_true",dest="zip",default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.") marco@15: parser.add_argument("--servicedoc", dest="sd", type=str,nargs=1, help="Url of the SWORDv2 service document (default: use server.cfg if available, otherwise http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument") marco@0: marco@0: args = parser.parse_args() marco@0: data = args.data[0] marco@13: if args.zip: marco@13: storezip = True marco@13: else: marco@13: storezip = False marco@8: marco@13: if args.sd == None: marco@15: try: marco@15: f = open("server.cfg", "r") marco@15: sd = f.readline() marco@15: print "server.cfg: ", sd marco@15: except: marco@15: sd = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument" marco@0: else: marco@13: sd = args.sd[0] marco@0: marco@0: marco@0: try: marco@13: # Connect to SWORD server: it will always try to authenticate (no anonymous submissions! marco@0: attempts = 3 # Number of attempts left to connect to server marco@0: connected = False marco@0: while attempts>0 and not connected: marco@0: print "Connecting to SWORD server. Remaining attempts: ", attempts marco@0: # Try to login, get service document marco@0: # Get username and password marco@0: if args.user_name == None: marco@13: user_name = raw_input("Username: ") marco@0: else: marco@0: user_name = args.user_name[0] marco@13: print "Username: ",user_name marco@13: user_pass = getpass.getpass("Password:") marco@0: # Connect to the server marco@13: c = Connection(sd, user_name=user_name, user_pass=user_pass,keep_history=False) marco@3: marco@0: # Get service document marco@0: try: marco@0: c.get_service_document() marco@3: except: # Server error marco@3: print "Server unreachable!" marco@3: break marco@3: if c.sd != None: marco@3: connected = True marco@3: else: marco@0: attempts-=1 marco@0: print "Incorrect username and/or password" marco@3: marco@0: marco@0: if connected: marco@0: # List available collections marco@0: print "------------------------" marco@0: print "Welcome to the ",c.workspaces[0][0], "repository" marco@0: print "Available Collections: " marco@0: numColl = len(c.workspaces[0][1]) marco@0: for ctr in range(numColl): marco@0: coll = c.workspaces[0][1][ctr] marco@0: print ctr+1,":",coll.title marco@0: # Select a collection to deposit into marco@14: sel = "0" marco@14: while (not sel.isdigit() or int(sel)<=0 or int(sel)>numColl): marco@14: sel = raw_input("Select a Collection to submit your files into: ") marco@14: sel = int(sel) marco@0: collection = c.workspaces[0][1][sel-1] marco@0: print "Selected Collection: ",collection.title marco@0: marco@4: # Create a submission marco@4: fileslist = [] marco@4: temp = False # Delete temp files marco@4: # If folder marco@4: if os.path.isdir(data): marco@4: if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only... marco@4: dataname = os.path.basename(os.path.normpath(data)) marco@9: if args.title != None: marco@9: zipf = args.title[0].replace(" ","_")+".zip" marco@9: else: marco@9: zipf = dataname.replace(" ","_")+".zip" marco@4: myzip = zipfile.ZipFile(zipf, "w") marco@4: # get the directory structure marco@4: print "Creating a zip archive for submission..." marco@4: for root, dirs, files in os.walk(data): marco@4: for name in files: marco@8: if not name.startswith('.'): # Do not upload hidden files, OSX/linux marco@8: myzip.write(os.path.join(root,name), marco@9: os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets marco@4: fileslist.append(zipf) marco@5: myzip.close() marco@4: packaging = "http://purl.org/net/sword/package/SimpleZip" marco@4: type = "SimpleZip" marco@4: temp = True marco@4: else: #create a list of files to upload marco@4: for root, dirs, files in os.walk(data): marco@4: for name in files: marco@9: if not name.startswith('.'): marco@9: fileslist.append(os.path.join(root,name)) marco@4: type = "multiple files" marco@4: elif zipfile.is_zipfile(data): #This is a zip file marco@4: fileslist.append(data) marco@4: myzip = zipfile.ZipFile(data) marco@4: if "mets.xml" in myzip.namelist(): #This is a METS package marco@4: packaging = "http://purl.org/net/sword/package/METSDSpaceSIP" marco@4: type = "METS" marco@8: in_progress = False marco@10: elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package marco@10: packaging = "http://purl.org/net/sword/package/BagIt" marco@10: type = "BAGIT" marco@10: else:#THis is a simple zip file marco@4: packaging = "http://purl.org/net/sword/package/SimpleZip" marco@4: type = "SimpleZip" marco@4: myzip.close() marco@0: elif os.path.isfile(data): # This is a single file marco@4: fileslist.append(data) marco@4: type = "single file" marco@0: else: marco@0: print "Couldn't find the data." marco@0: sys.exit() marco@0: marco@0: print "------------------------" marco@0: print "This is a ",type," submission" marco@4: marco@4: # Create a metadata entry marco@4: if (args.title != None) or (args.author != None) or (args.date != None): marco@4: entry = Entry() marco@4: if args.title != None: marco@4: entry.add_fields(dcterms_title = args.title[0]) marco@4: if args.author != None: marco@4: for creator in args.author: marco@4: entry.add_fields(dcterms_creator=creator) marco@4: if args.date != None: marco@4: entry.add_fields(dcterms_created = args.date[0]) marco@4: else: marco@4: entry = None marco@4: # Select what to do marco@8: if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure marco@4: try: marco@4: # Create the metadata entry with ATOM marco@4: print "------------------------" marco@4: print "Creating the item..." marco@4: if entry is None: marco@4: entry = Entry(dcterms_title=(os.path.basename(data))) marco@8: creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True) marco@4: marco@4: # Add the files marco@15: # Get the longest common path in order to send the correct filename to keep the structure marco@15: common = os.path.commonprefix(fileslist) marco@4: for f in fileslist: marco@15: filename = os.path.relpath(f,common) marco@15: print "Uploading file ", filename marco@4: payload = open(f,"rb") marco@4: deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media, marco@4: payload = payload, marco@15: filename = filename, marco@4: mimetype = 'application/zip', marco@4: packaging = 'http://purl.org/net/sword/package/Binary') marco@4: payload.close() marco@4: except HTTPResponseError: marco@4: print "Bad request" marco@4: else: marco@8: # Send the zip file and let the ingester do its job marco@4: payload = open(fileslist[0], "rb") marco@10: if (type == "SimpleZip") or (type=="BAGIT"): marco@8: in_progress = True marco@9: # FIXME: we don't want to write silly things in dc.description! marco@8: else: marco@8: in_progress = False marco@4: try: marco@4: deposit_receipt = c.create(col_iri = collection.href, marco@4: payload = payload, marco@4: filename = fileslist[0], marco@4: mimetype = "application/zip", marco@8: packaging = packaging, marco@8: in_progress = in_progress) marco@4: print type, " submission successful." marco@4: except: marco@4: print "Error! Couldn't submit the file!" marco@4: if type == "METS": # Just guessing: not sure this is the problem... marco@4: print "To submit a METS package, the collection MUST have a workflow!" marco@4: payload.close() marco@4: marco@8: # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip) marco@7: if type == "SimpleZip": marco@7: if entry is None: marco@7: entry = Entry(dcterms_title=(os.path.basename(fileslist[0]))) marco@4: try: marco@8: update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission marco@7: print "Metadata update successfull." marco@4: except: marco@4: print "Server error" marco@8: # If we want to store the zip file along with the individual files (Only SimpleZip) marco@8: if storezip: marco@8: try: marco@8: payload = open(fileslist[0],"rb") marco@8: zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media, marco@8: payload = payload, marco@8: filename = os.path.basename(fileslist[0]).replace(" ","_"), marco@8: mimetype = 'application/zip', marco@8: packaging = 'http://purl.org/net/sword/package/Binary') marco@8: payload.close() marco@8: print "Zip file successfully added to the bitstreams." marco@8: except: marco@8: print "Server error: could not add the zip file to the resources" marco@4: if temp: marco@4: os.remove(fileslist[0]) marco@0: marco@4: print "------------------------" marco@4: print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"." marco@4: marco@0: marco@0: else: # Failed to connect to SWORDv2 Server marco@0: print "Couldn't connect to the server." marco@0: if attempts == 0: marco@0: print "Invalid credentials entered 3 times." marco@0: marco@0: except KeyboardInterrupt: marco@0: print "------------------------" marco@3: print "\nSubmission aborted by user."