marco@0
|
1 #!usr/bin/env/ python
|
marco@1
|
2
|
marco@6
|
3 """ SWORD2 DSpace bulk uploader - v0.3
|
marco@1
|
4
|
marco@1
|
5 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
|
marco@1
|
6 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview
|
marco@1
|
7
|
marco@1
|
8 Dependencies:
|
marco@1
|
9
|
marco@1
|
10 - python 2.X
|
marco@1
|
11
|
marco@4
|
12 - sword2 library, with modifications:
|
marco@4
|
13 (original) https://bitbucket.org/beno/python-sword2/src
|
marco@4
|
14 (modified) https://code.soundsoftware.ac.uk/hg/sworduploader
|
marco@1
|
15
|
marco@1
|
16 -----------------------------------
|
marco@1
|
17 Copyright 2012 Marco Fabiani
|
marco@1
|
18 Copyright 2012 Queen Mary, University of London
|
marco@1
|
19 -----------------------------------
|
marco@1
|
20 """
|
marco@0
|
21
|
marco@4
|
22 import argparse, getpass, zipfile, os, sys
|
marco@0
|
23 from sword2 import *
|
marco@0
|
24
|
marco@0
|
25 # Parse arguments
|
marco@0
|
26 parser = argparse.ArgumentParser(description="Bulk upload to DSpace using SWORDv2.",epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
|
marco@0
|
27 parser.add_argument("data", type=str, nargs=1,
|
marco@1
|
28 help="Accepts: METSDSpaceSIP packages, zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
|
marco@0
|
29 parser.add_argument("--username", dest="user_name", type=str,nargs=1, help="DSpace username.")
|
marco@8
|
30 parser.add_argument("--zip", action="store_true",dest="zip",default=False, help="If \"data\" is a directory, send it as a single zip archive to preserve its structure. The zip file will be saved along with the individual files.")
|
marco@0
|
31 parser.add_argument("--title", dest="title", type=str,nargs=1, help="Title (ignored for METS packages).")
|
marco@0
|
32 parser.add_argument("--author", dest="author", type=str,nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
|
marco@0
|
33 parser.add_argument("--date", dest="date", type=str,nargs=1, help="Date of creation (string) (ignored for METS packages).")
|
marco@1
|
34 parser.add_argument("--servicedoc", dest="dspaceurl", type=str,nargs=1, help="Url of the SWORDv2 service document (default: http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument")
|
marco@0
|
35
|
marco@0
|
36 args = parser.parse_args()
|
marco@0
|
37 data = args.data[0]
|
marco@8
|
38 storezip = True
|
marco@8
|
39
|
marco@0
|
40 if args.dspaceurl == None:
|
marco@0
|
41 dspaceurl = "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"
|
marco@0
|
42 else:
|
marco@0
|
43 dspaceurl = args.dspaceurl[0]
|
marco@0
|
44
|
marco@0
|
45
|
marco@0
|
46 try:
|
marco@0
|
47 # Connect to SWORD server
|
marco@0
|
48 attempts = 3 # Number of attempts left to connect to server
|
marco@0
|
49 connected = False
|
marco@0
|
50 while attempts>0 and not connected:
|
marco@0
|
51 print "Connecting to SWORD server. Remaining attempts: ", attempts
|
marco@0
|
52 # Try to login, get service document
|
marco@0
|
53 # Get username and password
|
marco@0
|
54 if args.user_name == None:
|
marco@0
|
55 user_name = raw_input("DSpace Username: ")
|
marco@0
|
56 else:
|
marco@0
|
57 user_name = args.user_name[0]
|
marco@0
|
58 print "DSpace Username: ",user_name
|
marco@0
|
59 user_pass = getpass.getpass("DSpace password:")
|
marco@0
|
60 # Connect to the server
|
marco@0
|
61 c = Connection(dspaceurl, user_name=user_name, user_pass=user_pass,keep_history=False)
|
marco@3
|
62
|
marco@0
|
63 # Get service document
|
marco@0
|
64 try:
|
marco@0
|
65 c.get_service_document()
|
marco@3
|
66 except: # Server error
|
marco@3
|
67 print "Server unreachable!"
|
marco@3
|
68 break
|
marco@3
|
69 if c.sd != None:
|
marco@3
|
70 connected = True
|
marco@3
|
71 else:
|
marco@0
|
72 attempts-=1
|
marco@0
|
73 print "Incorrect username and/or password"
|
marco@3
|
74
|
marco@0
|
75
|
marco@0
|
76 if connected:
|
marco@0
|
77 # List available collections
|
marco@0
|
78 print "------------------------"
|
marco@0
|
79 print "Welcome to the ",c.workspaces[0][0], "repository"
|
marco@0
|
80 print "Available Collections: "
|
marco@0
|
81 numColl = len(c.workspaces[0][1])
|
marco@0
|
82 for ctr in range(numColl):
|
marco@0
|
83 coll = c.workspaces[0][1][ctr]
|
marco@0
|
84 print ctr+1,":",coll.title
|
marco@0
|
85 # Select a collection to deposit into
|
marco@0
|
86 sel = -1
|
marco@0
|
87 while (sel<=0 or sel>numColl):
|
marco@0
|
88 sel = input("Select a Collection to submit your files into: ")
|
marco@0
|
89 collection = c.workspaces[0][1][sel-1]
|
marco@0
|
90 print "Selected Collection: ",collection.title
|
marco@0
|
91
|
marco@4
|
92 # Create a submission
|
marco@4
|
93 fileslist = []
|
marco@4
|
94 temp = False # Delete temp files
|
marco@4
|
95 # If folder
|
marco@4
|
96 if os.path.isdir(data):
|
marco@4
|
97 if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
|
marco@4
|
98 dataname = os.path.basename(os.path.normpath(data))
|
marco@9
|
99 if args.title != None:
|
marco@9
|
100 zipf = args.title[0].replace(" ","_")+".zip"
|
marco@9
|
101 else:
|
marco@9
|
102 zipf = dataname.replace(" ","_")+".zip"
|
marco@4
|
103 myzip = zipfile.ZipFile(zipf, "w")
|
marco@4
|
104 # get the directory structure
|
marco@4
|
105 print "Creating a zip archive for submission..."
|
marco@4
|
106 for root, dirs, files in os.walk(data):
|
marco@4
|
107 for name in files:
|
marco@8
|
108 if not name.startswith('.'): # Do not upload hidden files, OSX/linux
|
marco@8
|
109 myzip.write(os.path.join(root,name),
|
marco@9
|
110 os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
|
marco@4
|
111 fileslist.append(zipf)
|
marco@5
|
112 myzip.close()
|
marco@4
|
113 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
114 type = "SimpleZip"
|
marco@4
|
115 temp = True
|
marco@4
|
116 else: #create a list of files to upload
|
marco@4
|
117 for root, dirs, files in os.walk(data):
|
marco@4
|
118 for name in files:
|
marco@9
|
119 if not name.startswith('.'):
|
marco@9
|
120 fileslist.append(os.path.join(root,name))
|
marco@4
|
121 type = "multiple files"
|
marco@4
|
122 elif zipfile.is_zipfile(data): #This is a zip file
|
marco@4
|
123 fileslist.append(data)
|
marco@4
|
124 myzip = zipfile.ZipFile(data)
|
marco@4
|
125 if "mets.xml" in myzip.namelist(): #This is a METS package
|
marco@4
|
126 packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
|
marco@4
|
127 type = "METS"
|
marco@8
|
128 in_progress = False
|
marco@10
|
129 elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package
|
marco@10
|
130 packaging = "http://purl.org/net/sword/package/BagIt"
|
marco@10
|
131 type = "BAGIT"
|
marco@10
|
132 else:#THis is a simple zip file
|
marco@4
|
133 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
134 type = "SimpleZip"
|
marco@4
|
135 myzip.close()
|
marco@0
|
136 elif os.path.isfile(data): # This is a single file
|
marco@4
|
137 fileslist.append(data)
|
marco@4
|
138 type = "single file"
|
marco@0
|
139 else:
|
marco@0
|
140 print "Couldn't find the data."
|
marco@0
|
141 sys.exit()
|
marco@0
|
142
|
marco@0
|
143 print "------------------------"
|
marco@0
|
144 print "This is a ",type," submission"
|
marco@4
|
145
|
marco@4
|
146 # Create a metadata entry
|
marco@4
|
147 if (args.title != None) or (args.author != None) or (args.date != None):
|
marco@4
|
148 entry = Entry()
|
marco@4
|
149 if args.title != None:
|
marco@4
|
150 entry.add_fields(dcterms_title = args.title[0])
|
marco@4
|
151 if args.author != None:
|
marco@4
|
152 for creator in args.author:
|
marco@4
|
153 entry.add_fields(dcterms_creator=creator)
|
marco@4
|
154 if args.date != None:
|
marco@4
|
155 entry.add_fields(dcterms_created = args.date[0])
|
marco@4
|
156 else:
|
marco@4
|
157 entry = None
|
marco@4
|
158 # Select what to do
|
marco@8
|
159 if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
|
marco@4
|
160 try:
|
marco@4
|
161 # Create the metadata entry with ATOM
|
marco@4
|
162 print "------------------------"
|
marco@4
|
163 print "Creating the item..."
|
marco@4
|
164 if entry is None:
|
marco@4
|
165 entry = Entry(dcterms_title=(os.path.basename(data)))
|
marco@8
|
166 creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
|
marco@4
|
167
|
marco@4
|
168 # Add the files
|
marco@4
|
169 for f in fileslist:
|
marco@4
|
170 print "Uploading file ",os.path.basename(f)
|
marco@4
|
171 payload = open(f,"rb")
|
marco@4
|
172 deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
|
marco@4
|
173 payload = payload,
|
marco@4
|
174 filename = os.path.basename(f),
|
marco@4
|
175 mimetype = 'application/zip',
|
marco@4
|
176 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@4
|
177 payload.close()
|
marco@4
|
178 except HTTPResponseError:
|
marco@4
|
179 print "Bad request"
|
marco@4
|
180 else:
|
marco@8
|
181 # Send the zip file and let the ingester do its job
|
marco@4
|
182 payload = open(fileslist[0], "rb")
|
marco@10
|
183 if (type == "SimpleZip") or (type=="BAGIT"):
|
marco@8
|
184 in_progress = True
|
marco@9
|
185 # FIXME: we don't want to write silly things in dc.description!
|
marco@8
|
186 else:
|
marco@8
|
187 in_progress = False
|
marco@4
|
188 try:
|
marco@4
|
189 deposit_receipt = c.create(col_iri = collection.href,
|
marco@4
|
190 payload = payload,
|
marco@4
|
191 filename = fileslist[0],
|
marco@4
|
192 mimetype = "application/zip",
|
marco@8
|
193 packaging = packaging,
|
marco@8
|
194 in_progress = in_progress)
|
marco@4
|
195 print type, " submission successful."
|
marco@4
|
196 except:
|
marco@4
|
197 print "Error! Couldn't submit the file!"
|
marco@4
|
198 if type == "METS": # Just guessing: not sure this is the problem...
|
marco@4
|
199 print "To submit a METS package, the collection MUST have a workflow!"
|
marco@4
|
200 payload.close()
|
marco@4
|
201
|
marco@8
|
202 # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
|
marco@7
|
203 if type == "SimpleZip":
|
marco@7
|
204 if entry is None:
|
marco@7
|
205 entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
|
marco@4
|
206 try:
|
marco@8
|
207 update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
|
marco@7
|
208 print "Metadata update successfull."
|
marco@4
|
209 except:
|
marco@4
|
210 print "Server error"
|
marco@8
|
211 # If we want to store the zip file along with the individual files (Only SimpleZip)
|
marco@8
|
212 if storezip:
|
marco@8
|
213 try:
|
marco@8
|
214 payload = open(fileslist[0],"rb")
|
marco@8
|
215 zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
|
marco@8
|
216 payload = payload,
|
marco@8
|
217 filename = os.path.basename(fileslist[0]).replace(" ","_"),
|
marco@8
|
218 mimetype = 'application/zip',
|
marco@8
|
219 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@8
|
220 payload.close()
|
marco@8
|
221 print "Zip file successfully added to the bitstreams."
|
marco@8
|
222 except:
|
marco@8
|
223 print "Server error: could not add the zip file to the resources"
|
marco@4
|
224 if temp:
|
marco@4
|
225 os.remove(fileslist[0])
|
marco@0
|
226
|
marco@4
|
227 print "------------------------"
|
marco@4
|
228 print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."
|
marco@4
|
229
|
marco@0
|
230
|
marco@0
|
231 else: # Failed to connect to SWORDv2 Server
|
marco@0
|
232 print "Couldn't connect to the server."
|
marco@0
|
233 if attempts == 0:
|
marco@0
|
234 print "Invalid credentials entered 3 times."
|
marco@0
|
235
|
marco@0
|
236 except KeyboardInterrupt:
|
marco@0
|
237 print "------------------------"
|
marco@3
|
238 print "\nSubmission aborted by user." |