#!/usr/bin/env python

"""

SWORD2 DSpace bulk uploader - v0.6

A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
Built on the SWORD2 python client library: https://github.com/swordapp/python-client-sword2.

Dependencies:

- python 2.X

- sword2 library: https://github.com/swordapp/python-client-sword2

-----------------------------------
Updates log:

v0.6: - now uploading a directory will also maintain the path structure
      - introduced a file in which to specify the server (server.cfg)
v0.5: changed the default server to C4DM live server

-----------------------------------
Centre for Digital Music, Queen Mary, University of London
Copyright (c) 2012 Marco Fabiani

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
-----------------------------------

A copy of this License can also be found in the COPYING file distributed with the source code.
"""
marco@0
|
50
|
marco@4
|
51 import argparse, getpass, zipfile, os, sys
|
marco@0
|
52 from sword2 import *
|
marco@0
|
53
|
marco@0
|
# Command-line interface: one positional argument ("data") plus optional
# credentials, metadata and server settings. Options declared with nargs=1
# are stored as one-element lists, which is why they are indexed with [0]
# wherever they are read.
parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog=(
        "If the submission is created successfully, it will remain open to "
        "be completed with the necessary metadata and licenses, using the "
        "DSpace web interface. The submission can be found in the "
        "\"My Account -> Submissions\" section of the user's area."
    ),
)
parser.add_argument(
    "data",
    type=str,
    nargs=1,
    help=(
        "Accepts: METSDSpaceSIP and BagIt packages, simple zip files, "
        "directories, single files. NOTE: METSDSpaceSIP packages are only "
        "accepted by Collections with a workflow!"
    ),
)
parser.add_argument(
    "--username",
    dest="user_name",
    type=str,
    nargs=1,
    help="DSpace username.",
)
parser.add_argument(
    "--title",
    dest="title",
    type=str,
    nargs=1,
    help="Title (ignored for METS packages).",
)
parser.add_argument(
    "--author",
    dest="author",
    type=str,
    nargs="+",
    help=(
        "Author(s) (ignored for METS packages). Accepts multiple entries "
        "in the format \"Surname, Name\""
    ),
)
parser.add_argument(
    "--date",
    dest="date",
    type=str,
    nargs=1,
    help="Date of creation (string) (ignored for METS packages).",
)
parser.add_argument(
    "--zip",
    dest="zip",
    action="store_true",
    default=False,
    help=(
        "If \"data\" is a directory, compress it and post it as a single "
        "file. The zip file will be saved along with the individual files."
    ),
)
parser.add_argument(
    "--servicedoc",
    dest="sd",
    type=str,
    nargs=1,
    help=(
        "Url of the SWORDv2 service document (default: use server.cfg if "
        "available, otherwise "
        "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"
    ),
)

args = parser.parse_args()
# "data" is declared with nargs=1, so it arrives as a one-element list.
(data,) = args.data
|
marco@13
|
# Mirror the --zip switch: when set, the zip archive is later also added to
# the item alongside the individual files.
storezip = bool(args.zip)
|
marco@8
|
71
|
marco@13
|
# Resolve the URL of the SWORDv2 service document, in order of precedence:
#   1. the --servicedoc command line option,
#   2. the first line of a "server.cfg" file in the current directory,
#   3. the built-in C4DM default.
if args.sd is not None:
    sd = args.sd[0]
else:
    sd = ""
    try:
        with open("server.cfg", "r") as cfg:
            # Strip the line terminator (and stray blanks): readline() keeps
            # the trailing newline, which would otherwise end up in the URL.
            sd = cfg.readline().strip()
    except IOError:
        # No readable server.cfg: fall through to the default below.
        pass
    if sd:
        print("server.cfg:  %s" % (sd,))
    else:
        # Also reached when server.cfg exists but its first line is empty.
        sd = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"
|
marco@0
|
81
|
marco@0
|
82
|
marco@0
|
# Interactive upload session.
#
# Reads the module-level names set up earlier in the script (args, data, sd,
# storezip) and uses Connection, Entry and HTTPResponseError, which are not
# defined in this file and so presumably come from "from sword2 import *".
#
# Steps: authenticate (up to three password attempts), let the user pick a
# collection, work out what kind of submission "data" is, deposit it, and
# finally tell the user where to find it. Ctrl-C at any point aborts the
# session with a short message.
#
# Output note: every print uses the single-argument print(...) form, which
# the Python 2 print statement treats exactly like the original
# comma-separated form (the doubled spaces in some messages are the ones the
# original produced).
try:
    # Connect to the SWORD server: authentication is always attempted (no
    # anonymous submissions).
    attempts = 3  # Number of attempts left to connect to the server
    connected = False
    while attempts > 0 and not connected:
        print("Connecting to SWORD server. Remaining attempts:  %s"
              % (attempts,))
        # Get username and password
        if args.user_name is None:
            user_name = raw_input("Username: ")
        else:
            user_name = args.user_name[0]
            print("Username:  %s" % (user_name,))
        user_pass = getpass.getpass("Password:")
        c = Connection(sd, user_name=user_name, user_pass=user_pass,
                       keep_history=False)

        # Get the service document
        try:
            c.get_service_document()
        except Exception:
            # Deliberately not a bare "except:": KeyboardInterrupt is not an
            # Exception subclass, so Ctrl-C still reaches the handler at the
            # bottom instead of being reported as a server problem.
            print("Server unreachable!")
            break
        if c.sd is not None:
            connected = True
        else:
            # No service document came back: treated as bad credentials.
            attempts -= 1
            print("Incorrect username and/or password")

    if not connected:
        # Failed to connect to the SWORDv2 server
        print("Couldn't connect to the server.")
        if attempts == 0:
            print("Invalid credentials entered 3 times.")
    else:
        # Only the first workspace is offered; each workspace is indexed as
        # (title, list of collections).
        workspaces = c.workspaces
        collections = workspaces[0][1] if workspaces else []
        if not collections:
            # Without this guard the selection prompt below could never be
            # satisfied (or indexing the workspace would fail).
            print("No collections available for deposit.")
            sys.exit()

        # List available collections
        print("------------------------")
        print("Welcome to the  %s repository" % (workspaces[0][0],))
        print("Available Collections: ")
        for number, coll in enumerate(collections, 1):
            print("%s : %s" % (number, coll.title))

        # Select a collection to deposit into: keep asking until the answer
        # is a number between 1 and the number of collections.
        sel = "0"
        while not (sel.isdigit() and 0 < int(sel) <= len(collections)):
            sel = raw_input("Select a Collection to submit your files into: ")
        collection = collections[int(sel) - 1]
        print("Selected Collection:  %s" % (collection.title,))

        # Work out what is being submitted.
        fileslist = []
        packaging = None
        temp = False  # True when fileslist[0] is an archive created here
        if os.path.isdir(data):
            if args.zip:
                # Zip all the files, keeping the structure relative to the
                # directory itself.
                dataname = os.path.basename(os.path.normpath(data))
                if args.title is not None:
                    zipf = args.title[0].replace(" ", "_") + ".zip"
                else:
                    zipf = dataname.replace(" ", "_") + ".zip"
                zip_abspath = os.path.abspath(zipf)
                print("Creating a zip archive for submission...")
                myzip = zipfile.ZipFile(zipf, "w")
                try:
                    for root, dirs, files in os.walk(data):
                        for name in files:
                            if name.startswith('.'):
                                # Do not upload hidden files (OSX/linux)
                                continue
                            path = os.path.join(root, name)
                            if os.path.abspath(path) == zip_abspath:
                                # The archive is written to the current
                                # directory; never add it to itself.
                                continue
                            # Remove spaces and square brackets from the
                            # names stored in the archive.
                            arcname = os.path.relpath(path, data)
                            arcname = arcname.replace(" ", "_")
                            arcname = arcname.replace("[", "(")
                            arcname = arcname.replace("]", ")")
                            myzip.write(path, arcname)
                finally:
                    myzip.close()
                fileslist.append(zipf)
                packaging = "http://purl.org/net/sword/package/SimpleZip"
                submission_type = "SimpleZip"
                temp = True
            else:
                # Create the list of files to upload one by one
                for root, dirs, files in os.walk(data):
                    for name in files:
                        if not name.startswith('.'):
                            fileslist.append(os.path.join(root, name))
                submission_type = "multiple files"
        elif zipfile.is_zipfile(data):
            fileslist.append(data)
            myzip = zipfile.ZipFile(data)
            try:
                names = myzip.namelist()
            finally:
                myzip.close()
            if "mets.xml" in names:
                # METS package: mets.xml at the top level of the archive
                packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
                submission_type = "METS"
            elif any("bagit.txt" in entry_name for entry_name in names):
                # BagIt package: bagit.txt anywhere in the archive
                packaging = "http://purl.org/net/sword/package/BagIt"
                submission_type = "BAGIT"
            else:
                # Plain zip file
                packaging = "http://purl.org/net/sword/package/SimpleZip"
                submission_type = "SimpleZip"
        elif os.path.isfile(data):
            fileslist.append(data)
            submission_type = "single file"
        else:
            print("Couldn't find the data.")
            sys.exit()

        print("------------------------")
        print("This is a  %s  submission" % (submission_type,))

        # Build a metadata entry from whichever of --title/--author/--date
        # were given; None means "no metadata on the command line".
        if args.title is None and args.author is None and args.date is None:
            entry = None
        else:
            entry = Entry()
            if args.title is not None:
                entry.add_fields(dcterms_title=args.title[0])
            if args.author is not None:
                for creator in args.author:
                    entry.add_fields(dcterms_creator=creator)
            if args.date is not None:
                entry.add_fields(dcterms_created=args.date[0])

        submitted = False  # Becomes True once an item exists on the server
        if submission_type in ("single file", "multiple files"):
            # Create the item first, then attach the files one at a time.
            try:
                print("------------------------")
                print("Creating the item...")
                if entry is None:
                    # Default title: the file or directory name. normpath
                    # drops a trailing separator, which would otherwise make
                    # basename() return an empty string.
                    entry = Entry(dcterms_title=os.path.basename(
                        os.path.normpath(data)))
                creation_receipt = c.create(col_iri=collection.href,
                                            metadata_entry=entry,
                                            in_progress=True)
                submitted = True

                # Send each file under its path relative to the deepest
                # directory shared by all files, to keep the structure.
                # commonprefix() compares character by character, so its
                # result is cut back to a directory boundary with dirname().
                common = os.path.dirname(os.path.commonprefix(fileslist))
                common = common or os.curdir
                for f in fileslist:
                    filename = os.path.relpath(f, common)
                    print("Uploading file  %s" % (filename,))
                    with open(f, "rb") as payload:
                        # NOTE(review): every file is declared as
                        # application/zip whatever its real type; presumably
                        # this is what the server accepts - confirm before
                        # changing.
                        c.add_file_to_resource(
                            edit_media_iri=creation_receipt.edit_media,
                            payload=payload,
                            filename=filename,
                            mimetype='application/zip',
                            packaging='http://purl.org/net/sword/package/Binary')
            except HTTPResponseError:
                print("Bad request")
        else:
            # Send the zip file and let the ingester do its job.
            # FIXME: we don't want to write silly things in dc.description!
            in_progress = submission_type in ("SimpleZip", "BAGIT")
            deposit_receipt = None
            with open(fileslist[0], "rb") as payload:
                try:
                    deposit_receipt = c.create(col_iri=collection.href,
                                               payload=payload,
                                               filename=fileslist[0],
                                               mimetype="application/zip",
                                               packaging=packaging,
                                               in_progress=in_progress)
                except Exception:
                    print("Error! Couldn't submit the file!")
                    if submission_type == "METS":
                        # Just guessing: not sure this is the problem...
                        print("To submit a METS package, the collection MUST have a workflow!")
                else:
                    print("%s  submission successful." % (submission_type,))
            submitted = deposit_receipt is not None

            # The follow-up requests need the deposit receipt, so they are
            # skipped when the deposit itself failed (only SimpleZip).
            if submitted and submission_type == "SimpleZip":
                if entry is None:
                    entry = Entry(
                        dcterms_title=os.path.basename(fileslist[0]))
                try:
                    # in_progress is True: we don't want to close the
                    # submission.
                    c.update(dr=deposit_receipt, metadata_entry=entry,
                             in_progress=True)
                    print("Metadata update successfull.")
                except Exception:
                    print("Server error")
                # Optionally store the zip file itself along with the
                # individual files.
                if storezip:
                    try:
                        with open(fileslist[0], "rb") as payload:
                            c.add_file_to_resource(
                                edit_media_iri=deposit_receipt.edit_media,
                                payload=payload,
                                filename=os.path.basename(
                                    fileslist[0]).replace(" ", "_"),
                                mimetype='application/zip',
                                packaging='http://purl.org/net/sword/package/Binary')
                        print("Zip file successfully added to the bitstreams.")
                    except Exception:
                        print("Server error: could not add the zip file to the resources")
            if temp:
                # Remove the archive that was created only for this upload.
                os.remove(fileslist[0])

        if submitted:
            print("------------------------")
            print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

except KeyboardInterrupt:
    print("------------------------")
    print("\nSubmission aborted by user.")