#!/usr/bin/env python

"""

SWORD2 DSpace bulk uploader - v0.5

A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview with modifications.

Dependencies:

- python 2.X

- sword2 library: https://github.com/swordapp/python-client-sword2

-----------------------------------
Centre for Digital Music, Queen Mary, University of London
Copyright (c) 2012 Marco Fabiani

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
-----------------------------------

A copy of this License can also be found in the COPYING file distributed with the source code.
"""

import argparse
import getpass
import os
import sys
import zipfile

# Provides Connection, Entry and HTTPResponseError used below.
from sword2 import *

# NOTE: every message is emitted with the single-argument print(...) form,
# which is valid syntax under both Python 2 and Python 3 and produces the
# same text as the multi-item print statements it replaces (including their
# doubled spaces). The script itself still targets Python 2 (raw_input).

# Service document used when --servicedoc is not given.
DEFAULT_SERVICE_DOC = "http://c4dm.eecs.qmul.ac.uk/rdr/swordv2/servicedocument"

# SWORD packaging identifiers sent with each deposit.
PKG_SIMPLE_ZIP = "http://purl.org/net/sword/package/SimpleZip"
PKG_METS = "http://purl.org/net/sword/package/METSDSpaceSIP"
PKG_BAGIT = "http://purl.org/net/sword/package/BagIt"
PKG_BINARY = "http://purl.org/net/sword/package/Binary"


def _visible_files(top):
    """Return the paths of all files below *top* whose name has no leading dot.

    Only the file name is checked, so files inside hidden directories are
    still returned. Paths come back in os.walk() order.
    """
    found = []
    for root, _dirs, files in os.walk(top):
        for name in files:
            if not name.startswith('.'):  # Do not upload hidden files, OSX/linux
                found.append(os.path.join(root, name))
    return found


def _add_binary_file(connection, edit_media_iri, path, filename):
    """Attach the file at *path* to an existing item as a Binary-packaged resource.

    The file is opened in binary mode and is always closed again, even when
    the request raises. Returns whatever add_file_to_resource() returns.
    """
    with open(path, "rb") as payload:
        # NOTE(review): the mimetype is 'application/zip' for every file, as in
        # the original script - confirm whether the server relies on it.
        return connection.add_file_to_resource(edit_media_iri=edit_media_iri,
                                               payload=payload,
                                               filename=filename,
                                               mimetype='application/zip',
                                               packaging=PKG_BINARY)


# Parse arguments
parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
parser.add_argument("data", type=str, nargs=1,
                    help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument("--username", dest="user_name", type=str, nargs=1, help="DSpace username.")
parser.add_argument("--title", dest="title", type=str, nargs=1, help="Title (ignored for METS packages).")
parser.add_argument("--author", dest="author", type=str, nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument("--date", dest="date", type=str, nargs=1, help="Date of creation (string) (ignored for METS packages).")
parser.add_argument("--zip", action="store_true", dest="zip", default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
# The help text is built from the constant so it cannot drift from the real default.
parser.add_argument("--servicedoc", dest="sd", type=str, nargs=1,
                    help="Url of the SWORDv2 service document (default: " + DEFAULT_SERVICE_DOC + ")")

args = parser.parse_args()
data = args.data[0]
# With --zip the archive is also stored next to the files the server extracts.
storezip = args.zip

if args.sd is None:
    sd = DEFAULT_SERVICE_DOC
else:
    sd = args.sd[0]


try:
    # Connect to SWORD server: it will always try to authenticate (no anonymous submissions!)
    attempts = 3  # Number of attempts left to connect to server
    connected = False
    while attempts > 0 and not connected:
        print("Connecting to SWORD server. Remaining attempts:  %s" % (attempts,))
        # Get username and password
        if args.user_name is None:
            user_name = raw_input("Username: ")
        else:
            user_name = args.user_name[0]
            print("Username:  %s" % (user_name,))
        user_pass = getpass.getpass("Password:")
        # Connect to the server
        c = Connection(sd, user_name=user_name, user_pass=user_pass, keep_history=False)

        # Get service document
        try:
            c.get_service_document()
        except Exception:  # Server error; Ctrl-C is left to the outer handler
            print("Server unreachable!")
            break
        if c.sd is not None:
            connected = True
        else:
            # No service document came back: treated as a failed login.
            attempts -= 1
            print("Incorrect username and/or password")

    if connected:
        # List available collections (only the first workspace is used)
        collections = c.workspaces[0][1]
        print("------------------------")
        print("Welcome to the  %s repository" % (c.workspaces[0][0],))
        print("Available Collections: ")
        numColl = len(collections)
        for ctr, coll in enumerate(collections):
            print("%s : %s" % (ctr + 1, coll.title))
        # Select a collection to deposit into: keep asking until the answer
        # is a number between 1 and numColl.
        sel = "0"
        while not sel.isdigit() or int(sel) <= 0 or int(sel) > numColl:
            sel = raw_input("Select a Collection to submit your files into: ")
        collection = collections[int(sel) - 1]
        print("Selected Collection:  %s" % (collection.title,))

        # Work out what kind of submission "data" is
        fileslist = []
        packaging = None
        temp = False  # True when fileslist[0] is a zip created here, to delete afterwards
        if os.path.isdir(data):
            if args.zip:
                # Zip all the files, keeping the structure relative to "data".
                dataname = os.path.basename(os.path.normpath(data))
                if args.title is not None:
                    zipf = args.title[0].replace(" ", "_") + ".zip"
                else:
                    zipf = dataname.replace(" ", "_") + ".zip"
                myzip = zipfile.ZipFile(zipf, "w")
                try:
                    print("Creating a zip archive for submission...")
                    for path in _visible_files(data):
                        # Remove spaces and square brackets from the archive names
                        arcname = os.path.relpath(path, data).replace(" ", "_").replace("[", "(").replace("]", ")")
                        myzip.write(path, arcname)
                finally:
                    myzip.close()
                fileslist.append(zipf)
                packaging = PKG_SIMPLE_ZIP
                sub_type = "SimpleZip"
                temp = True
            else:
                # Upload every visible file individually
                fileslist.extend(_visible_files(data))
                sub_type = "multiple files"
        elif zipfile.is_zipfile(data):  # This is a zip file
            fileslist.append(data)
            myzip = zipfile.ZipFile(data)
            try:
                names = myzip.namelist()
            finally:
                myzip.close()
            if "mets.xml" in names:  # This is a METS package
                packaging = PKG_METS
                sub_type = "METS"
            elif any(os.path.basename(n) == "bagit.txt" for n in names):
                # A bagit.txt member at any depth marks a BagIt package.
                packaging = PKG_BAGIT
                sub_type = "BAGIT"
            else:  # This is a simple zip file
                packaging = PKG_SIMPLE_ZIP
                sub_type = "SimpleZip"
        elif os.path.isfile(data):  # This is a single file
            fileslist.append(data)
            sub_type = "single file"
        else:
            print("Couldn't find the data.")
            sys.exit()

        print("------------------------")
        print("This is a  %s  submission" % (sub_type,))

        # Create a metadata entry from whichever of --title/--author/--date were given
        if (args.title is not None) or (args.author is not None) or (args.date is not None):
            entry = Entry()
            if args.title is not None:
                entry.add_fields(dcterms_title=args.title[0])
            if args.author is not None:
                for creator in args.author:
                    entry.add_fields(dcterms_creator=creator)
            if args.date is not None:
                entry.add_fields(dcterms_created=args.date[0])
        else:
            entry = None

        # Select what to do
        if sub_type in ("single file", "multiple files"):
            # Create the item from the metadata, then add the files one by one
            try:
                print("------------------------")
                print("Creating the item...")
                if entry is None:
                    # normpath so that "dir/" still yields "dir" as the title
                    entry = Entry(dcterms_title=(os.path.basename(os.path.normpath(data))))
                creation_receipt = c.create(col_iri=collection.href, metadata_entry=entry, in_progress=True)

                # Add the files
                for f in fileslist:
                    print("Uploading file  %s" % (os.path.basename(f),))
                    _add_binary_file(c, creation_receipt.edit_media, f, os.path.basename(f))
            except HTTPResponseError:
                print("Bad request")
        else:
            # Send the zip file and let the ingester do its job.
            # SimpleZip and BagIt deposits are left open; METS deposits are not.
            # FIXME: we don't want to write silly things in dc.description!
            in_progress = sub_type in ("SimpleZip", "BAGIT")
            deposit_receipt = None
            with open(fileslist[0], "rb") as payload:
                try:
                    deposit_receipt = c.create(col_iri=collection.href,
                                               payload=payload,
                                               filename=fileslist[0],
                                               mimetype="application/zip",
                                               packaging=packaging,
                                               in_progress=in_progress)
                    print("%s  submission successful." % (sub_type,))
                except Exception:
                    print("Error! Couldn't submit the file!")
                    if sub_type == "METS":  # Just guessing: not sure this is the problem...
                        print("To submit a METS package, the collection MUST have a workflow!")

            # Update the metadata (only SimpleZip), and only when the deposit
            # above actually produced a receipt to update.
            if sub_type == "SimpleZip" and deposit_receipt is not None:
                if entry is None:
                    entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
                try:
                    # in_progress is True: we don't want to close the submission
                    c.update(dr=deposit_receipt, metadata_entry=entry, in_progress=True)
                    print("Metadata update successfull.")
                except Exception:
                    print("Server error")
                # Store the zip file along with the individual files (only SimpleZip)
                if storezip:
                    try:
                        _add_binary_file(c, deposit_receipt.edit_media, fileslist[0],
                                         os.path.basename(fileslist[0]).replace(" ", "_"))
                        print("Zip file successfully added to the bitstreams.")
                    except Exception:
                        print("Server error: could not add the zip file to the resources")
            if temp:
                # Remove the archive that was created only for this upload
                os.remove(fileslist[0])

        print("------------------------")
        print("You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\".")

    else:  # Failed to connect to SWORDv2 Server
        print("Couldn't connect to the server.")
        if attempts == 0:
            print("Invalid credentials entered 3 times.")

except KeyboardInterrupt:
    print("------------------------")
    print("\nSubmission aborted by user.")