marco@0
|
1 #!/usr/bin/env python
|
marco@1
|
2
|
marco@11
|
3 """ SWORD2 DSpace bulk uploader - v0.4
|
marco@1
|
4
|
marco@1
|
5 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
|
marco@1
|
6 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview
|
marco@1
|
7
|
marco@1
|
8 Dependencies:
|
marco@1
|
9
|
marco@1
|
10 - python 2.X
|
marco@1
|
11
|
marco@4
|
12 - sword2 library, with modifications:
|
marco@4
|
13 (original) https://bitbucket.org/beno/python-sword2/src
|
marco@11
|
14 (modified) https://bitbucket.org/marcofabiani/python-sword2/src
|
marco@1
|
15
|
marco@1
|
16 -----------------------------------
|
marco@11
|
17 Centre for Digital Music, Queen Mary, University of London
|
marco@11
|
18 Copyright (c) 2012 Marco Fabiani
|
marco@11
|
19
|
marco@11
|
20 Permission is hereby granted, free of charge, to any person
|
marco@11
|
21 obtaining a copy of this software and associated documentation
|
marco@11
|
22 files (the "Software"), to deal in the Software without
|
marco@11
|
23 restriction, including without limitation the rights to use, copy,
|
marco@11
|
24 modify, merge, publish, distribute, sublicense, and/or sell copies
|
marco@11
|
25 of the Software, and to permit persons to whom the Software is
|
marco@11
|
26 furnished to do so, subject to the following conditions:
|
marco@11
|
27
|
marco@11
|
28 The above copyright notice and this permission notice shall be
|
marco@11
|
29 included in all copies or substantial portions of the Software.
|
marco@11
|
30
|
marco@11
|
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
marco@11
|
32 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
marco@11
|
33 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
marco@11
|
34 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
marco@11
|
35 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
marco@11
|
36 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
marco@11
|
37 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
marco@11
|
38 OTHER DEALINGS IN THE SOFTWARE.
|
marco@1
|
39 -----------------------------------
|
marco@11
|
40 A copy of this License can also be found in the COPYING file distributed with the source code.
|
marco@1
|
41 """
|
marco@0
|
42
|
marco@4
|
43 import argparse, getpass, zipfile, os, sys
|
marco@0
|
44 from sword2 import *
|
marco@0
|
45
|
# Parse arguments

# Service document used when --servicedoc is not given on the command line.
DEFAULT_SERVICE_DOCUMENT = "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"

parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog=("If the submission is created successfully, it will remain open "
            "to be completed with the necessary metadata and licenses, using "
            "the DSpace web interface. The submission can be found in the "
            "\"My Account -> Submissions\" section of the user's area."))
parser.add_argument(
    "data", type=str, nargs=1,
    help="Accepts: METSDSpaceSIP packages, zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
parser.add_argument(
    "--username", dest="user_name", type=str, nargs=1,
    help="DSpace username.")
parser.add_argument(
    "--zip", action="store_true", dest="zip", default=False,
    help="If \"data\" is a directory, send it as a single zip archive to preserve its structure. The zip file will be saved along with the individual files.")
parser.add_argument(
    "--title", dest="title", type=str, nargs=1,
    help="Title (ignored for METS packages).")
parser.add_argument(
    "--author", dest="author", type=str, nargs="+",
    help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
parser.add_argument(
    "--date", dest="date", type=str, nargs=1,
    help="Date of creation (string) (ignored for METS packages).")
parser.add_argument(
    "--servicedoc", dest="dspaceurl", type=str, nargs=1,
    help="Url of the SWORDv2 service document (default: " + DEFAULT_SERVICE_DOCUMENT + ")")

# Every nargs=1 option is stored as a one-element list (or None when absent),
# which is why values are read with [0] here and further down in the script.
args = parser.parse_args()
data = args.data[0]
# When True, the zip archive itself is also attached to the item after a
# SimpleZip deposit, next to the files extracted from it.
storezip = True

if args.dspaceurl is None:
    dspaceurl = DEFAULT_SERVICE_DOCUMENT
else:
    dspaceurl = args.dspaceurl[0]
marco@0
|
65
|
marco@0
|
66
|
marco@0
|
67 try:
|
marco@0
|
68 # Connect to SWORD server
|
marco@0
|
69 attempts = 3 # Number of attempts left to connect to server
|
marco@0
|
70 connected = False
|
marco@0
|
71 while attempts>0 and not connected:
|
marco@0
|
72 print "Connecting to SWORD server. Remaining attempts: ", attempts
|
marco@0
|
73 # Try to login, get service document
|
marco@0
|
74 # Get username and password
|
marco@0
|
75 if args.user_name == None:
|
marco@0
|
76 user_name = raw_input("DSpace Username: ")
|
marco@0
|
77 else:
|
marco@0
|
78 user_name = args.user_name[0]
|
marco@0
|
79 print "DSpace Username: ",user_name
|
marco@0
|
80 user_pass = getpass.getpass("DSpace password:")
|
marco@0
|
81 # Connect to the server
|
marco@0
|
82 c = Connection(dspaceurl, user_name=user_name, user_pass=user_pass,keep_history=False)
|
marco@3
|
83
|
marco@0
|
84 # Get service document
|
marco@0
|
85 try:
|
marco@0
|
86 c.get_service_document()
|
marco@3
|
87 except: # Server error
|
marco@3
|
88 print "Server unreachable!"
|
marco@3
|
89 break
|
marco@3
|
90 if c.sd != None:
|
marco@3
|
91 connected = True
|
marco@3
|
92 else:
|
marco@0
|
93 attempts-=1
|
marco@0
|
94 print "Incorrect username and/or password"
|
marco@3
|
95
|
marco@0
|
96
|
marco@0
|
97 if connected:
|
marco@0
|
98 # List available collections
|
marco@0
|
99 print "------------------------"
|
marco@0
|
100 print "Welcome to the ",c.workspaces[0][0], "repository"
|
marco@0
|
101 print "Available Collections: "
|
marco@0
|
102 numColl = len(c.workspaces[0][1])
|
marco@0
|
103 for ctr in range(numColl):
|
marco@0
|
104 coll = c.workspaces[0][1][ctr]
|
marco@0
|
105 print ctr+1,":",coll.title
|
marco@0
|
106 # Select a collection to deposit into
|
marco@0
|
107 sel = -1
|
marco@0
|
108 while (sel<=0 or sel>numColl):
|
marco@0
|
109 sel = input("Select a Collection to submit your files into: ")
|
marco@0
|
110 collection = c.workspaces[0][1][sel-1]
|
marco@0
|
111 print "Selected Collection: ",collection.title
|
marco@0
|
112
|
marco@4
|
113 # Create a submission
|
marco@4
|
114 fileslist = []
|
marco@4
|
115 temp = False # Delete temp files
|
marco@4
|
116 # If folder
|
marco@4
|
117 if os.path.isdir(data):
|
marco@4
|
118 if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
|
marco@4
|
119 dataname = os.path.basename(os.path.normpath(data))
|
marco@9
|
120 if args.title != None:
|
marco@9
|
121 zipf = args.title[0].replace(" ","_")+".zip"
|
marco@9
|
122 else:
|
marco@9
|
123 zipf = dataname.replace(" ","_")+".zip"
|
marco@4
|
124 myzip = zipfile.ZipFile(zipf, "w")
|
marco@4
|
125 # get the directory structure
|
marco@4
|
126 print "Creating a zip archive for submission..."
|
marco@4
|
127 for root, dirs, files in os.walk(data):
|
marco@4
|
128 for name in files:
|
marco@8
|
129 if not name.startswith('.'): # Do not upload hidden files, OSX/linux
|
marco@8
|
130 myzip.write(os.path.join(root,name),
|
marco@9
|
131 os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
|
marco@4
|
132 fileslist.append(zipf)
|
marco@5
|
133 myzip.close()
|
marco@4
|
134 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
135 type = "SimpleZip"
|
marco@4
|
136 temp = True
|
marco@4
|
137 else: #create a list of files to upload
|
marco@4
|
138 for root, dirs, files in os.walk(data):
|
marco@4
|
139 for name in files:
|
marco@9
|
140 if not name.startswith('.'):
|
marco@9
|
141 fileslist.append(os.path.join(root,name))
|
marco@4
|
142 type = "multiple files"
|
marco@4
|
143 elif zipfile.is_zipfile(data): #This is a zip file
|
marco@4
|
144 fileslist.append(data)
|
marco@4
|
145 myzip = zipfile.ZipFile(data)
|
marco@4
|
146 if "mets.xml" in myzip.namelist(): #This is a METS package
|
marco@4
|
147 packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
|
marco@4
|
148 type = "METS"
|
marco@8
|
149 in_progress = False
|
marco@10
|
150 elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package
|
marco@10
|
151 packaging = "http://purl.org/net/sword/package/BagIt"
|
marco@10
|
152 type = "BAGIT"
|
marco@10
|
153 else:#THis is a simple zip file
|
marco@4
|
154 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
155 type = "SimpleZip"
|
marco@4
|
156 myzip.close()
|
marco@0
|
157 elif os.path.isfile(data): # This is a single file
|
marco@4
|
158 fileslist.append(data)
|
marco@4
|
159 type = "single file"
|
marco@0
|
160 else:
|
marco@0
|
161 print "Couldn't find the data."
|
marco@0
|
162 sys.exit()
|
marco@0
|
163
|
marco@0
|
164 print "------------------------"
|
marco@0
|
165 print "This is a ",type," submission"
|
marco@4
|
166
|
marco@4
|
167 # Create a metadata entry
|
marco@4
|
168 if (args.title != None) or (args.author != None) or (args.date != None):
|
marco@4
|
169 entry = Entry()
|
marco@4
|
170 if args.title != None:
|
marco@4
|
171 entry.add_fields(dcterms_title = args.title[0])
|
marco@4
|
172 if args.author != None:
|
marco@4
|
173 for creator in args.author:
|
marco@4
|
174 entry.add_fields(dcterms_creator=creator)
|
marco@4
|
175 if args.date != None:
|
marco@4
|
176 entry.add_fields(dcterms_created = args.date[0])
|
marco@4
|
177 else:
|
marco@4
|
178 entry = None
|
marco@4
|
179 # Select what to do
|
marco@8
|
180 if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
|
marco@4
|
181 try:
|
marco@4
|
182 # Create the metadata entry with ATOM
|
marco@4
|
183 print "------------------------"
|
marco@4
|
184 print "Creating the item..."
|
marco@4
|
185 if entry is None:
|
marco@4
|
186 entry = Entry(dcterms_title=(os.path.basename(data)))
|
marco@8
|
187 creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
|
marco@4
|
188
|
marco@4
|
189 # Add the files
|
marco@4
|
190 for f in fileslist:
|
marco@4
|
191 print "Uploading file ",os.path.basename(f)
|
marco@4
|
192 payload = open(f,"rb")
|
marco@4
|
193 deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
|
marco@4
|
194 payload = payload,
|
marco@4
|
195 filename = os.path.basename(f),
|
marco@4
|
196 mimetype = 'application/zip',
|
marco@4
|
197 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@4
|
198 payload.close()
|
marco@4
|
199 except HTTPResponseError:
|
marco@4
|
200 print "Bad request"
|
marco@4
|
201 else:
|
marco@8
|
202 # Send the zip file and let the ingester do its job
|
marco@4
|
203 payload = open(fileslist[0], "rb")
|
marco@10
|
204 if (type == "SimpleZip") or (type=="BAGIT"):
|
marco@8
|
205 in_progress = True
|
marco@9
|
206 # FIXME: we don't want to write silly things in dc.description!
|
marco@8
|
207 else:
|
marco@8
|
208 in_progress = False
|
marco@4
|
209 try:
|
marco@4
|
210 deposit_receipt = c.create(col_iri = collection.href,
|
marco@4
|
211 payload = payload,
|
marco@4
|
212 filename = fileslist[0],
|
marco@4
|
213 mimetype = "application/zip",
|
marco@8
|
214 packaging = packaging,
|
marco@8
|
215 in_progress = in_progress)
|
marco@4
|
216 print type, " submission successful."
|
marco@4
|
217 except:
|
marco@4
|
218 print "Error! Couldn't submit the file!"
|
marco@4
|
219 if type == "METS": # Just guessing: not sure this is the problem...
|
marco@4
|
220 print "To submit a METS package, the collection MUST have a workflow!"
|
marco@4
|
221 payload.close()
|
marco@4
|
222
|
marco@8
|
223 # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
|
marco@7
|
224 if type == "SimpleZip":
|
marco@7
|
225 if entry is None:
|
marco@7
|
226 entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
|
marco@4
|
227 try:
|
marco@8
|
228 update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
|
marco@7
|
229 print "Metadata update successfull."
|
marco@4
|
230 except:
|
marco@4
|
231 print "Server error"
|
marco@8
|
232 # If we want to store the zip file along with the individual files (Only SimpleZip)
|
marco@8
|
233 if storezip:
|
marco@8
|
234 try:
|
marco@8
|
235 payload = open(fileslist[0],"rb")
|
marco@8
|
236 zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
|
marco@8
|
237 payload = payload,
|
marco@8
|
238 filename = os.path.basename(fileslist[0]).replace(" ","_"),
|
marco@8
|
239 mimetype = 'application/zip',
|
marco@8
|
240 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@8
|
241 payload.close()
|
marco@8
|
242 print "Zip file successfully added to the bitstreams."
|
marco@8
|
243 except:
|
marco@8
|
244 print "Server error: could not add the zip file to the resources"
|
marco@4
|
245 if temp:
|
marco@4
|
246 os.remove(fileslist[0])
|
marco@0
|
247
|
marco@4
|
248 print "------------------------"
|
marco@4
|
249 print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."
|
marco@4
|
250
|
marco@0
|
251
|
marco@0
|
252 else: # Failed to connect to SWORDv2 Server
|
marco@0
|
253 print "Couldn't connect to the server."
|
marco@0
|
254 if attempts == 0:
|
marco@0
|
255 print "Invalid credentials entered 3 times."
|
marco@0
|
256
|
marco@0
|
257 except KeyboardInterrupt:
|
marco@0
|
258 print "------------------------"
|
marco@3
|
259 print "\nSubmission aborted by user." |