marco@0
|
1 #!usr/bin/env/ python
|
marco@1
|
2
|
marco@12
|
3 """
|
marco@12
|
4
|
marco@13
|
5 SWORD2 DSpace bulk uploader - v0.5
|
marco@1
|
6
|
marco@1
|
7 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
|
marco@12
|
8 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview with modifications.
|
marco@1
|
9
|
marco@1
|
10 Dependencies:
|
marco@1
|
11
|
marco@1
|
12 - python 2.X
|
marco@1
|
13
|
marco@12
|
14 - sword2 library, with modifications: https://bitbucket.org/marcofabiani/python-sword2/src
|
marco@1
|
15
|
marco@1
|
16 -----------------------------------
|
marco@11
|
17 Centre for Digital Music, Queen Mary, University of London
|
marco@11
|
18 Copyright (c) 2012 Marco Fabiani
|
marco@11
|
19
|
marco@11
|
20 Permission is hereby granted, free of charge, to any person
|
marco@11
|
21 obtaining a copy of this software and associated documentation
|
marco@11
|
22 files (the "Software"), to deal in the Software without
|
marco@11
|
23 restriction, including without limitation the rights to use, copy,
|
marco@11
|
24 modify, merge, publish, distribute, sublicense, and/or sell copies
|
marco@11
|
25 of the Software, and to permit persons to whom the Software is
|
marco@11
|
26 furnished to do so, subject to the following conditions:
|
marco@11
|
27
|
marco@11
|
28 The above copyright notice and this permission notice shall be
|
marco@11
|
29 included in all copies or substantial portions of the Software.
|
marco@11
|
30
|
marco@11
|
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
marco@11
|
32 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
marco@11
|
33 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
marco@11
|
34 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
marco@11
|
35 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
marco@11
|
36 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
marco@11
|
37 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
marco@11
|
38 OTHER DEALINGS IN THE SOFTWARE.
|
marco@1
|
39 -----------------------------------
|
marco@12
|
40
|
marco@11
|
41 A copy of this License can also be found in the COPYING file distributed with the source code.
|
marco@1
|
42 """
|
marco@0
|
43
|
marco@4
|
44 import argparse, getpass, zipfile, os, sys
|
marco@0
|
45 from sword2 import *
|
marco@0
|
46
|
marco@0
|
47 # Parse arguments
|
marco@0
|
48 parser = argparse.ArgumentParser(description="Bulk upload to DSpace using SWORDv2.",epilog="If the submission is created successfully, it will remain open to be completed with the necessary metadata and licenses, using the DSpace web interface. The submission can be found in the \"My Account -> Submissions\" section of the user's area.")
|
marco@0
|
49 parser.add_argument("data", type=str, nargs=1,
|
marco@12
|
50 help="Accepts: METSDSpaceSIP and BagIt packages, simple zip files, directories, single files. NOTE: METSDSpaceSIP packages are only accepted by Collections with a workflow!")
|
marco@0
|
51 parser.add_argument("--username", dest="user_name", type=str,nargs=1, help="DSpace username.")
|
marco@12
|
52 parser.add_argument("--zip", action="store_true",dest="zip",default=False, help="If \"data\" is a directory, compress it and post it as a single file. The zip file will be saved along with the individual files.")
|
marco@0
|
53 parser.add_argument("--title", dest="title", type=str,nargs=1, help="Title (ignored for METS packages).")
|
marco@0
|
54 parser.add_argument("--author", dest="author", type=str,nargs="+", help="Author(s) (ignored for METS packages). Accepts multiple entries in the format \"Surname, Name\"")
|
marco@0
|
55 parser.add_argument("--date", dest="date", type=str,nargs=1, help="Date of creation (string) (ignored for METS packages).")
|
marco@13
|
56 parser.add_argument("--servicedoc", dest="sd", type=str,nargs=1, help="Url of the SWORDv2 service document (default: http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument")
|
marco@0
|
57
|
marco@0
|
58 args = parser.parse_args()
|
marco@0
|
59 data = args.data[0]
|
marco@13
|
60 if args.zip:
|
marco@13
|
61 storezip = True
|
marco@13
|
62 else:
|
marco@13
|
63 storezip = False
|
marco@8
|
64
|
marco@13
|
65 if args.sd == None:
|
marco@13
|
66 sd = "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument"
|
marco@0
|
67 else:
|
marco@13
|
68 sd = args.sd[0]
|
marco@0
|
69
|
marco@0
|
70
|
marco@0
|
71 try:
|
marco@13
|
72 # Connect to SWORD server: it will always try to authenticate (no anonymous submissions!
|
marco@0
|
73 attempts = 3 # Number of attempts left to connect to server
|
marco@0
|
74 connected = False
|
marco@0
|
75 while attempts>0 and not connected:
|
marco@0
|
76 print "Connecting to SWORD server. Remaining attempts: ", attempts
|
marco@0
|
77 # Try to login, get service document
|
marco@0
|
78 # Get username and password
|
marco@0
|
79 if args.user_name == None:
|
marco@13
|
80 user_name = raw_input("Username: ")
|
marco@0
|
81 else:
|
marco@0
|
82 user_name = args.user_name[0]
|
marco@13
|
83 print "Username: ",user_name
|
marco@13
|
84 user_pass = getpass.getpass("Password:")
|
marco@0
|
85 # Connect to the server
|
marco@13
|
86 c = Connection(sd, user_name=user_name, user_pass=user_pass,keep_history=False)
|
marco@3
|
87
|
marco@0
|
88 # Get service document
|
marco@0
|
89 try:
|
marco@0
|
90 c.get_service_document()
|
marco@3
|
91 except: # Server error
|
marco@3
|
92 print "Server unreachable!"
|
marco@3
|
93 break
|
marco@3
|
94 if c.sd != None:
|
marco@3
|
95 connected = True
|
marco@3
|
96 else:
|
marco@0
|
97 attempts-=1
|
marco@0
|
98 print "Incorrect username and/or password"
|
marco@3
|
99
|
marco@0
|
100
|
marco@0
|
101 if connected:
|
marco@0
|
102 # List available collections
|
marco@0
|
103 print "------------------------"
|
marco@0
|
104 print "Welcome to the ",c.workspaces[0][0], "repository"
|
marco@0
|
105 print "Available Collections: "
|
marco@0
|
106 numColl = len(c.workspaces[0][1])
|
marco@0
|
107 for ctr in range(numColl):
|
marco@0
|
108 coll = c.workspaces[0][1][ctr]
|
marco@0
|
109 print ctr+1,":",coll.title
|
marco@0
|
110 # Select a collection to deposit into
|
marco@0
|
111 sel = -1
|
marco@0
|
112 while (sel<=0 or sel>numColl):
|
marco@0
|
113 sel = input("Select a Collection to submit your files into: ")
|
marco@0
|
114 collection = c.workspaces[0][1][sel-1]
|
marco@0
|
115 print "Selected Collection: ",collection.title
|
marco@0
|
116
|
marco@4
|
117 # Create a submission
|
marco@4
|
118 fileslist = []
|
marco@4
|
119 temp = False # Delete temp files
|
marco@4
|
120 # If folder
|
marco@4
|
121 if os.path.isdir(data):
|
marco@4
|
122 if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
|
marco@4
|
123 dataname = os.path.basename(os.path.normpath(data))
|
marco@9
|
124 if args.title != None:
|
marco@9
|
125 zipf = args.title[0].replace(" ","_")+".zip"
|
marco@9
|
126 else:
|
marco@9
|
127 zipf = dataname.replace(" ","_")+".zip"
|
marco@4
|
128 myzip = zipfile.ZipFile(zipf, "w")
|
marco@4
|
129 # get the directory structure
|
marco@4
|
130 print "Creating a zip archive for submission..."
|
marco@4
|
131 for root, dirs, files in os.walk(data):
|
marco@4
|
132 for name in files:
|
marco@8
|
133 if not name.startswith('.'): # Do not upload hidden files, OSX/linux
|
marco@8
|
134 myzip.write(os.path.join(root,name),
|
marco@9
|
135 os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
|
marco@4
|
136 fileslist.append(zipf)
|
marco@5
|
137 myzip.close()
|
marco@4
|
138 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
139 type = "SimpleZip"
|
marco@4
|
140 temp = True
|
marco@4
|
141 else: #create a list of files to upload
|
marco@4
|
142 for root, dirs, files in os.walk(data):
|
marco@4
|
143 for name in files:
|
marco@9
|
144 if not name.startswith('.'):
|
marco@9
|
145 fileslist.append(os.path.join(root,name))
|
marco@4
|
146 type = "multiple files"
|
marco@4
|
147 elif zipfile.is_zipfile(data): #This is a zip file
|
marco@4
|
148 fileslist.append(data)
|
marco@4
|
149 myzip = zipfile.ZipFile(data)
|
marco@4
|
150 if "mets.xml" in myzip.namelist(): #This is a METS package
|
marco@4
|
151 packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
|
marco@4
|
152 type = "METS"
|
marco@8
|
153 in_progress = False
|
marco@10
|
154 elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package
|
marco@10
|
155 packaging = "http://purl.org/net/sword/package/BagIt"
|
marco@10
|
156 type = "BAGIT"
|
marco@10
|
157 else:#THis is a simple zip file
|
marco@4
|
158 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
159 type = "SimpleZip"
|
marco@4
|
160 myzip.close()
|
marco@0
|
161 elif os.path.isfile(data): # This is a single file
|
marco@4
|
162 fileslist.append(data)
|
marco@4
|
163 type = "single file"
|
marco@0
|
164 else:
|
marco@0
|
165 print "Couldn't find the data."
|
marco@0
|
166 sys.exit()
|
marco@0
|
167
|
marco@0
|
168 print "------------------------"
|
marco@0
|
169 print "This is a ",type," submission"
|
marco@4
|
170
|
marco@4
|
171 # Create a metadata entry
|
marco@4
|
172 if (args.title != None) or (args.author != None) or (args.date != None):
|
marco@4
|
173 entry = Entry()
|
marco@4
|
174 if args.title != None:
|
marco@4
|
175 entry.add_fields(dcterms_title = args.title[0])
|
marco@4
|
176 if args.author != None:
|
marco@4
|
177 for creator in args.author:
|
marco@4
|
178 entry.add_fields(dcterms_creator=creator)
|
marco@4
|
179 if args.date != None:
|
marco@4
|
180 entry.add_fields(dcterms_created = args.date[0])
|
marco@4
|
181 else:
|
marco@4
|
182 entry = None
|
marco@4
|
183 # Select what to do
|
marco@8
|
184 if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
|
marco@4
|
185 try:
|
marco@4
|
186 # Create the metadata entry with ATOM
|
marco@4
|
187 print "------------------------"
|
marco@4
|
188 print "Creating the item..."
|
marco@4
|
189 if entry is None:
|
marco@4
|
190 entry = Entry(dcterms_title=(os.path.basename(data)))
|
marco@8
|
191 creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
|
marco@4
|
192
|
marco@4
|
193 # Add the files
|
marco@4
|
194 for f in fileslist:
|
marco@4
|
195 print "Uploading file ",os.path.basename(f)
|
marco@4
|
196 payload = open(f,"rb")
|
marco@4
|
197 deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
|
marco@4
|
198 payload = payload,
|
marco@4
|
199 filename = os.path.basename(f),
|
marco@4
|
200 mimetype = 'application/zip',
|
marco@4
|
201 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@4
|
202 payload.close()
|
marco@4
|
203 except HTTPResponseError:
|
marco@4
|
204 print "Bad request"
|
marco@4
|
205 else:
|
marco@8
|
206 # Send the zip file and let the ingester do its job
|
marco@4
|
207 payload = open(fileslist[0], "rb")
|
marco@10
|
208 if (type == "SimpleZip") or (type=="BAGIT"):
|
marco@8
|
209 in_progress = True
|
marco@9
|
210 # FIXME: we don't want to write silly things in dc.description!
|
marco@8
|
211 else:
|
marco@8
|
212 in_progress = False
|
marco@4
|
213 try:
|
marco@4
|
214 deposit_receipt = c.create(col_iri = collection.href,
|
marco@4
|
215 payload = payload,
|
marco@4
|
216 filename = fileslist[0],
|
marco@4
|
217 mimetype = "application/zip",
|
marco@8
|
218 packaging = packaging,
|
marco@8
|
219 in_progress = in_progress)
|
marco@4
|
220 print type, " submission successful."
|
marco@4
|
221 except:
|
marco@4
|
222 print "Error! Couldn't submit the file!"
|
marco@4
|
223 if type == "METS": # Just guessing: not sure this is the problem...
|
marco@4
|
224 print "To submit a METS package, the collection MUST have a workflow!"
|
marco@4
|
225 payload.close()
|
marco@4
|
226
|
marco@8
|
227 # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
|
marco@7
|
228 if type == "SimpleZip":
|
marco@7
|
229 if entry is None:
|
marco@7
|
230 entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
|
marco@4
|
231 try:
|
marco@8
|
232 update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
|
marco@7
|
233 print "Metadata update successfull."
|
marco@4
|
234 except:
|
marco@4
|
235 print "Server error"
|
marco@8
|
236 # If we want to store the zip file along with the individual files (Only SimpleZip)
|
marco@8
|
237 if storezip:
|
marco@8
|
238 try:
|
marco@8
|
239 payload = open(fileslist[0],"rb")
|
marco@8
|
240 zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
|
marco@8
|
241 payload = payload,
|
marco@8
|
242 filename = os.path.basename(fileslist[0]).replace(" ","_"),
|
marco@8
|
243 mimetype = 'application/zip',
|
marco@8
|
244 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@8
|
245 payload.close()
|
marco@8
|
246 print "Zip file successfully added to the bitstreams."
|
marco@8
|
247 except:
|
marco@8
|
248 print "Server error: could not add the zip file to the resources"
|
marco@4
|
249 if temp:
|
marco@4
|
250 os.remove(fileslist[0])
|
marco@0
|
251
|
marco@4
|
252 print "------------------------"
|
marco@4
|
253 print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."
|
marco@4
|
254
|
marco@0
|
255
|
marco@0
|
256 else: # Failed to connect to SWORDv2 Server
|
marco@0
|
257 print "Couldn't connect to the server."
|
marco@0
|
258 if attempts == 0:
|
marco@0
|
259 print "Invalid credentials entered 3 times."
|
marco@0
|
260
|
marco@0
|
261 except KeyboardInterrupt:
|
marco@0
|
262 print "------------------------"
|
marco@3
|
263 print "\nSubmission aborted by user." |