marco@0
|
1 #!/usr/bin/env python
|
marco@1
|
2
|
marco@12
|
3 """
|
marco@12
|
4
|
marco@12
|
5 SWORD2 DSpace bulk uploader - v0.4
|
marco@1
|
6
|
marco@1
|
7 A python script to submit large numbers of files to a SWORD2-compatible repository, specifically DSpace 1.8x.
|
marco@12
|
8 Built on the SWORD2 python client library: https://bitbucket.org/beno/python-sword2/overview with modifications.
|
marco@1
|
9
|
marco@1
|
10 Dependencies:
|
marco@1
|
11
|
marco@1
|
12 - python 2.X
|
marco@1
|
13
|
marco@12
|
14 - sword2 library, with modifications: https://bitbucket.org/marcofabiani/python-sword2/src
|
marco@1
|
15
|
marco@1
|
16 -----------------------------------
|
marco@11
|
17 Centre for Digital Music, Queen Mary, University of London
|
marco@11
|
18 Copyright (c) 2012 Marco Fabiani
|
marco@11
|
19
|
marco@11
|
20 Permission is hereby granted, free of charge, to any person
|
marco@11
|
21 obtaining a copy of this software and associated documentation
|
marco@11
|
22 files (the "Software"), to deal in the Software without
|
marco@11
|
23 restriction, including without limitation the rights to use, copy,
|
marco@11
|
24 modify, merge, publish, distribute, sublicense, and/or sell copies
|
marco@11
|
25 of the Software, and to permit persons to whom the Software is
|
marco@11
|
26 furnished to do so, subject to the following conditions:
|
marco@11
|
27
|
marco@11
|
28 The above copyright notice and this permission notice shall be
|
marco@11
|
29 included in all copies or substantial portions of the Software.
|
marco@11
|
30
|
marco@11
|
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
marco@11
|
32 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
marco@11
|
33 OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
marco@11
|
34 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
marco@11
|
35 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
marco@11
|
36 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
marco@11
|
37 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
marco@11
|
38 OTHER DEALINGS IN THE SOFTWARE.
|
marco@1
|
39 -----------------------------------
|
marco@12
|
40
|
marco@11
|
41 A copy of this License can also be found in the COPYING file distributed with the source code.
|
marco@1
|
42 """
|
marco@0
|
43
|
marco@4
|
44 import argparse, getpass, zipfile, os, sys
|
marco@0
|
45 from sword2 import *
|
marco@0
|
46
|
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------

# Service document used when --servicedoc is not given on the command line.
DEFAULT_SERVICE_DOCUMENT = (
    "http://c4dm.eecs.qmul.ac.uk/smdmrd-test/swordv2/servicedocument")

parser = argparse.ArgumentParser(
    description="Bulk upload to DSpace using SWORDv2.",
    epilog=("If the submission is created successfully, it will remain open "
            "to be completed with the necessary metadata and licenses, using "
            "the DSpace web interface. The submission can be found in the "
            "\"My Account -> Submissions\" section of the user's area."))

# NOTE: every value-taking option uses nargs=1 (or "+"), so argparse stores
# a *list* for each of them, or None when the option is absent.  The rest of
# the script relies on that and reads e.g. args.title[0].
parser.add_argument(
    "data", type=str, nargs=1,
    help=("Accepts: METSDSpaceSIP and BagIt packages, simple zip files, "
          "directories, single files. NOTE: METSDSpaceSIP packages are only "
          "accepted by Collections with a workflow!"))
parser.add_argument(
    "--username", dest="user_name", type=str, nargs=1,
    help="DSpace username.")
parser.add_argument(
    "--zip", action="store_true", dest="zip", default=False,
    help=("If \"data\" is a directory, compress it and post it as a single "
          "file. The zip file will be saved along with the individual "
          "files."))
parser.add_argument(
    "--title", dest="title", type=str, nargs=1,
    help="Title (ignored for METS packages).")
parser.add_argument(
    "--author", dest="author", type=str, nargs="+",
    help=("Author(s) (ignored for METS packages). Accepts multiple entries "
          "in the format \"Surname, Name\""))
parser.add_argument(
    "--date", dest="date", type=str, nargs=1,
    help="Date of creation (string) (ignored for METS packages).")
parser.add_argument(
    "--servicedoc", dest="dspaceurl", type=str, nargs=1,
    help=("Url of the SWORDv2 service document (default: "
          + DEFAULT_SERVICE_DOCUMENT + ")"))

args = parser.parse_args()

# Path of the file, archive or directory to submit.
data = args.data[0]

# Read later in the script when a SimpleZip package is deposited: if True,
# the zip archive itself is also added to the item's files.
storezip = True

# URL of the SWORDv2 service document to connect to.
if args.dspaceurl is None:
    dspaceurl = DEFAULT_SERVICE_DOCUMENT
else:
    dspaceurl = args.dspaceurl[0]
marco@0
|
66
|
marco@0
|
67
|
marco@0
|
68 try:
|
marco@0
|
69 # Connect to SWORD server
|
marco@0
|
70 attempts = 3 # Number of attempts left to connect to server
|
marco@0
|
71 connected = False
|
marco@0
|
72 while attempts>0 and not connected:
|
marco@0
|
73 print "Connecting to SWORD server. Remaining attempts: ", attempts
|
marco@0
|
74 # Try to login, get service document
|
marco@0
|
75 # Get username and password
|
marco@0
|
76 if args.user_name == None:
|
marco@0
|
77 user_name = raw_input("DSpace Username: ")
|
marco@0
|
78 else:
|
marco@0
|
79 user_name = args.user_name[0]
|
marco@0
|
80 print "DSpace Username: ",user_name
|
marco@0
|
81 user_pass = getpass.getpass("DSpace password:")
|
marco@0
|
82 # Connect to the server
|
marco@0
|
83 c = Connection(dspaceurl, user_name=user_name, user_pass=user_pass,keep_history=False)
|
marco@3
|
84
|
marco@0
|
85 # Get service document
|
marco@0
|
86 try:
|
marco@0
|
87 c.get_service_document()
|
marco@3
|
88 except: # Server error
|
marco@3
|
89 print "Server unreachable!"
|
marco@3
|
90 break
|
marco@3
|
91 if c.sd != None:
|
marco@3
|
92 connected = True
|
marco@3
|
93 else:
|
marco@0
|
94 attempts-=1
|
marco@0
|
95 print "Incorrect username and/or password"
|
marco@3
|
96
|
marco@0
|
97
|
marco@0
|
98 if connected:
|
marco@0
|
99 # List available collections
|
marco@0
|
100 print "------------------------"
|
marco@0
|
101 print "Welcome to the ",c.workspaces[0][0], "repository"
|
marco@0
|
102 print "Available Collections: "
|
marco@0
|
103 numColl = len(c.workspaces[0][1])
|
marco@0
|
104 for ctr in range(numColl):
|
marco@0
|
105 coll = c.workspaces[0][1][ctr]
|
marco@0
|
106 print ctr+1,":",coll.title
|
marco@0
|
107 # Select a collection to deposit into
|
marco@0
|
108 sel = -1
|
marco@0
|
109 while (sel<=0 or sel>numColl):
|
marco@0
|
110 sel = input("Select a Collection to submit your files into: ")
|
marco@0
|
111 collection = c.workspaces[0][1][sel-1]
|
marco@0
|
112 print "Selected Collection: ",collection.title
|
marco@0
|
113
|
marco@4
|
114 # Create a submission
|
marco@4
|
115 fileslist = []
|
marco@4
|
116 temp = False # Delete temp files
|
marco@4
|
117 # If folder
|
marco@4
|
118 if os.path.isdir(data):
|
marco@4
|
119 if args.zip: # If zip option, zip all the files and maintain the structure, but start from the base only...
|
marco@4
|
120 dataname = os.path.basename(os.path.normpath(data))
|
marco@9
|
121 if args.title != None:
|
marco@9
|
122 zipf = args.title[0].replace(" ","_")+".zip"
|
marco@9
|
123 else:
|
marco@9
|
124 zipf = dataname.replace(" ","_")+".zip"
|
marco@4
|
125 myzip = zipfile.ZipFile(zipf, "w")
|
marco@4
|
126 # get the directory structure
|
marco@4
|
127 print "Creating a zip archive for submission..."
|
marco@4
|
128 for root, dirs, files in os.walk(data):
|
marco@4
|
129 for name in files:
|
marco@8
|
130 if not name.startswith('.'): # Do not upload hidden files, OSX/linux
|
marco@8
|
131 myzip.write(os.path.join(root,name),
|
marco@9
|
132 os.path.relpath(os.path.join(root,name),data).replace(" ","_").replace("[","(").replace("]",")")) # Remove spaces and square brakets
|
marco@4
|
133 fileslist.append(zipf)
|
marco@5
|
134 myzip.close()
|
marco@4
|
135 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
136 type = "SimpleZip"
|
marco@4
|
137 temp = True
|
marco@4
|
138 else: #create a list of files to upload
|
marco@4
|
139 for root, dirs, files in os.walk(data):
|
marco@4
|
140 for name in files:
|
marco@9
|
141 if not name.startswith('.'):
|
marco@9
|
142 fileslist.append(os.path.join(root,name))
|
marco@4
|
143 type = "multiple files"
|
marco@4
|
144 elif zipfile.is_zipfile(data): #This is a zip file
|
marco@4
|
145 fileslist.append(data)
|
marco@4
|
146 myzip = zipfile.ZipFile(data)
|
marco@4
|
147 if "mets.xml" in myzip.namelist(): #This is a METS package
|
marco@4
|
148 packaging = "http://purl.org/net/sword/package/METSDSpaceSIP"
|
marco@4
|
149 type = "METS"
|
marco@8
|
150 in_progress = False
|
marco@10
|
151 elif "bagit.txt" in "".join(myzip.namelist()): #This is a BagIt package
|
marco@10
|
152 packaging = "http://purl.org/net/sword/package/BagIt"
|
marco@10
|
153 type = "BAGIT"
|
marco@10
|
154 else:#THis is a simple zip file
|
marco@4
|
155 packaging = "http://purl.org/net/sword/package/SimpleZip"
|
marco@4
|
156 type = "SimpleZip"
|
marco@4
|
157 myzip.close()
|
marco@0
|
158 elif os.path.isfile(data): # This is a single file
|
marco@4
|
159 fileslist.append(data)
|
marco@4
|
160 type = "single file"
|
marco@0
|
161 else:
|
marco@0
|
162 print "Couldn't find the data."
|
marco@0
|
163 sys.exit()
|
marco@0
|
164
|
marco@0
|
165 print "------------------------"
|
marco@0
|
166 print "This is a ",type," submission"
|
marco@4
|
167
|
marco@4
|
168 # Create a metadata entry
|
marco@4
|
169 if (args.title != None) or (args.author != None) or (args.date != None):
|
marco@4
|
170 entry = Entry()
|
marco@4
|
171 if args.title != None:
|
marco@4
|
172 entry.add_fields(dcterms_title = args.title[0])
|
marco@4
|
173 if args.author != None:
|
marco@4
|
174 for creator in args.author:
|
marco@4
|
175 entry.add_fields(dcterms_creator=creator)
|
marco@4
|
176 if args.date != None:
|
marco@4
|
177 entry.add_fields(dcterms_created = args.date[0])
|
marco@4
|
178 else:
|
marco@4
|
179 entry = None
|
marco@4
|
180 # Select what to do
|
marco@8
|
181 if (type is "single file") or (type is "multiple files"): # Use the single file upload procedure
|
marco@4
|
182 try:
|
marco@4
|
183 # Create the metadata entry with ATOM
|
marco@4
|
184 print "------------------------"
|
marco@4
|
185 print "Creating the item..."
|
marco@4
|
186 if entry is None:
|
marco@4
|
187 entry = Entry(dcterms_title=(os.path.basename(data)))
|
marco@8
|
188 creation_receipt = c.create(col_iri = collection.href, metadata_entry = entry, in_progress=True)
|
marco@4
|
189
|
marco@4
|
190 # Add the files
|
marco@4
|
191 for f in fileslist:
|
marco@4
|
192 print "Uploading file ",os.path.basename(f)
|
marco@4
|
193 payload = open(f,"rb")
|
marco@4
|
194 deposit_receipt = c.add_file_to_resource(edit_media_iri = creation_receipt.edit_media,
|
marco@4
|
195 payload = payload,
|
marco@4
|
196 filename = os.path.basename(f),
|
marco@4
|
197 mimetype = 'application/zip',
|
marco@4
|
198 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@4
|
199 payload.close()
|
marco@4
|
200 except HTTPResponseError:
|
marco@4
|
201 print "Bad request"
|
marco@4
|
202 else:
|
marco@8
|
203 # Send the zip file and let the ingester do its job
|
marco@4
|
204 payload = open(fileslist[0], "rb")
|
marco@10
|
205 if (type == "SimpleZip") or (type=="BAGIT"):
|
marco@8
|
206 in_progress = True
|
marco@9
|
207 # FIXME: we don't want to write silly things in dc.description!
|
marco@8
|
208 else:
|
marco@8
|
209 in_progress = False
|
marco@4
|
210 try:
|
marco@4
|
211 deposit_receipt = c.create(col_iri = collection.href,
|
marco@4
|
212 payload = payload,
|
marco@4
|
213 filename = fileslist[0],
|
marco@4
|
214 mimetype = "application/zip",
|
marco@8
|
215 packaging = packaging,
|
marco@8
|
216 in_progress = in_progress)
|
marco@4
|
217 print type, " submission successful."
|
marco@4
|
218 except:
|
marco@4
|
219 print "Error! Couldn't submit the file!"
|
marco@4
|
220 if type == "METS": # Just guessing: not sure this is the problem...
|
marco@4
|
221 print "To submit a METS package, the collection MUST have a workflow!"
|
marco@4
|
222 payload.close()
|
marco@4
|
223
|
marco@8
|
224 # If some of the additional arguments for author, title, date etc. have been specified, update the metadata (only SimpleZip)
|
marco@7
|
225 if type == "SimpleZip":
|
marco@7
|
226 if entry is None:
|
marco@7
|
227 entry = Entry(dcterms_title=(os.path.basename(fileslist[0])))
|
marco@4
|
228 try:
|
marco@8
|
229 update_receipt = c.update(dr = deposit_receipt , metadata_entry = entry, in_progress = True) # in_progress is True: we don't want to close the submission
|
marco@7
|
230 print "Metadata update successfull."
|
marco@4
|
231 except:
|
marco@4
|
232 print "Server error"
|
marco@8
|
233 # If we want to store the zip file along with the individual files (Only SimpleZip)
|
marco@8
|
234 if storezip:
|
marco@8
|
235 try:
|
marco@8
|
236 payload = open(fileslist[0],"rb")
|
marco@8
|
237 zipdeposit_receipt = c.add_file_to_resource(edit_media_iri = deposit_receipt.edit_media,
|
marco@8
|
238 payload = payload,
|
marco@8
|
239 filename = os.path.basename(fileslist[0]).replace(" ","_"),
|
marco@8
|
240 mimetype = 'application/zip',
|
marco@8
|
241 packaging = 'http://purl.org/net/sword/package/Binary')
|
marco@8
|
242 payload.close()
|
marco@8
|
243 print "Zip file successfully added to the bitstreams."
|
marco@8
|
244 except:
|
marco@8
|
245 print "Server error: could not add the zip file to the resources"
|
marco@4
|
246 if temp:
|
marco@4
|
247 os.remove(fileslist[0])
|
marco@0
|
248
|
marco@4
|
249 print "------------------------"
|
marco@4
|
250 print "You will find the submission in the \"Submissions\" list in your DSpace account. To complete/edit it with metadata and licenses, click on the title and then on \"Resume\"."
|
marco@4
|
251
|
marco@0
|
252
|
marco@0
|
253 else: # Failed to connect to SWORDv2 Server
|
marco@0
|
254 print "Couldn't connect to the server."
|
marco@0
|
255 if attempts == 0:
|
marco@0
|
256 print "Invalid credentials entered 3 times."
|
marco@0
|
257
|
marco@0
|
258 except KeyboardInterrupt:
|
marco@0
|
259 print "------------------------"
|
marco@3
|
260 print "\nSubmission aborted by user." |