Chris@87
|
1 """A file interface for handling local and remote data files.
|
Chris@87
|
2
|
Chris@87
|
3 The goal of datasource is to abstract some of the file system operations
|
Chris@87
|
4 when dealing with data files so the researcher doesn't have to know all the
|
Chris@87
|
5 low-level details. Through datasource, a researcher can obtain and use a
|
Chris@87
|
6 file with one function call, regardless of location of the file.
|
Chris@87
|
7
|
Chris@87
|
8 DataSource is meant to augment standard python libraries, not replace them.
|
Chris@87
|
9 It should work seamlessly with standard file IO operations and the os
|
Chris@87
|
10 module.
|
Chris@87
|
11
|
Chris@87
|
12 DataSource files can originate locally or remotely:
|
Chris@87
|
13
|
Chris@87
|
14 - local files : '/home/guido/src/local/data.txt'
|
Chris@87
|
15 - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
|
Chris@87
|
16
|
Chris@87
|
17 DataSource files can also be compressed or uncompressed. Currently only
|
Chris@87
|
18 gzip and bz2 are supported.
|
Chris@87
|
19
|
Chris@87
|
20 Example::
|
Chris@87
|
21
|
Chris@87
|
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
|
Chris@87
|
23 >>> ds = datasource.DataSource()
|
Chris@87
|
24 >>>
|
Chris@87
|
25 >>> # Open a remote file.
|
Chris@87
|
26 >>> # DataSource downloads the file, stores it locally in:
|
Chris@87
|
27 >>> # './www.google.com/index.html'
|
Chris@87
|
28 >>> # opens the file and returns a file object.
|
Chris@87
|
29 >>> fp = ds.open('http://www.google.com/index.html')
|
Chris@87
|
30 >>>
|
Chris@87
|
31 >>> # Use the file as you normally would
|
Chris@87
|
32 >>> fp.read()
|
Chris@87
|
33 >>> fp.close()
|
Chris@87
|
34
|
Chris@87
|
35 """
|
Chris@87
|
36 from __future__ import division, absolute_import, print_function
|
Chris@87
|
37
|
Chris@87
|
38 import os
|
Chris@87
|
39 import sys
|
Chris@87
|
40 import shutil
|
Chris@87
|
41
|
Chris@87
|
# Keep a private handle on the builtin ``open``: this module defines its own
# ``open`` function further down, which rebinds the bare name at module level.
_open = open
|
Chris@87
|
43
|
Chris@87
|
44
|
Chris@87
|
45 # Using a class instead of a module-level dictionary
|
Chris@87
|
46 # to reduce the initial 'import numpy' overhead by
|
Chris@87
|
47 # deferring the import of bz2 and gzip until needed
|
Chris@87
|
48
|
Chris@87
|
49 # TODO: .zip support, .tar support?
|
Chris@87
|
class _FileOpeners(object):
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Item lookup (``__getitem__``) is implemented in
    such a way that an instance of `_FileOpeners` itself can be indexed with
    the keys of that dictionary. Currently uncompressed files as well as
    files compressed with ``gzip`` or ``bz2`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The ``bz2`` and ``gzip`` modules are imported lazily, on the first call
    to `keys` or the first item lookup, to keep module import cheap.

    Examples
    --------
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # Becomes True once the optional compression modules were probed.
        self._loaded = False
        # Use the saved builtin ``_open`` rather than the bare name ``open``:
        # this module rebinds ``open`` to its own convenience function, so an
        # instance created after import would otherwise pick up that wrapper.
        self._file_openers = {None: _open}

    def _load(self):
        """Register the bz2/gzip openers once; missing modules are skipped."""
        if self._loaded:
            return
        try:
            import bz2
            self._file_openers[".bz2"] = bz2.BZ2File
        except ImportError:
            # bz2 support is optional; leave the extension unregistered.
            pass
        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            # gzip support is optional; leave the extension unregistered.
            pass
        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for `key` (KeyError if unknown)."""
        self._load()
        return self._file_openers[key]
|
Chris@87
|
115
|
Chris@87
|
# Module-wide opener table; the DataSource methods below look up openers
# and supported extensions through this single instance.
_file_openers = _FileOpeners()
|
Chris@87
|
117
|
Chris@87
|
def open(path, mode='r', destpath=os.curdir):
    """
    Open `path` with `mode` and return the file object.

    Convenience wrapper: a throw-away `DataSource` rooted at `destpath` is
    created and its ``open`` method does the actual work. If ``path`` is an
    URL, it is downloaded, stored below `destpath` and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Equivalent to ``DataSource(destpath).open(path, mode)``.

    """
    return DataSource(destpath).open(path, mode)
|
Chris@87
|
152
|
Chris@87
|
153
|
Chris@87
|
class DataSource(object):
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None (or any other false value, such as an
        empty string), a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/index.html'
        >>> gfile = ds.open('http://www.google.com/index.html')  # remote file
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/tmpy4pgsP/home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  ``__del__`` also runs for instances whose
        # ``__init__`` never got as far as setting ``_istmpdest``, so the flag
        # is looked up defensively instead of raising AttributeError here.
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        Returns True when the extension is one of the keys of
        `_file_openers` (e.g. ``'.gz'`` or ``'.bz2'``).

        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are treated as write
        indicators; any other mode string yields False.

        """
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                ``zip_ext`` is None when `filename` has no supported
                compression extension.

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself; when `filename` has
        no supported compression extension, one entry per supported
        extension (``filename + ext``) is appended.

        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                # Skip the None key, which stands for "uncompressed".
                if zipext:
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.  URLs are downloaded, local files are copied.

        Raises
        ------
        URLError
            If `path` is an URL and fetching it fails.

        """
        # We import these here because importing urllib2 is slow and
        # a significant fraction of numpy's total import time.
        if sys.version_info[0] >= 3:
            from urllib.request import urlopen
            from urllib.error import URLError
        else:
            from urllib2 import urlopen
            from urllib2 import URLError

        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            try:
                openedurl = urlopen(path)
                # Nested try/finally so the URL handle is released even when
                # the local cache file cannot be opened for writing.
                try:
                    f = _open(upath, 'wb')
                    try:
                        shutil.copyfileobj(openedurl, f)
                    finally:
                        f.close()
                finally:
                    openedurl.close()
            except URLError:
                raise URLError("URL not found: %s" % path)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  Returns None when nothing
        matches.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                # A hit that is still an URL has not been downloaded yet.
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        # Both pieces are made relative so the join below cannot discard
        # self._destpath or climb out of it.
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # NOTE: str.lstrip removes a *set of characters*, so every
            # leading '.' is dropped here, not only whole '..' components.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        # We import this here because importing urllib2 is slow and
        # a significant fraction of numpy's total import time.
        if sys.version_info[0] >= 3:
            from urllib.request import urlopen
            from urllib.error import URLError
        else:
            from urllib2 import urlopen
            from urllib2 import URLError

        # Test local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                netfile = urlopen(path)
                netfile.close()
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r'):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        IOError
            If `path` cannot be found locally, in the cache or remotely.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if found:
            _fname, ext = self._splitzipext(found)
            # os.path.splitext keeps the leading dot, and str.replace returns
            # a new string, so compare against '.bz2' and rebind ``mode``.
            if ext == '.bz2':
                mode = mode.replace("+", "")
            return _file_openers[ext](found, mode=mode)
        else:
            raise IOError("%s not found." % path)
|
Chris@87
|
502
|
Chris@87
|
503
|
Chris@87
|
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one common base URL or
    directory.

    Every path handed to a `Repository` method is first completed with the
    base URL (or directory) the instance was created with, and then passed
    on to the corresponding `DataSource` method.  Use `Repository` when
    working with several files from the same location: initialize it once
    with the base URL and afterwards refer to each file by filename only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Set up the underlying DataSource and remember `baseurl`."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Cleanup is entirely the base class' business.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return `path` completed with baseurl, unless it already has it."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl occurs in path already; hand it back untouched
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Run the DataSource search on the baseurl-completed ``path``."""
        full = self._fullpath(path)
        return DataSource._findfile(self, full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        For an URL the result is the location where the file is stored
        locally, or where it would be stored once opened with the `open`
        method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return DataSource.abspath(self, full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        The completed path is checked as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For an URL, True is returned both when a copy is stored locally in
        the `DataSource` directory and when the remote URL is valid; the
        two cases are not told apart.

        """
        full = self._fullpath(path)
        return DataSource.exists(self, full)

    def open(self, path, mode='r'):
        """
        Open and return file-like object prepending Repository base URL.

        If the completed path is an URL, it will be downloaded, stored in
        the DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return DataSource.open(self, full, mode)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base is an URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")
|