comparison DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/_datasource.py @ 87:2a2c65a20a8b

Add Python libs and headers
author Chris Cannam
date Wed, 25 Feb 2015 14:05:22 +0000
parents
children
comparison
equal deleted inserted replaced
86:413a9d26189e 87:2a2c65a20a8b
1 """A file interface for handling local and remote data files.
2
3 The goal of datasource is to abstract some of the file system operations
4 when dealing with data files so the researcher doesn't have to know all the
5 low-level details. Through datasource, a researcher can obtain and use a
6 file with one function call, regardless of location of the file.
7
8 DataSource is meant to augment standard python libraries, not replace them.
9 It should work seamlessly with standard file IO operations and the os
10 module.
11
12 DataSource files can originate locally or remotely:
13
14 - local files : '/home/guido/src/local/data.txt'
15 - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
16
17 DataSource files can also be compressed or uncompressed. Currently only
18 gzip and bz2 are supported.
19
20 Example::
21
22 >>> # Create a DataSource, use os.curdir (default) for local storage.
23 >>> ds = datasource.DataSource()
24 >>>
25 >>> # Open a remote file.
26 >>> # DataSource downloads the file, stores it locally in:
27 >>> # './www.google.com/index.html'
28 >>> # opens the file and returns a file object.
29 >>> fp = ds.open('http://www.google.com/index.html')
30 >>>
31 >>> # Use the file as you normally would
32 >>> fp.read()
33 >>> fp.close()
34
35 """
36 from __future__ import division, absolute_import, print_function
37
38 import os
39 import sys
40 import shutil
41
# Keep a reference to the builtin ``open``.  This module defines its own
# module-level ``open`` function further down, which shadows the builtin
# for any code that looks the name up after that definition has run.
_open = open
43
44
45 # Using a class instead of a module-level dictionary
46 # to reduce the initial 'import numpy' overhead by
47 # deferring the import of bz2 and gzip until needed
48
49 # TODO: .zip support, .tar support?
class _FileOpeners(object):
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip`` or ``bz2`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The ``bz2`` and ``gzip`` modules are imported lazily, on the first call
    to `keys` or the first item lookup, to keep ``import numpy`` cheap.

    Examples
    --------
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        # False until the compression modules have been probed by _load().
        self._loaded = False
        # Use the saved builtin (``_open``) instead of the bare name
        # ``open``: this module rebinds ``open`` to its own convenience
        # function, so an instance created after import would otherwise
        # register that function as the opener for uncompressed files.
        self._file_openers = {None: _open}

    def _load(self):
        """Register the bz2 and gzip openers, if those modules exist.

        The work is done at most once per instance; a module that cannot
        be imported is simply left out of the table.
        """
        if self._loaded:
            return
        try:
            import bz2
            self._file_openers[".bz2"] = bz2.BZ2File
        except ImportError:
            pass
        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass
        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for `key`.

        Raises
        ------
        KeyError
            If no opener is registered for `key`.
        """
        self._load()
        return self._file_openers[key]
115
# Module-level opener table.  DataSource indexes it by file extension
# (None for uncompressed files) to pick the callable that opens a file.
_file_openers = _FileOpeners()
117
def open(path, mode='r', destpath=os.curdir):
    """
    Open `path` with `mode` and return the resulting file object.

    A throw-away `DataSource` rooted at `destpath` is created and asked to
    open `path`. When `path` is a URL the data is first downloaded into
    that directory and then opened from the local copy.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode used to open `path`: 'r' for reading, 'w' for writing, 'a' to
        append. Which modes are available depends on the kind of object
        `path` refers to. Default is 'r'.
    destpath : str, optional
        Directory the source file is downloaded into for use. If
        `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Convenience wrapper equivalent to ``DataSource(destpath).open(path,
    mode)``.

    """
    return DataSource(destpath).open(path, mode)
152
153
class DataSource(object):
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/index.html'
        >>> gfile = ds.open('http://www.google.com/index.html')  # remote file
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/tmpy4pgsP/home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories.  ``getattr`` guards against instances
        # whose ``__init__`` never got as far as setting ``_istmpdest``
        # (e.g. it raised); the finalizer still runs for those.
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        Returns True when the extension of `filename` is one of the keys of
        the module-level opener table.
        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are treated as write
        markers.
        """

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                ``zip_ext`` is None when `filename` has no recognised
                compression extension; `filename` is then returned whole.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list always starts with `filename` itself.  If `filename` does
        not already carry a recognised compression extension, one entry per
        supported extension is appended.
        """
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                if zipext:  # skip the None key used for plain files
                    names.append(filename+zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.

        Raises
        ------
        URLError
            If `path` is a URL that cannot be opened.

        """
        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            # We import these here because importing urllib2 is slow and
            # a significant fraction of numpy's total import time.  They
            # are only needed for the remote case.
            if sys.version_info[0] >= 3:
                from urllib.request import urlopen
                from urllib.error import URLError
            else:
                from urllib2 import urlopen
                from urllib2 import URLError

            try:
                openedurl = urlopen(path)
                # Nested try/finally so the URL handle is released even if
                # the destination file cannot be opened.
                try:
                    f = _open(upath, 'wb')
                    try:
                        shutil.copyfileobj(openedurl, f)
                    finally:
                        f.close()
                finally:
                    openedurl.close()
            except URLError:
                raise URLError("URL not found: %s" % path)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        # Test local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if not self._isurl(path):
            return False

        # We import this here because importing urllib2 is slow and
        # a significant fraction of numpy's total import time.  Deferring
        # it to this point keeps purely local lookups free of that cost.
        if sys.version_info[0] >= 3:
            from urllib.request import urlopen
            from urllib.error import URLError
        else:
            from urllib2 import urlopen
            from urllib2 import URLError

        try:
            netfile = urlopen(path)
            netfile.close()
            return True
        except URLError:
            return False

    def open(self, path, mode='r'):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is a URL and `mode` is a write mode.
        IOError
            If `path` cannot be found locally, in the cache or remotely.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise IOError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        if ext == '.bz2':
            # BZ2File does not accept update ('+') modes.  The extension
            # returned by splitext includes the leading dot, and str.replace
            # returns a new string, so the result has to be rebound.
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode)
502
503
class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A `DataSource` whose files all live under one base URL or directory.

    Every path handed to a `Repository` has the base URL (or directory)
    prepended unless the path already contains it, so individual files can
    be referred to by file name only. Use it when working with several
    files from the same location.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Set up the underlying DataSource and remember `baseurl`."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        # Clean-up is entirely the base class's business.
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return `path` with the base URL in front, unless it is already there."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl occurs somewhere in path: use path unchanged
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Run the DataSource search on the base-URL-qualified ``path``."""
        qualified = self._fullpath(path)
        return DataSource._findfile(self, qualified)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        For a URL this is either the location where the file already exists
        locally or the location where it would be stored once opened with
        the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        qualified = self._fullpath(path)
        return DataSource.abspath(self, qualified)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        The qualified path is checked, in this order, as:

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        For a URL, True means the file is either stored locally in the
        `DataSource` directory or reachable remotely; the two cases are
        not distinguished.

        """
        qualified = self._fullpath(path)
        return DataSource.exists(self, qualified)

    def open(self, path, mode='r'):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        """
        qualified = self._fullpath(path)
        return DataSource.open(self, qualified, mode)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Raises
        ------
        NotImplementedError
            If the repository's base is a URL.

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")