Mercurial > hg > vamp-build-and-test
comparison DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/_datasource.py @ 87:2a2c65a20a8b
Add Python libs and headers
author | Chris Cannam |
---|---|
date | Wed, 25 Feb 2015 14:05:22 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
86:413a9d26189e | 87:2a2c65a20a8b |
---|---|
1 """A file interface for handling local and remote data files. | |
2 | |
3 The goal of datasource is to abstract some of the file system operations | |
4 when dealing with data files so the researcher doesn't have to know all the | |
5 low-level details. Through datasource, a researcher can obtain and use a | |
6 file with one function call, regardless of location of the file. | |
7 | |
8 DataSource is meant to augment standard python libraries, not replace them. | |
9 It should work seamlessly with standard file IO operations and the os | |
10 module. | |
11 | |
12 DataSource files can originate locally or remotely: | |
13 | |
14 - local files : '/home/guido/src/local/data.txt' | |
15 - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' | |
16 | |
17 DataSource files can also be compressed or uncompressed. Currently only | |
18 gzip and bz2 are supported. | |
19 | |
20 Example:: | |
21 | |
22 >>> # Create a DataSource, use os.curdir (default) for local storage. | |
23 >>> ds = datasource.DataSource() | |
24 >>> | |
25 >>> # Open a remote file. | |
26 >>> # DataSource downloads the file, stores it locally in: | |
27 >>> # './www.google.com/index.html' | |
28 >>> # opens the file and returns a file object. | |
29 >>> fp = ds.open('http://www.google.com/index.html') | |
30 >>> | |
31 >>> # Use the file as you normally would | |
32 >>> fp.read() | |
33 >>> fp.close() | |
34 | |
35 """ | |
36 from __future__ import division, absolute_import, print_function | |
37 | |
38 import os | |
39 import sys | |
40 import shutil | |
41 | |
# Keep a reference to the builtin ``open``.  This module defines its own
# ``open`` function further down, which shadows the builtin name for any
# code that runs after the module has finished importing.
_open = open


# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of bz2 and gzip until needed

# TODO: .zip support, .tar support?
class _FileOpeners(object):
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format. Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary. Currently uncompressed files as well as files
    compressed with ``gzip`` or ``bz2`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    The ``bz2`` and ``gzip`` modules are imported lazily, on the first call
    to `keys` or the first indexing operation.  A compression module that
    cannot be imported is silently left out of the table.

    Examples
    --------
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        self._loaded = False
        # Use the saved builtin rather than the bare name ``open``: once this
        # module is fully imported, ``open`` refers to the module-level
        # DataSource convenience function, not to the builtin.
        self._file_openers = {None: _open}

    def _load(self):
        """Register the bz2 and gzip openers (first call only)."""
        if self._loaded:
            return
        try:
            import bz2
            self._file_openers[".bz2"] = bz2.BZ2File
        except ImportError:
            # bz2 support is optional; leave '.bz2' unregistered.
            pass
        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            # gzip support is optional; leave '.gz' unregistered.
            pass
        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        """Return the opener registered for `key`; raise KeyError if none."""
        self._load()
        return self._file_openers[key]

_file_openers = _FileOpeners()
117 | |
def open(path, mode='r', destpath=os.curdir):
    """
    Open `path` with `mode` and return the resulting file object.

    When ``path`` is an URL, the file is first downloaded into the
    `DataSource` `destpath` directory and then opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
        append. Available modes depend on the type of object specified by
        path. Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use. If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    Convenience wrapper: a throw-away `DataSource` is created for
    `destpath` and the result of its ``open(path, mode)`` is returned.

    """
    return DataSource(destpath).open(path, mode)
152 | |
153 | |
class DataSource (object):
    """
    DataSource(destpath='.')

    A generic data source file (file, http, ftp, ...).

    DataSources can be local files or remote files/URLs.  The files may
    also be compressed or uncompressed. DataSource hides some of the
    low-level details of downloading the file, allowing you to simply pass
    in a valid file path (or URL) and obtain a file object.

    Parameters
    ----------
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Notes
    -----
    URLs require a scheme string (``http://``) to be used, without it they
    will fail::

        >>> repos = DataSource()
        >>> repos.exists('www.google.com/index.html')
        False
        >>> repos.exists('http://www.google.com/index.html')
        True

    Temporary directories are deleted when the DataSource is deleted.

    Examples
    --------
    ::

        >>> ds = DataSource('/home/guido')
        >>> urlname = 'http://www.google.com/index.html'
        >>> gfile = ds.open('http://www.google.com/index.html')  # remote file
        >>> ds.abspath(urlname)
        '/home/guido/www.google.com/index.html'

        >>> ds = DataSource(None)  # use with temporary file
        >>> ds.open('/home/guido/foobar.txt')
        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
        >>> ds.abspath('/home/guido/foobar.txt')
        '/tmp/tmpy4pgsP/home/guido/foobar.txt'

    """

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath.

        Any falsy `destpath` (None, but also an empty string) selects a
        freshly created temporary directory.
        """
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        """Remove the temporary destination directory, if one was created."""
        # __del__ also runs for instances whose __init__ never completed
        # (e.g. tempfile.mkdtemp() raised), in which case the attribute is
        # missing; treat that the same as "nothing to clean up".
        if getattr(self, '_istmpdest', False):
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename is a zip file by looking at the file extension.

        Returns True when the extension is one of the registered compressed
        formats in `_file_openers`.
        """
        ext = os.path.splitext(filename)[1]
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing.

        Only the characters ``'w'`` and ``'+'`` are recognised; append mode
        (``'a'``) is not treated as a write mode here.
        """

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        return any(c in _writemodes for c in mode)

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        *Returns*:
            base, zip_ext : {tuple}
                `zip_ext` includes the leading dot (e.g. ``'.gz'``) and is
                None when `filename` has no recognised compression
                extension.

        """

        if self._iszip(filename):
            return os.path.splitext(filename)
        return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations.

        The list starts with `filename` itself.  Unless `filename` already
        carries a recognised compression extension, one entry per
        registered extension is appended.
        """
        names = [filename]
        if not self._iszip(filename):
            # The None key stands for "uncompressed" and adds no suffix.
            names.extend(filename + zipext
                         for zipext in _file_openers.keys() if zipext)
        return names

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test for
        #       for compressed versions of files.

        scheme, netloc = urlparse(path)[:2]
        return bool(scheme and netloc)

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache and returns the
        path of that copy.  URLs are downloaded; local files are copied.

        Raises
        ------
        URLError
            If downloading `path` fails with a URLError.

        """
        # We import these here because importing urllib2 is slow and
        # a significant fraction of numpy's total import time.
        if sys.version_info[0] >= 3:
            from urllib.request import urlopen
            from urllib.error import URLError
        else:
            from urllib2 import urlopen
            from urllib2 import URLError

        upath = self.abspath(path)

        # ensure directory exists
        destdir = os.path.dirname(upath)
        if not os.path.exists(destdir):
            os.makedirs(destdir)

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            try:
                openedurl = urlopen(path)
                # Nested try/finally so the URL handle is closed even when
                # the destination file cannot be opened.
                try:
                    f = _open(upath, 'wb')
                    try:
                        shutil.copyfileobj(openedurl, f)
                    finally:
                        f.close()
                finally:
                    openedurl.close()
            except URLError:
                raise URLError("URL not found: %s" % path)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.  None is returned when no
        candidate exists.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        if sys.version_info[0] >= 3:
            from urllib.parse import urlparse
        else:
            from urlparse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths. Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        # Strip leading separators, parent references and drive letters so
        # the joined result cannot escape self._destpath.
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        # Repeat until a full pass leaves the path unchanged.
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            # NOTE: str.lstrip strips *characters*, so this removes every
            # leading '.', not only a leading '..' component.
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        # We import this here because importing urllib2 is slow and
        # a significant fraction of numpy's total import time.
        if sys.version_info[0] >= 3:
            from urllib.request import urlopen
            from urllib.error import URLError
        else:
            from urllib2 import urlopen
            from urllib2 import URLError

        # Test local path
        if os.path.exists(path):
            return True

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                netfile = urlopen(path)
                netfile.close()
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r'):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        Raises
        ------
        ValueError
            If `path` is an URL and `mode` is a write mode.
        IOError
            If `path` (or a compressed variant of it) cannot be found.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if not found:
            raise IOError("%s not found." % path)

        _fname, ext = self._splitzipext(found)
        # _splitzipext returns the extension with its leading dot, and
        # str.replace returns a new string, so the result must be rebound.
        # The "+" is dropped from the mode before it is passed to the bz2
        # opener.
        if ext == '.bz2':
            mode = mode.replace("+", "")
        return _file_openers[ext](found, mode=mode)
502 | |
503 | |
class Repository (DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSource's share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles. Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        """Delegate cleanup to `DataSource.__del__`."""
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        pieces = path.split(self._baseurl, 2)
        if len(pieces) != 1:
            # baseurl already occurs inside path; use it unchanged
            return path
        return os.path.join(self._baseurl, path)

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        full = self._fullpath(path)
        return DataSource._findfile(self, full)

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        full = self._fullpath(path)
        return DataSource.abspath(self, full)

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL. This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        full = self._fullpath(path)
        return DataSource.exists(self, full)

    def open(self, path, mode='r'):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open. This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append. Available modes depend on the type of object
            specified by `path`. Default is 'r'.

        Returns
        -------
        out : file object
            File object.

        """
        full = self._fullpath(path)
        return DataSource.open(self, full, mode)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if not self._isurl(self._baseurl):
            return os.listdir(self._baseurl)
        raise NotImplementedError(
            "Directory listing of URLs, not supported yet.")