diff DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/_datasource.py @ 87:2a2c65a20a8b

Add Python libs and headers
author Chris Cannam
date Wed, 25 Feb 2015 14:05:22 +0000
parents
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/_datasource.py	Wed Feb 25 14:05:22 2015 +0000
@@ -0,0 +1,666 @@
+"""A file interface for handling local and remote data files.
+
+The goal of datasource is to abstract some of the file system operations
+when dealing with data files so the researcher doesn't have to know all the
+low-level details.  Through datasource, a researcher can obtain and use a
+file with one function call, regardless of location of the file.
+
+DataSource is meant to augment standard python libraries, not replace them.
+It should work seamlessly with standard file IO operations and the os
+module.
+
+DataSource files can originate locally or remotely:
+
+- local files : '/home/guido/src/local/data.txt'
+- URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt'
+
+DataSource files can also be compressed or uncompressed.  Currently only
+gzip and bz2 are supported.
+
+Example::
+
+    >>> # Create a DataSource, use os.curdir (default) for local storage.
+    >>> ds = datasource.DataSource()
+    >>>
+    >>> # Open a remote file.
+    >>> # DataSource downloads the file, stores it locally in:
+    >>> #     './www.google.com/index.html'
+    >>> # opens the file and returns a file object.
+    >>> fp = ds.open('http://www.google.com/index.html')
+    >>>
+    >>> # Use the file as you normally would
+    >>> fp.read()
+    >>> fp.close()
+
+"""
+from __future__ import division, absolute_import, print_function
+
+import os
+import sys
+import shutil
+
+_open = open
+
+
+# Using a class instead of a module-level dictionary
+# to reduce the initial 'import numpy' overhead by
+# deferring the import of bz2 and gzip until needed
+
+# TODO: .zip support, .tar support?
+class _FileOpeners(object):
+    """
+    Container for different methods to open (un-)compressed files.
+
+    `_FileOpeners` contains a dictionary that holds one method for each
+    supported file format. Attribute lookup is implemented in such a way
+    that an instance of `_FileOpeners` itself can be indexed with the keys
+    of that dictionary. Currently uncompressed files as well as files
+    compressed with ``gzip`` or ``bz2`` compression are supported.
+
+    Notes
+    -----
+    `_file_openers`, an instance of `_FileOpeners`, is made available for
+    use in the `_datasource` module.
+
+    Examples
+    --------
+    >>> np.lib._datasource._file_openers.keys()
+    [None, '.bz2', '.gz']
+    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
+    True
+
+    """
+
+    def __init__(self):
+        self._loaded = False
+        self._file_openers = {None: open}
+
+    def _load(self):
+        if self._loaded:
+            return
+        try:
+            import bz2
+            self._file_openers[".bz2"] = bz2.BZ2File
+        except ImportError:
+            pass
+        try:
+            import gzip
+            self._file_openers[".gz"] = gzip.open
+        except ImportError:
+            pass
+        self._loaded = True
+
+    def keys(self):
+        """
+        Return the keys of currently supported file openers.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        keys : list
+            The keys are None for uncompressed files and the file extension
+            strings (i.e. ``'.gz'``, ``'.bz2'``) for supported compression
+            methods.
+
+        """
+        self._load()
+        return list(self._file_openers.keys())
+
+    def __getitem__(self, key):
+        self._load()
+        return self._file_openers[key]
+
+_file_openers = _FileOpeners()
+
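+# A rough usage sketch for the _file_openers instance above: indexing by
+# extension returns the matching opener, with bz2/gzip imported lazily on
+# first lookup (the filename below is hypothetical).
+#
+#     >>> _file_openers.keys()
+#     [None, '.bz2', '.gz']
+#     >>> _file_openers['.gz']('data.txt.gz', mode='rb')  # gzip.open
+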
+def open(path, mode='r', destpath=os.curdir):
+    """
+    Open `path` with `mode` and return the file object.
+
+    If `path` is an URL, it will be downloaded, stored in the
+    `DataSource` `destpath` directory and opened from there.
+
+    Parameters
+    ----------
+    path : str
+        Local file path or URL to open.
+    mode : str, optional
+        Mode to open `path`. Mode 'r' for reading, 'w' for writing, 'a' to
+        append. Available modes depend on the type of object specified by
+        path.  Default is 'r'.
+    destpath : str, optional
+        Path to the directory where the source file gets downloaded to for
+        use.  If `destpath` is None, a temporary directory will be created.
+        The default path is the current directory.
+
+    Returns
+    -------
+    out : file object
+        The opened file.
+
+    Notes
+    -----
+    This is a convenience function that instantiates a `DataSource` and
+    returns the file object from ``DataSource.open(path)``.
+
+    """
+
+    ds = DataSource(destpath)
+    return ds.open(path, mode)
+
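+# A minimal sketch of the convenience wrapper above; the URL and destination
+# directory are hypothetical.
+#
+#     >>> fp = open('http://www.scipy.org/not/real/data.txt', destpath='/tmp')
+#     >>> data = fp.read()
+#     >>> fp.close()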
+
+class DataSource (object):
+    """
+    DataSource(destpath='.')
+
+    A generic data source file (file, http, ftp, ...).
+
+    DataSources can be local files or remote files/URLs.  The files may
+    also be compressed or uncompressed. DataSource hides some of the
+    low-level details of downloading the file, allowing you to simply pass
+    in a valid file path (or URL) and obtain a file object.
+
+    Parameters
+    ----------
+    destpath : str or None, optional
+        Path to the directory where the source file gets downloaded to for
+        use.  If `destpath` is None, a temporary directory will be created.
+        The default path is the current directory.
+
+    Notes
+    -----
+    URLs require a scheme string (``http://``) to be used, without it they
+    will fail::
+
+        >>> repos = DataSource()
+        >>> repos.exists('www.google.com/index.html')
+        False
+        >>> repos.exists('http://www.google.com/index.html')
+        True
+
+    Temporary directories are deleted when the DataSource is deleted.
+
+    Examples
+    --------
+    ::
+
+        >>> ds = DataSource('/home/guido')
+        >>> urlname = 'http://www.google.com/index.html'
+        >>> gfile = ds.open('http://www.google.com/index.html')  # remote file
+        >>> ds.abspath(urlname)
+        '/home/guido/www.google.com/index.html'
+
+        >>> ds = DataSource(None)  # use with temporary file
+        >>> ds.open('/home/guido/foobar.txt')
+        <open file '/home/guido/foobar.txt', mode 'r' at 0x91d4430>
+        >>> ds.abspath('/home/guido/foobar.txt')
+        '/tmp/tmpy4pgsP/home/guido/foobar.txt'
+
+    """
+
+    def __init__(self, destpath=os.curdir):
+        """Create a DataSource with a local path at destpath."""
+        if destpath:
+            self._destpath = os.path.abspath(destpath)
+            self._istmpdest = False
+        else:
+            import tempfile  # deferring import to improve startup time
+            self._destpath = tempfile.mkdtemp()
+            self._istmpdest = True
+
+    def __del__(self):
+        # Remove temp directories
+        if self._istmpdest:
+            shutil.rmtree(self._destpath)
+
+    def _iszip(self, filename):
+        """Test if the filename is a zip file by looking at the file extension.
+
+        """
+        fname, ext = os.path.splitext(filename)
+        return ext in _file_openers.keys()
+
+    def _iswritemode(self, mode):
+        """Test if the given mode will open a file for writing."""
+
+        # Currently only used to test the bz2 files.
+        _writemodes = ("w", "+")
+        for c in mode:
+            if c in _writemodes:
+                return True
+        return False
+
+    def _splitzipext(self, filename):
+        """Split zip extension from filename and return filename.
+
+        Returns
+        -------
+        base, zip_ext : {tuple}
+
+        """
+
+        if self._iszip(filename):
+            return os.path.splitext(filename)
+        else:
+            return filename, None
+
+    def _possible_names(self, filename):
+        """Return a tuple containing compressed filename variations."""
+        names = [filename]
+        if not self._iszip(filename):
+            for zipext in _file_openers.keys():
+                if zipext:
+                    names.append(filename+zipext)
+        return names
+
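+    # Rough sketch of the name variations produced above, assuming both bz2
+    # and gzip imported successfully (the filename is hypothetical):
+    #
+    #     >>> ds._possible_names('data.txt')
+    #     ['data.txt', 'data.txt.bz2', 'data.txt.gz']
+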
+    def _isurl(self, path):
+        """Test if path is a net location.  Tests the scheme and netloc."""
+
+        # We do this here to reduce the 'import numpy' initial import time.
+        if sys.version_info[0] >= 3:
+            from urllib.parse import urlparse
+        else:
+            from urlparse import urlparse
+
+        # BUG : URLs require a scheme string ('http://') to be used.
+        #       www.google.com will fail.
+        #       Should we prepend the scheme for those that don't have it and
+        #       test that also?  Similar to the way we append .gz and test
+        #       for compressed versions of files.
+
+        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+        return bool(scheme and netloc)
+
+    def _cache(self, path):
+        """Cache the file specified by path.
+
+        Creates a copy of the file in the datasource cache.
+
+        """
+        # We import these here because importing urllib2 is slow and
+        # a significant fraction of numpy's total import time.
+        if sys.version_info[0] >= 3:
+            from urllib.request import urlopen
+            from urllib.error import URLError
+        else:
+            from urllib2 import urlopen
+            from urllib2 import URLError
+
+        upath = self.abspath(path)
+
+        # ensure directory exists
+        if not os.path.exists(os.path.dirname(upath)):
+            os.makedirs(os.path.dirname(upath))
+
+        # TODO: Doesn't handle compressed files!
+        if self._isurl(path):
+            try:
+                openedurl = urlopen(path)
+                f = _open(upath, 'wb')
+                try:
+                    shutil.copyfileobj(openedurl, f)
+                finally:
+                    f.close()
+                    openedurl.close()
+            except URLError:
+                raise URLError("URL not found: %s" % path)
+        else:
+            shutil.copyfile(path, upath)
+        return upath
+
+    def _findfile(self, path):
+        """Searches for ``path`` and returns full path if found.
+
+        If path is an URL, _findfile will cache a local copy and return the
+        path to the cached file.  If path is a local file, _findfile will
+        return a path to that local file.
+
+        The search will include possible compressed versions of the file
+        and return the first occurrence found.
+
+        """
+
+        # Build list of possible local file paths
+        if not self._isurl(path):
+            # Valid local paths
+            filelist = self._possible_names(path)
+            # Paths in self._destpath
+            filelist += self._possible_names(self.abspath(path))
+        else:
+            # Cached URLs in self._destpath
+            filelist = self._possible_names(self.abspath(path))
+            # Remote URLs
+            filelist = filelist + self._possible_names(path)
+
+        for name in filelist:
+            if self.exists(name):
+                if self._isurl(name):
+                    name = self._cache(name)
+                return name
+        return None
+
+    def abspath(self, path):
+        """
+        Return absolute path of file in the DataSource directory.
+
+        If `path` is an URL, then `abspath` will return either the location
+        the file exists locally or the location it would exist when opened
+        using the `open` method.
+
+        Parameters
+        ----------
+        path : str
+            Can be a local file or a remote URL.
+
+        Returns
+        -------
+        out : str
+            Complete path, including the `DataSource` destination directory.
+
+        Notes
+        -----
+        The functionality is based on `os.path.abspath`.
+
+        """
+        # We do this here to reduce the 'import numpy' initial import time.
+        if sys.version_info[0] >= 3:
+            from urllib.parse import urlparse
+        else:
+            from urlparse import urlparse
+
+        # TODO:  This should be more robust.  Handles case where path includes
+        #        the destpath, but not other sub-paths. Failing case:
+        #        path = /home/guido/datafile.txt
+        #        destpath = /home/alex/
+        #        upath = self.abspath(path)
+        #        upath == '/home/alex/home/guido/datafile.txt'
+
+        # handle case where path includes self._destpath
+        splitpath = path.split(self._destpath, 2)
+        if len(splitpath) > 1:
+            path = splitpath[1]
+        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
+        netloc = self._sanitize_relative_path(netloc)
+        upath = self._sanitize_relative_path(upath)
+        return os.path.join(self._destpath, netloc, upath)
+
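+    # Illustrative mapping performed by abspath for a remote path, assuming a
+    # destpath of '/home/guido' as in the class docstring:
+    #
+    #     >>> ds.abspath('http://www.google.com/index.html')
+    #     '/home/guido/www.google.com/index.html'
+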
+    def _sanitize_relative_path(self, path):
+        """Return a sanitised relative path for which
+        os.path.abspath(os.path.join(base, path)).startswith(base)
+        """
+        last = None
+        path = os.path.normpath(path)
+        while path != last:
+            last = path
+            # Note: os.path.join treats '/' as os.sep on Windows
+            path = path.lstrip(os.sep).lstrip('/')
+            path = path.lstrip(os.pardir).lstrip('..')
+            drive, path = os.path.splitdrive(path)  # for Windows
+        return path
+
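+    # Sketch of the sanitizing step above on a POSIX system: leading
+    # separators and parent references are stripped so the joined result
+    # stays inside destpath (the input is hypothetical):
+    #
+    #     >>> ds._sanitize_relative_path('../../etc/passwd')
+    #     'etc/passwd'
+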
+    def exists(self, path):
+        """
+        Test if path exists.
+
+        Test if `path` exists as (and in this order):
+
+        - a local file.
+        - a remote URL that has been downloaded and stored locally in the
+          `DataSource` directory.
+        - a remote URL that has not been downloaded, but is valid and
+          accessible.
+
+        Parameters
+        ----------
+        path : str
+            Can be a local file or a remote URL.
+
+        Returns
+        -------
+        out : bool
+            True if `path` exists.
+
+        Notes
+        -----
+        When `path` is an URL, `exists` will return True if it's either
+        stored locally in the `DataSource` directory, or is a valid remote
+        URL.  `DataSource` does not discriminate between the two, the file
+        is accessible if it exists in either location.
+
+        """
+        # We import this here because importing urllib2 is slow and
+        # a significant fraction of numpy's total import time.
+        if sys.version_info[0] >= 3:
+            from urllib.request import urlopen
+            from urllib.error import URLError
+        else:
+            from urllib2 import urlopen
+            from urllib2 import URLError
+
+        # Test local path
+        if os.path.exists(path):
+            return True
+
+        # Test cached url
+        upath = self.abspath(path)
+        if os.path.exists(upath):
+            return True
+
+        # Test remote url
+        if self._isurl(path):
+            try:
+                netfile = urlopen(path)
+                netfile.close()
+                del netfile
+                return True
+            except URLError:
+                return False
+        return False
+
+    def open(self, path, mode='r'):
+        """
+        Open and return file-like object.
+
+        If `path` is an URL, it will be downloaded, stored in the
+        `DataSource` directory and opened from there.
+
+        Parameters
+        ----------
+        path : str
+            Local file path or URL to open.
+        mode : {'r', 'w', 'a'}, optional
+            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
+            'a' to append. Available modes depend on the type of object
+            specified by `path`. Default is 'r'.
+
+        Returns
+        -------
+        out : file object
+            File object.
+
+        """
+
+        # TODO: There is no support for opening a file for writing which
+        #       doesn't exist yet (creating a file).  Should there be?
+
+        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
+        #       used to store URLs in self._destpath.
+
+        if self._isurl(path) and self._iswritemode(mode):
+            raise ValueError("URLs are not writeable")
+
+        # NOTE: _findfile will fail on a new file opened for writing.
+        found = self._findfile(path)
+        if found:
+            _fname, ext = self._splitzipext(found)
+            if ext == '.bz2':
+                # BZ2File does not accept '+' in the mode string, so drop it.
+                mode = mode.replace("+", "")
+            return _file_openers[ext](found, mode=mode)
+        else:
+            raise IOError("%s not found." % path)
+
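+# Rough end-to-end sketch for DataSource.open: a gzip-compressed local file is
+# located by the compressed-name search and opened with gzip.open (paths are
+# hypothetical).
+#
+#     >>> ds = DataSource('/tmp/cache')
+#     >>> fp = ds.open('/data/measurements.txt')  # finds measurements.txt.gz
+#     >>> fp.readline()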
+
+class Repository (DataSource):
+    """
+    Repository(baseurl, destpath='.')
+
+    A data repository where multiple DataSources share a base
+    URL/directory.
+
+    `Repository` extends `DataSource` by prepending a base URL (or
+    directory) to all the files it handles. Use `Repository` when you will
+    be working with multiple files from one base URL.  Initialize
+    `Repository` with the base URL, then refer to each file by its filename
+    only.
+
+    Parameters
+    ----------
+    baseurl : str
+        Path to the local directory or remote location that contains the
+        data files.
+    destpath : str or None, optional
+        Path to the directory where the source file gets downloaded to for
+        use.  If `destpath` is None, a temporary directory will be created.
+        The default path is the current directory.
+
+    Examples
+    --------
+    To analyze all files in the repository, do something like this
+    (note: this is not self-contained code)::
+
+        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
+        >>> for filename in filelist:
+        ...     fp = repos.open(filename)
+        ...     fp.analyze()
+        ...     fp.close()
+
+    Similarly you could use a URL for a repository::
+
+        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')
+
+    """
+
+    def __init__(self, baseurl, destpath=os.curdir):
+        """Create a Repository with a shared url or directory of baseurl."""
+        DataSource.__init__(self, destpath=destpath)
+        self._baseurl = baseurl
+
+    def __del__(self):
+        DataSource.__del__(self)
+
+    def _fullpath(self, path):
+        """Return complete path for path.  Prepends baseurl if necessary."""
+        splitpath = path.split(self._baseurl, 2)
+        if len(splitpath) == 1:
+            result = os.path.join(self._baseurl, path)
+        else:
+            result = path    # path contains baseurl already
+        return result
+
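+    # Sketch of the baseurl prepending done by _fullpath, using the baseurl
+    # from the class docstring (the filename is hypothetical):
+    #
+    #     >>> repos = Repository('http://www.xyz.edu/data')
+    #     >>> repos._fullpath('measurements.txt')
+    #     'http://www.xyz.edu/data/measurements.txt'
+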
+    def _findfile(self, path):
+        """Extend DataSource method to prepend baseurl to ``path``."""
+        return DataSource._findfile(self, self._fullpath(path))
+
+    def abspath(self, path):
+        """
+        Return absolute path of file in the Repository directory.
+
+        If `path` is an URL, then `abspath` will return either the location
+        the file exists locally or the location it would exist when opened
+        using the `open` method.
+
+        Parameters
+        ----------
+        path : str
+            Can be a local file or a remote URL. This may, but does not
+            have to, include the `baseurl` with which the `Repository` was
+            initialized.
+
+        Returns
+        -------
+        out : str
+            Complete path, including the `DataSource` destination directory.
+
+        """
+        return DataSource.abspath(self, self._fullpath(path))
+
+    def exists(self, path):
+        """
+        Test if path exists prepending Repository base URL to path.
+
+        Test if `path` exists as (and in this order):
+
+        - a local file.
+        - a remote URL that has been downloaded and stored locally in the
+          `DataSource` directory.
+        - a remote URL that has not been downloaded, but is valid and
+          accessible.
+
+        Parameters
+        ----------
+        path : str
+            Can be a local file or a remote URL. This may, but does not
+            have to, include the `baseurl` with which the `Repository` was
+            initialized.
+
+        Returns
+        -------
+        out : bool
+            True if `path` exists.
+
+        Notes
+        -----
+        When `path` is an URL, `exists` will return True if it's either
+        stored locally in the `DataSource` directory, or is a valid remote
+        URL.  `DataSource` does not discriminate between the two, the file
+        is accessible if it exists in either location.
+
+        """
+        return DataSource.exists(self, self._fullpath(path))
+
+    def open(self, path, mode='r'):
+        """
+        Open and return file-like object prepending Repository base URL.
+
+        If `path` is an URL, it will be downloaded, stored in the
+        DataSource directory and opened from there.
+
+        Parameters
+        ----------
+        path : str
+            Local file path or URL to open. This may, but does not have to,
+            include the `baseurl` with which the `Repository` was
+            initialized.
+        mode : {'r', 'w', 'a'}, optional
+            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
+            'a' to append. Available modes depend on the type of object
+            specified by `path`. Default is 'r'.
+
+        Returns
+        -------
+        out : file object
+            File object.
+
+        """
+        return DataSource.open(self, self._fullpath(path), mode)
+
+    def listdir(self):
+        """
+        List files in the source Repository.
+
+        Returns
+        -------
+        files : list of str
+            List of file names (not containing a directory part).
+
+        Notes
+        -----
+        Does not currently work for remote repositories.
+
+        """
+        if self._isurl(self._baseurl):
+            raise NotImplementedError(
+                  "Directory listing of URLs, not supported yet.")
+        else:
+            return os.listdir(self._baseurl)
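+
+# A small sketch of Repository.listdir for a local repository; the directory
+# and its contents are hypothetical.
+#
+#     >>> repos = Repository('/home/user/data/dir/')
+#     >>> repos.listdir()
+#     ['measurements.txt', 'measurements.txt.gz']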