Chris@87: """
Chris@87: Collection of utilities to manipulate structured arrays.
Chris@87: 
Chris@87: Most of these functions were initially implemented by John Hunter for
Chris@87: matplotlib.  They have been rewritten and extended for convenience.
Chris@87: 
Chris@87: """
Chris@87: from __future__ import division, absolute_import, print_function
Chris@87: 
Chris@87: import sys
Chris@87: import itertools
Chris@87: import numpy as np
Chris@87: import numpy.ma as ma
Chris@87: from numpy import ndarray, recarray
Chris@87: from numpy.ma import MaskedArray
Chris@87: from numpy.ma.mrecords import MaskedRecords
Chris@87: from numpy.lib._iotools import _is_string_like
Chris@87: from numpy.compat import basestring
Chris@87: 
Chris@87: if sys.version_info[0] < 3:
Chris@87:     from future_builtins import zip
Chris@87: 
# Private numpy.ma helper, aliased here for brevity.  merge_arrays calls it
# as ``_check_fill_value(fill_value, dtype)`` to get a padding value for the
# dtype of each input array.
_check_fill_value = np.ma.core._check_fill_value


# Names exported by a star-import of this module.
__all__ = [
    'append_fields', 'drop_fields', 'find_duplicates',
    'get_fieldstructure', 'join_by', 'merge_arrays',
    'rec_append_fields', 'rec_drop_fields', 'rec_join',
    'recursive_fill_fields', 'rename_fields', 'stack_arrays',
    ]
Chris@87: 
Chris@87: 
def recursive_fill_fields(input, output):
    """
    Copy into `output` every field it shares with `input`.

    Nested (structured) fields are handled by recursing into them.  Fields
    of `output` that `input` does not have are left untouched.

    Parameters
    ----------
    input : ndarray
        Structured array providing the values.
    output : ndarray
        Structured array receiving the values; it is modified in place.

    Returns
    -------
    output : ndarray
        The very same `output` object, after filling.

    Notes
    -----
    * `output` should be at least the same size as `input`: only the first
      ``len(input)`` entries of each field are overwritten.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b)
    array([(1, 10.0), (2, 20.0), (0, 0.0)],
          dtype=[('A', '<i4'), ('B', '<f8')])

    """
    for name in output.dtype.names:
        try:
            source = input[name]
        except ValueError:
            # `input` has no field with that name: nothing to copy.
            continue
        target = output[name]
        if source.dtype.names:
            # Structured sub-field: descend one level.
            recursive_fill_fields(source, target)
        else:
            target[:len(source)] = source
    return output
Chris@87: 
Chris@87: 
def get_names(adtype):
    """
    Returns the field names of the input datatype as a tuple.

    Nested fields are reported as ``(name, (subnames...))`` pairs.

    Parameters
    ----------
    adtype : dtype or ndarray
        Input datatype.  An array is also accepted, in which case its
        ``dtype`` is used.

    Returns
    -------
    names : tuple or None
        Tuple of field names, or None if the datatype has no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names(np.empty((1,), dtype=int)) is None
    True
    >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]))
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))
    """
    # The documented examples pass arrays, so accept them as well as dtypes.
    if isinstance(adtype, np.ndarray):
        adtype = adtype.dtype
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        current = adtype[name]
        if current.names:
            listnames.append((name, tuple(get_names(current))))
        else:
            listnames.append(name)
    return tuple(listnames) or None
Chris@87: 
Chris@87: 
def get_names_flat(adtype):
    """
    Returns the field names of the input datatype as a tuple. Nested
    structures are flattened beforehand.

    A nested field contributes its own name followed by the names of its
    sub-fields.

    Parameters
    ----------
    adtype : dtype or ndarray
        Input datatype.  An array is also accepted, in which case its
        ``dtype`` is used.

    Returns
    -------
    names : tuple or None
        Flat tuple of field names, or None if the datatype has no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None
    True
    >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)]))
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')
    """
    # The documented examples pass arrays, so accept them as well as dtypes.
    if isinstance(adtype, np.ndarray):
        adtype = adtype.dtype
    names = adtype.names
    if names is None:
        # Unstructured datatype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        listnames.append(name)
        current = adtype[name]
        if current.names:
            listnames.extend(get_names_flat(current))
    return tuple(listnames) or None
Chris@87: 
Chris@87: 
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description.

    Parameters
    ----------
    ndtype : dtype
        Input datatype.

    Returns
    -------
    descr : tuple or list
        For a structured datatype, a tuple of ``(name, dtype)`` pairs, one
        per leaf field, with nested fields expanded in place.  For a
        datatype without fields, ``ndtype.descr`` is returned unchanged.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    names = ndtype.names
    if names is None:
        return ndtype.descr
    descr = []
    for field in names:
        # An entry of ``fields`` is (dtype, offset) or, for titled fields,
        # (dtype, offset, title): only the dtype is needed here.
        typ = ndtype.fields[field][0]
        if typ.names:
            descr.extend(flatten_descr(typ))
        else:
            descr.append((field, typ))
    return tuple(descr)
Chris@87: 
Chris@87: 
def zip_descr(seqarrays, flatten=False):
    """
    Build the description of the dtype obtained by putting the dtypes of
    several arrays side by side.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays
    flatten : {boolean}, optional
        Whether to collapse nested descriptions.

    Returns
    -------
    descr : list
        The ``descr`` of the combined datatype.
    """
    combined = []
    for arr in seqarrays:
        if flatten:
            combined.extend(flatten_descr(arr.dtype))
            continue
        adtype = arr.dtype
        if len(adtype.names or ()) > 1:
            # Several fields: keep them grouped as one (unnamed) nested field.
            combined.append(('', adtype.descr))
        else:
            combined.extend(adtype.descr)
    # The round trip through np.dtype gives default names to unnamed fields.
    return np.dtype(combined).descr
Chris@87: 
Chris@87: 
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other fields.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).

    Returns
    -------
    parents : dict or None
        Maps each field name to the list of its enclosing fields, outermost
        first (an empty list for top-level fields).  None is returned when
        the datatype has no fields.

    Notes
    -----
    The result is keyed by bare field name, so a name used at several
    nesting levels keeps only the entry processed last.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype =  np.dtype([('A', int),
    ...                     ('B', [('BA', int),
    ...                            ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}

    """
    if parents is None:
        parents = {}
    names = adtype.names
    if names is None:
        # Unstructured datatype: no field to report.
        return parents or None
    for name in names:
        current = adtype[name]
        if lastname:
            # Full ancestry: the parent's own ancestors, then the parent.
            # This is recorded for nested structures too, so that deeper
            # levels inherit the complete chain.
            parents[name] = list(parents.get(lastname, [])) + [lastname]
        else:
            parents[name] = []
        if current.names:
            # `parents` is shared with the recursive call and updated in place.
            get_fieldstructure(current, name, parents)
    return parents or None
Chris@87: 
Chris@87: 
def _izip_fields_flat(iterable):
    """
    Yield the items of `iterable` one by one, expanding every ``np.void``
    record met on the way (at any depth) into its individual fields.

    """
    # Explicit stack of iterators instead of recursive generators: the top
    # of the stack is the record currently being expanded.
    pending = [iter(iterable)]
    while pending:
        try:
            item = next(pending[-1])
        except StopIteration:
            pending.pop()
            continue
        if isinstance(item, np.void):
            pending.append(iter(tuple(item)))
        else:
            yield item
Chris@87: 
def _izip_fields(iterable):
    """
    Yield the items of `iterable` one by one, descending into nested
    iterables (strings excepted) and into ``np.void`` records that hold a
    single field.  Anything else is yielded as is.

    """
    for item in iterable:
        # Generic containers are expanded, strings are kept whole.
        descend = (hasattr(item, '__iter__') and
                   not isinstance(item, basestring))
        if not descend:
            # One-field records are unwrapped as well.
            descend = isinstance(item, np.void) and len(tuple(item)) == 1
        if not descend:
            yield item
            continue
        for leaf in _izip_fields(item):
            yield leaf
Chris@87: 
Chris@87: 
def izip_records(seqarrays, fill_value=None, flatten=True):
    """
    Returns an iterator of concatenated items from a sequence of arrays.

    The inputs are walked in parallel; inputs that run out first are padded
    with `fill_value` until the longest one is exhausted.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays.
    fill_value : {None, integer}
        Value used to pad shorter iterables.
    flatten : {True, False},
        Whether to expand nested records with `_izip_fields_flat` (True) or
        to use `_izip_fields` (False) when building each output tuple.
    """
    # Same trick as the pure-Python recipe for itertools.izip_longest: each
    # input that runs dry takes one value out of a budget of
    # ``len(seqarrays) - 1`` entries.  When the last input runs dry the
    # budget is empty, ``pop`` raises IndexError and the iteration stops.
    budget = [fill_value] * (len(seqarrays) - 1)

    def _exhausted():
        "Yields the fill_value or raises IndexError"
        yield budget.pop()

    padding = itertools.repeat(fill_value)
    padded = [itertools.chain(seq, _exhausted(), padding)
              for seq in seqarrays]
    # Should we flatten the items, or just use a nested approach
    flattener = _izip_fields_flat if flatten else _izip_fields
    try:
        for row in zip(*padded):
            yield tuple(flattener(row))
    except IndexError:
        pass
Chris@87: 
def _fix_output(output, usemask=True, asrecarray=False):
    """
    Private function: cast `output` to the container selected by the flags.

    A MaskedArray (or MaskedRecords if `asrecarray`) is returned only when
    `usemask` is set and `output` is already a MaskedArray.  Otherwise the
    masked entries are filled and a ndarray (or recarray if `asrecarray`)
    is returned.
    """
    keep_mask = usemask and isinstance(output, MaskedArray)
    if keep_mask:
        return output.view(MaskedRecords) if asrecarray else output
    filled = ma.filled(output)
    return filled.view(recarray) if asrecarray else filled
Chris@87: 
Chris@87: 
def _fix_defaults(output, defaults=None):
    """
    Update the fill_value and masked data of `output`
    from the default given in a dictionary defaults.

    Parameters
    ----------
    output : MaskedArray
        Array to update; it is modified in place.
    defaults : dictionary, optional
        Maps field names to default values.  Keys that are not fields of
        `output` are ignored.

    Returns
    -------
    output : MaskedArray
        The same `output` object.
    """
    # An unstructured array has no field names: every default is ignored.
    names = output.dtype.names or ()
    (data, fill_value) = (output.data, output.fill_value)
    # getmaskarray always gives a full mask array, even when `output`
    # carries no mask of its own.
    mask = ma.getmaskarray(output)
    for (k, v) in (defaults or {}).items():
        if k in names:
            fill_value[k] = v
            data[k][mask[k]] = v
    return output
Chris@87: 
Chris@87: 
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    The inputs are put side by side: record ``i`` of the output gathers
    item ``i`` of every input.  Inputs shorter than the longest one are
    padded (masked if `usemask`, filled from `fill_value` otherwise).

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays.  A single array is accepted as well.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Returns
    -------
    merged : ndarray, recarray, MaskedArray or MaskedRecords
        The container type is selected by `usemask` and `asrecarray`.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
    ...              usemask=True)
    masked_array(data = [(1, 10.0) (2, 20.0) (--, 30.0)],
                 mask = [(False, False) (False, False) (True, False)],
           fill_value = (999999, 1e+20),
                dtype = [('f0', '<i4'), ('f1', '<f8')])

    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
    ...              usemask=False)
    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
          dtype=[('f0', '<i4'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]),
    ...               np.array([10., 20., 30.])),
    ...              usemask=False, asrecarray=True)
    rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)],
              dtype=[('a', '<i4'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
    * depending on what its corresponding type:
            -1      for integers
            -1.0    for floating point numbers
            '-'     for characters
            '-1'    for strings
            True    for boolean values
    * XXX: I just obtained these values empirically
    """
    # Only one item in the input sequence ?
    # (this also triggers for a single array that holds one element)
    if (len(seqarrays) == 1):
        seqarrays = np.asanyarray(seqarrays[0])
    # Do we have a single ndarray as input ?
    if isinstance(seqarrays, (ndarray, np.void)):
        seqdtype = seqarrays.dtype
        # Shortcut when no flattening is requested, or when the dtype is
        # already flat (its flattened description equals its own).
        if (not flatten) or \
           (zip_descr((seqarrays,), flatten=True) == seqdtype.descr):
            # Minimal processing needed: just make sure everything's a-ok
            seqarrays = seqarrays.ravel()
            # Make sure we have named fields
            if not seqdtype.names:
                seqdtype = [('', seqdtype)]
            # Find what type of array we must return
            if usemask:
                if asrecarray:
                    seqtype = MaskedRecords
                else:
                    seqtype = MaskedArray
            elif asrecarray:
                seqtype = recarray
            else:
                seqtype = ndarray
            return seqarrays.view(dtype=seqdtype, type=seqtype)
        else:
            # Nested dtype to flatten: go through the general code below.
            seqarrays = (seqarrays,)
    else:
        # Make sure we have arrays in the input sequence
        seqarrays = [np.asanyarray(_m) for _m in seqarrays]
    # Find the sizes of the inputs and their maximum
    sizes = tuple(a.size for a in seqarrays)
    maxlength = max(sizes)
    # Get the dtype of the output (flattening if needed)
    newdtype = zip_descr(seqarrays, flatten=flatten)
    # Initialize the sequences for data and mask
    seqdata = []
    seqmask = []
    # If we expect some kind of MaskedArray, make a special loop.
    if usemask:
        for (a, n) in zip(seqarrays, sizes):
            # Number of padding items needed to reach the longest input
            nbmissing = (maxlength - n)
            # Get the data and mask
            data = a.ravel().__array__()
            mask = ma.getmaskarray(a).ravel()
            # Get the filling value (if needed)
            # NOTE(review): when nbmissing is non-zero, fmsk is only assigned
            # inside the isinstance branch below -- presumably
            # _check_fill_value always returns an ndarray here; confirm.
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        # One-field record: use the bare value it holds
                        fval = fval.item()[0]
                        fmsk = True
                    else:
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=mask.dtype)
            else:
                fval = None
                fmsk = True
            # Store an iterator padding the input to the expected length
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
            seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
        # Create an iterator for the data
        data = tuple(izip_records(seqdata, flatten=flatten))
        output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
                          mask=list(izip_records(seqmask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
    else:
        # Same as before, without the mask we don't need...
        for (a, n) in zip(seqarrays, sizes):
            nbmissing = (maxlength - n)
            data = a.ravel().__array__()
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                    else:
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
            else:
                fval = None
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
        output = np.fromiter(tuple(izip_records(seqdata, flatten=flatten)),
                             dtype=newdtype, count=maxlength)
        if asrecarray:
            output = output.view(recarray)
    # And we're done...
    return output
Chris@87: 
Chris@87: 
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a copy of `base` without the fields listed in `drop_names`.

    Nested fields are supported: a name is dropped at whatever level it is
    found, and a nested field left without any sub-field is removed too.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {True, False}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Returns
    -------
    output : array or None
        The new array, or None if no field is left.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
    >>> rfn.drop_fields(a, 'a')
    array([((2.0, 3),), ((5.0, 6),)],
          dtype=[('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.drop_fields(a, 'ba')
    array([(1, (3,)), (4, (6,))],
          dtype=[('a', '<i4'), ('b', [('bb', '<i4')])])
    >>> rfn.drop_fields(a, ['ba', 'bb'])
    array([(1,), (4,)],
          dtype=[('a', '<i4')])
    """
    if _is_string_like(drop_names):
        drop_names = [drop_names, ]
    else:
        drop_names = set(drop_names)

    def _drop_descr(ndtype, drop_names):
        # Description of `ndtype` restricted to the fields that are kept.
        kept = []
        for name in ndtype.names:
            if name in drop_names:
                continue
            fieldtype = ndtype[name]
            if not fieldtype.names:
                kept.append((name, fieldtype))
                continue
            subdescr = _drop_descr(fieldtype, drop_names)
            if subdescr:
                kept.append((name, subdescr))
        return kept

    newdtype = _drop_descr(base.dtype, drop_names)
    if not newdtype:
        return None

    # Allocate the reduced array, then copy the surviving fields into it.
    output = recursive_fill_fields(base, np.empty(base.shape, dtype=newdtype))
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
Chris@87: 
Chris@87: 
def rec_drop_fields(base, drop_names):
    """
    Drop the fields listed in `drop_names` from `base` and return the
    result as a numpy.recarray.

    Thin wrapper around `drop_fields`, called with ``usemask=False`` and
    ``asrecarray=True``.
    """
    return drop_fields(base, drop_names, asrecarray=True, usemask=False)
Chris@87: 
Chris@87: 
def rename_fields(base, namemapper):
    """
    Return a view of `base` in which the fields have been renamed.

    Works on flexible-datatype ndarrays and recarrays.  Nested fields are
    supported: the mapping is applied at every nesting level.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version.  Names
        missing from the dictionary are left unchanged.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
    array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))],
          dtype=[('A', '<i4'), ('b', [('ba', '<f8'), ('BB', '<f8', 2)])])

    """
    def _recursive_rename_fields(ndtype, namemapper):
        # Rebuild the description of `ndtype` with the new names.
        renamed = []
        for oldname in ndtype.names:
            fieldtype = ndtype[oldname]
            if fieldtype.names:
                fieldtype = _recursive_rename_fields(fieldtype, namemapper)
            renamed.append((namemapper.get(oldname, oldname), fieldtype))
        return renamed

    return base.view(_recursive_rename_fields(base.dtype, namemapper))
Chris@87: 
Chris@87: 
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Returns
    -------
    output : array
        New array holding the fields of `base` followed by the new fields.
        Its container type is selected by `usemask` and `asrecarray`.

    Raises
    ------
    ValueError
        If `names` is a list or tuple whose length differs from that of
        `data`, or if the number of `dtypes` is neither one nor the number
        of arrays in `data`.

    """
    # Check the names
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, basestring):
        names = [names, ]
        data = [data, ]
    # Turn every new column into a one-field structured array.
    # np.asanyarray converts without copying when possible and keeps
    # subclasses (such as masked arrays).
    if dtypes is None:
        data = [np.asanyarray(a) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            if len(dtypes) == 1:
                # A single dtype applies to every new field
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.asanyarray(a, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    #
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    # Start from a fully masked array as long as the longer of the two
    # parts, then copy both parts into it.
    output = ma.masked_all(max(len(base), len(data)),
                           dtype=base.dtype.descr + data.dtype.descr)
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    #
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
Chris@87: 
Chris@87: 
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array and return a recarray.

    Thin wrapper around `append_fields`, called with ``asrecarray=True``
    and ``usemask=False``.  As for `append_fields`, `names`, `data` and
    `dtypes` may be single values instead of lists when a single field is
    appended.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray
    """
    options = dict(data=data, dtypes=dtypes, asrecarray=True, usemask=False)
    return append_fields(base, names, **options)
Chris@87: 
Chris@87: 
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    The records of the inputs are put one after the other.  A field that
    is missing from an input is masked in the corresponding records.

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays.  A single array, or a sequence holding a
        single array, is returned as is.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Raises
    ------
    TypeError
        If a field present in several inputs has different types (or
        sub-array shapes) and `autoconvert` is False.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', float), ('C', float)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0)
     ('c', 30.0, 300.0)],
                 mask = [(False, False, True) (False, False, True) (False, False, False)
     (False, False, False) (False, False, False)],
           fill_value = ('N/A', 1e+20, 1e+20),
                dtype = [('A', '|S3'), ('B', '<f8'), ('C', '<f8')])

    """
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]

    def _typespec(entry):
        # A descr entry is (name, type) or, for sub-array fields,
        # (name, type, shape): return everything but the name.
        if len(entry) == 2:
            return entry[1]
        return tuple(entry[1:])

    # Build the description of the output: the fields of the first input,
    # followed by the fields that only appear in later inputs.
    dtype_l = ndtype[0]
    newdescr = dtype_l.descr
    names = [_[0] for _ in newdescr]
    for dtype_n in ndtype[1:]:
        for descr in dtype_n.descr:
            name = descr[0] or ''
            if name not in names:
                newdescr.append(descr)
                names.append(name)
            else:
                nameidx = names.index(name)
                current_descr = newdescr[nameidx]
                # The type is always the second item of a descr entry; the
                # last item is the shape for sub-array fields.
                if autoconvert:
                    if np.dtype(descr[1]) > np.dtype(current_descr[1]):
                        current_descr = list(current_descr)
                        current_descr[1] = descr[1]
                        newdescr[nameidx] = tuple(current_descr)
                elif _typespec(descr) != _typespec(current_descr):
                    raise TypeError("Incompatible type '%s' <> '%s'" %
                                    (_typespec(current_descr),
                                     _typespec(descr)))
    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Start fully masked, then unmask what each input provides
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        # Input k fills the records offset[k]:offset[k + 1]
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)
    #
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
Chris@87: 
Chris@87: 
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array.  It is flattened before the search.  Plain ndarrays
        and MaskedArrays without a mask are accepted as well.
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Returns
    -------
    duplicates : ndarray or MaskedArray
        The entries of the flattened input that appear more than once,
        in sorted order.
    index : ndarray
        Positions of `duplicates` in the flattened input.  Only returned
        if `return_index` is True.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    ... # XXX: judging by the output, the ignoremask flag has no effect
    """
    a = np.asanyarray(a).ravel()
    # Get the sorting data (by selecting the corresponding field)
    base = a
    if key:
        # The field structure is only needed to walk down to a (possibly
        # nested) key, so it is not computed for a search by records.
        for f in get_fieldstructure(a.dtype)[key]:
            base = base[f]
        base = base[key]
    # Nothing can be duplicated in an empty input; returning early also
    # avoids building a flag array longer than the data.
    if a.size == 0:
        if return_index:
            return (a, np.empty((0,), dtype=np.intp))
        return a
    # Get the sorting indices and the sorted data
    sortidx = base.argsort()
    sortedbase = base[sortidx]
    # ma.filled leaves plain ndarrays untouched
    sorteddata = ma.filled(sortedbase)
    # Compare the sorting data: flag[i] is True if item i+1 equals item i
    flag = (sorteddata[:-1] == sorteddata[1:])
    # If masked data must be ignored, set the flag to false where needed.
    # There is nothing to reset when the input carries no mask at all.
    if ignoremask and ma.getmask(sortedbase) is not ma.nomask:
        sortedmask = sortedbase.recordmask
        flag[sortedmask[1:]] = False
    flag = np.concatenate(([False], flag))
    # We need to take the point on the left as well (else we're missing it)
    flag[:-1] = flag[:-1] | flag[1:]
    duplicates = a[sortidx][flag]
    if return_index:
        return (duplicates, sortidx[flag])
    else:
        return duplicates
Chris@87: 
Chris@87: 
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent from the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent from the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords if
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the supported values, if a key field is
        missing from `r1` or `r2`, or if `r1` and `r2` share non-key field
        names while both `r1postfix` and `r2postfix` are empty.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...
    * When a key field has different dtypes in `r1` and `r2`, the output
      uses the one the other can be safely cast to (dtype ordering).

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
                "The 'jointype' argument should be in 'inner', "
                "'outer' or 'leftouter' (got '%s' instead)" % jointype
                )
    # If we have a single key, put it in a tuple
    if isinstance(key, basestring):
        key = (key,)

    # Check the keys
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %s' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %s' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    # Entries of the concatenated key array with an index < nb1 come from r1
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    if (set.intersection(set(r1names), set(r2names)).difference(key) and
            not (r1postfix or r2postfix)):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    r1k = drop_fields(r1, [n for n in r1names if n not in key])
    r2k = drop_fields(r2, [n for n in r2names if n not in key])

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: an entry is common if it equals its neighbour
    # in the sorted key array (either on the left or on the right)
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = [list(_) for _ in r1k.dtype.descr]
    # Add the other fields
    ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
    # Names of the output fields collected so far; this list is kept in
    # step with `ndtype` so that names.index() gives a position in ndtype
    names = [_[0] for _ in ndtype]
    for desc in r2.dtype.descr:
        desc = list(desc)
        name = desc[0]
        # Have we seen the current name already ?
        if name in names:
            # Look the field up by name only: the r2 dtype may differ
            nameidx = names.index(name)
            current = ndtype[nameidx]
            # The current field is part of the key: take the largest dtype
            if name in key:
                # Compare actual dtypes, not their string codes
                if np.dtype(desc[1]) > np.dtype(current[1]):
                    current[1] = desc[1]
            # The current field is not part of the key: add the suffixes
            else:
                current[0] += r1postfix
                desc[0] += r2postfix
                ndtype.insert(nameidx + 1, desc)
                names[nameidx] = current[0]
                names.insert(nameidx + 1, desc[0])
        #... we haven't: just add the description to the current list
        else:
            names.append(name)
            ndtype.append(desc)
    # Revert the elements to tuples
    ndtype = [tuple(_) for _ in ndtype]
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            # Entries specific to r1 are stored right after the common ones
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            # Entries specific to r2 are stored at the very end
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
Chris@87: 
Chris@87: 
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.

    Convenience wrapper around `join_by`: all arguments are forwarded
    unchanged, with ``usemask=False`` and ``asrecarray=True`` imposed so
    that the result is requested as an unmasked np.recarray.

    See Also
    --------
    join_by : equivalent function
    """
    return join_by(key, r1, r2,
                   jointype=jointype,
                   r1postfix=r1postfix,
                   r2postfix=r2postfix,
                   defaults=defaults,
                   usemask=False,
                   asrecarray=True)