Chris@87: """ Chris@87: Collection of utilities to manipulate structured arrays. Chris@87: Chris@87: Most of these functions were initially implemented by John Hunter for Chris@87: matplotlib. They have been rewritten and extended for convenience. Chris@87: Chris@87: """ Chris@87: from __future__ import division, absolute_import, print_function Chris@87: Chris@87: import sys Chris@87: import itertools Chris@87: import numpy as np Chris@87: import numpy.ma as ma Chris@87: from numpy import ndarray, recarray Chris@87: from numpy.ma import MaskedArray Chris@87: from numpy.ma.mrecords import MaskedRecords Chris@87: from numpy.lib._iotools import _is_string_like Chris@87: from numpy.compat import basestring Chris@87: Chris@87: if sys.version_info[0] < 3: Chris@87: from future_builtins import zip Chris@87: Chris@87: _check_fill_value = np.ma.core._check_fill_value Chris@87: Chris@87: Chris@87: __all__ = [ Chris@87: 'append_fields', 'drop_fields', 'find_duplicates', Chris@87: 'get_fieldstructure', 'join_by', 'merge_arrays', Chris@87: 'rec_append_fields', 'rec_drop_fields', 'rec_join', Chris@87: 'recursive_fill_fields', 'rename_fields', 'stack_arrays', Chris@87: ] Chris@87: Chris@87: Chris@87: def recursive_fill_fields(input, output): Chris@87: """ Chris@87: Fills fields from output with fields from input, Chris@87: with support for nested structures. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: input : ndarray Chris@87: Input array. Chris@87: output : ndarray Chris@87: Output array. Chris@87: Chris@87: Notes Chris@87: ----- Chris@87: * `output` should be at least the same size as `input` Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)]) Chris@87: >>> b = np.zeros((3,), dtype=a.dtype) Chris@87: >>> rfn.recursive_fill_fields(a, b) Chris@87: array([(1, 10.0), (2, 20.0), (0, 0.0)], Chris@87: dtype=[('A', '>> from numpy.lib import recfunctions as rfn Chris@87: >>> rfn.get_names(np.empty((1,), dtype=int)) is None Chris@87: True Chris@87: >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)])) Chris@87: ('A', 'B') Chris@87: >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])]) Chris@87: >>> rfn.get_names(adtype) Chris@87: ('a', ('b', ('ba', 'bb'))) Chris@87: """ Chris@87: listnames = [] Chris@87: names = adtype.names Chris@87: for name in names: Chris@87: current = adtype[name] Chris@87: if current.names: Chris@87: listnames.append((name, tuple(get_names(current)))) Chris@87: else: Chris@87: listnames.append(name) Chris@87: return tuple(listnames) or None Chris@87: Chris@87: Chris@87: def get_names_flat(adtype): Chris@87: """ Chris@87: Returns the field names of the input datatype as a tuple. Nested structure Chris@87: are flattend beforehand. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: adtype : dtype Chris@87: Input datatype Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None Chris@87: True Chris@87: >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)])) Chris@87: ('A', 'B') Chris@87: >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])]) Chris@87: >>> rfn.get_names_flat(adtype) Chris@87: ('a', 'b', 'ba', 'bb') Chris@87: """ Chris@87: listnames = [] Chris@87: names = adtype.names Chris@87: for name in names: Chris@87: listnames.append(name) Chris@87: current = adtype[name] Chris@87: if current.names: Chris@87: listnames.extend(get_names_flat(current)) Chris@87: return tuple(listnames) or None Chris@87: Chris@87: Chris@87: def flatten_descr(ndtype): Chris@87: """ Chris@87: Flatten a structured data-type description. Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> ndtype = np.dtype([('a', '>> rfn.flatten_descr(ndtype) Chris@87: (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32'))) Chris@87: Chris@87: """ Chris@87: names = ndtype.names Chris@87: if names is None: Chris@87: return ndtype.descr Chris@87: else: Chris@87: descr = [] Chris@87: for field in names: Chris@87: (typ, _) = ndtype.fields[field] Chris@87: if typ.names: Chris@87: descr.extend(flatten_descr(typ)) Chris@87: else: Chris@87: descr.append((field, typ)) Chris@87: return tuple(descr) Chris@87: Chris@87: Chris@87: def zip_descr(seqarrays, flatten=False): Chris@87: """ Chris@87: Combine the dtype description of a series of arrays. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: seqarrays : sequence of arrays Chris@87: Sequence of arrays Chris@87: flatten : {boolean}, optional Chris@87: Whether to collapse nested descriptions. Chris@87: """ Chris@87: newdtype = [] Chris@87: if flatten: Chris@87: for a in seqarrays: Chris@87: newdtype.extend(flatten_descr(a.dtype)) Chris@87: else: Chris@87: for a in seqarrays: Chris@87: current = a.dtype Chris@87: names = current.names or () Chris@87: if len(names) > 1: Chris@87: newdtype.append(('', current.descr)) Chris@87: else: Chris@87: newdtype.extend(current.descr) Chris@87: return np.dtype(newdtype).descr Chris@87: Chris@87: Chris@87: def get_fieldstructure(adtype, lastname=None, parents=None,): Chris@87: """ Chris@87: Returns a dictionary with fields indexing lists of their parent fields. Chris@87: Chris@87: This function is used to simplify access to fields nested in other fields. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: adtype : np.dtype Chris@87: Input datatype Chris@87: lastname : optional Chris@87: Last processed field name (used internally during recursion). Chris@87: parents : dictionary Chris@87: Dictionary of parent fields (used interbally during recursion). Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> ndtype = np.dtype([('A', int), Chris@87: ... ('B', [('BA', int), Chris@87: ... ('BB', [('BBA', int), ('BBB', int)])])]) Chris@87: >>> rfn.get_fieldstructure(ndtype) Chris@87: ... # XXX: possible regression, order of BBA and BBB is swapped Chris@87: {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']} Chris@87: Chris@87: """ Chris@87: if parents is None: Chris@87: parents = {} Chris@87: names = adtype.names Chris@87: for name in names: Chris@87: current = adtype[name] Chris@87: if current.names: Chris@87: if lastname: Chris@87: parents[name] = [lastname, ] Chris@87: else: Chris@87: parents[name] = [] Chris@87: parents.update(get_fieldstructure(current, name, parents)) Chris@87: else: Chris@87: lastparent = [_ for _ in (parents.get(lastname, []) or [])] Chris@87: if lastparent: Chris@87: lastparent.append(lastname) Chris@87: elif lastname: Chris@87: lastparent = [lastname, ] Chris@87: parents[name] = lastparent or [] Chris@87: return parents or None Chris@87: Chris@87: Chris@87: def _izip_fields_flat(iterable): Chris@87: """ Chris@87: Returns an iterator of concatenated fields from a sequence of arrays, Chris@87: collapsing any nested structure. Chris@87: Chris@87: """ Chris@87: for element in iterable: Chris@87: if isinstance(element, np.void): Chris@87: for f in _izip_fields_flat(tuple(element)): Chris@87: yield f Chris@87: else: Chris@87: yield element Chris@87: Chris@87: Chris@87: def _izip_fields(iterable): Chris@87: """ Chris@87: Returns an iterator of concatenated fields from a sequence of arrays. Chris@87: Chris@87: """ Chris@87: for element in iterable: Chris@87: if (hasattr(element, '__iter__') and Chris@87: not isinstance(element, basestring)): Chris@87: for f in _izip_fields(element): Chris@87: yield f Chris@87: elif isinstance(element, np.void) and len(tuple(element)) == 1: Chris@87: for f in _izip_fields(element): Chris@87: yield f Chris@87: else: Chris@87: yield element Chris@87: Chris@87: Chris@87: def izip_records(seqarrays, fill_value=None, flatten=True): Chris@87: """ Chris@87: Returns an iterator of concatenated items from a sequence of arrays. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: seqarray : sequence of arrays Chris@87: Sequence of arrays. Chris@87: fill_value : {None, integer} Chris@87: Value used to pad shorter iterables. Chris@87: flatten : {True, False}, Chris@87: Whether to Chris@87: """ Chris@87: # OK, that's a complete ripoff from Python2.6 itertools.izip_longest Chris@87: def sentinel(counter=([fill_value] * (len(seqarrays) - 1)).pop): Chris@87: "Yields the fill_value or raises IndexError" Chris@87: yield counter() Chris@87: # Chris@87: fillers = itertools.repeat(fill_value) Chris@87: iters = [itertools.chain(it, sentinel(), fillers) for it in seqarrays] Chris@87: # Should we flatten the items, or just use a nested approach Chris@87: if flatten: Chris@87: zipfunc = _izip_fields_flat Chris@87: else: Chris@87: zipfunc = _izip_fields Chris@87: # Chris@87: try: Chris@87: for tup in zip(*iters): Chris@87: yield tuple(zipfunc(tup)) Chris@87: except IndexError: Chris@87: pass Chris@87: Chris@87: Chris@87: def _fix_output(output, usemask=True, asrecarray=False): Chris@87: """ Chris@87: Private function: return a recarray, a ndarray, a MaskedArray Chris@87: or a MaskedRecords depending on the input parameters Chris@87: """ Chris@87: if not isinstance(output, MaskedArray): Chris@87: usemask = False Chris@87: if usemask: Chris@87: if asrecarray: Chris@87: output = output.view(MaskedRecords) Chris@87: else: Chris@87: output = ma.filled(output) Chris@87: if asrecarray: Chris@87: output = output.view(recarray) Chris@87: return output Chris@87: Chris@87: Chris@87: def _fix_defaults(output, defaults=None): Chris@87: """ Chris@87: Update the fill_value and masked data of `output` Chris@87: from the default given in a dictionary defaults. Chris@87: """ Chris@87: names = output.dtype.names Chris@87: (data, mask, fill_value) = (output.data, output.mask, output.fill_value) Chris@87: for (k, v) in (defaults or {}).items(): Chris@87: if k in names: Chris@87: fill_value[k] = v Chris@87: data[k][mask[k]] = v Chris@87: return output Chris@87: Chris@87: Chris@87: def merge_arrays(seqarrays, fill_value=-1, flatten=False, Chris@87: usemask=False, asrecarray=False): Chris@87: """ Chris@87: Merge arrays field by field. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: seqarrays : sequence of ndarrays Chris@87: Sequence of arrays Chris@87: fill_value : {float}, optional Chris@87: Filling value used to pad missing data on the shorter arrays. Chris@87: flatten : {False, True}, optional Chris@87: Whether to collapse nested fields. Chris@87: usemask : {False, True}, optional Chris@87: Whether to return a masked array or not. Chris@87: asrecarray : {False, True}, optional Chris@87: Whether to return a recarray (MaskedRecords) or not. Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.]))) Chris@87: masked_array(data = [(1, 10.0) (2, 20.0) (--, 30.0)], Chris@87: mask = [(False, False) (False, False) (True, False)], Chris@87: fill_value = (999999, 1e+20), Chris@87: dtype = [('f0', '>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])), Chris@87: ... usemask=False) Chris@87: array([(1, 10.0), (2, 20.0), (-1, 30.0)], Chris@87: dtype=[('f0', '>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]), Chris@87: ... np.array([10., 20., 30.])), Chris@87: ... usemask=False, asrecarray=True) Chris@87: rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)], Chris@87: dtype=[('a', '>> from numpy.lib import recfunctions as rfn Chris@87: >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))], Chris@87: ... dtype=[('a', int), ('b', [('ba', float), ('bb', int)])]) Chris@87: >>> rfn.drop_fields(a, 'a') Chris@87: array([((2.0, 3),), ((5.0, 6),)], Chris@87: dtype=[('b', [('ba', '>> rfn.drop_fields(a, 'ba') Chris@87: array([(1, (3,)), (4, (6,))], Chris@87: dtype=[('a', '>> rfn.drop_fields(a, ['ba', 'bb']) Chris@87: array([(1,), (4,)], Chris@87: dtype=[('a', '>> from numpy.lib import recfunctions as rfn Chris@87: >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))], Chris@87: ... dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])]) Chris@87: >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'}) Chris@87: array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))], Chris@87: dtype=[('A', ' 1: Chris@87: data = merge_arrays(data, flatten=True, usemask=usemask, Chris@87: fill_value=fill_value) Chris@87: else: Chris@87: data = data.pop() Chris@87: # Chris@87: output = ma.masked_all(max(len(base), len(data)), Chris@87: dtype=base.dtype.descr + data.dtype.descr) Chris@87: output = recursive_fill_fields(base, output) Chris@87: output = recursive_fill_fields(data, output) Chris@87: # Chris@87: return _fix_output(output, usemask=usemask, asrecarray=asrecarray) Chris@87: Chris@87: Chris@87: def rec_append_fields(base, names, data, dtypes=None): Chris@87: """ Chris@87: Add new fields to an existing array. Chris@87: Chris@87: The names of the fields are given with the `names` arguments, Chris@87: the corresponding values with the `data` arguments. Chris@87: If a single field is appended, `names`, `data` and `dtypes` do not have Chris@87: to be lists but just values. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: base : array Chris@87: Input array to extend. Chris@87: names : string, sequence Chris@87: String or sequence of strings corresponding to the names Chris@87: of the new fields. Chris@87: data : array or sequence of arrays Chris@87: Array or sequence of arrays storing the fields to add to the base. Chris@87: dtypes : sequence of datatypes, optional Chris@87: Datatype or sequence of datatypes. Chris@87: If None, the datatypes are estimated from the `data`. Chris@87: Chris@87: See Also Chris@87: -------- Chris@87: append_fields Chris@87: Chris@87: Returns Chris@87: ------- Chris@87: appended_array : np.recarray Chris@87: """ Chris@87: return append_fields(base, names, data=data, dtypes=dtypes, Chris@87: asrecarray=True, usemask=False) Chris@87: Chris@87: Chris@87: def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False, Chris@87: autoconvert=False): Chris@87: """ Chris@87: Superposes arrays fields by fields Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: seqarrays : array or sequence Chris@87: Sequence of input arrays. Chris@87: defaults : dictionary, optional Chris@87: Dictionary mapping field names to the corresponding default values. Chris@87: usemask : {True, False}, optional Chris@87: Whether to return a MaskedArray (or MaskedRecords is Chris@87: `asrecarray==True`) or a ndarray. Chris@87: asrecarray : {False, True}, optional Chris@87: Whether to return a recarray (or MaskedRecords if `usemask==True`) Chris@87: or just a flexible-type ndarray. Chris@87: autoconvert : {False, True}, optional Chris@87: Whether automatically cast the type of the field to the maximum. Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> x = np.array([1, 2,]) Chris@87: >>> rfn.stack_arrays(x) is x Chris@87: True Chris@87: >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)]) Chris@87: >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)], Chris@87: ... dtype=[('A', '|S3'), ('B', float), ('C', float)]) Chris@87: >>> test = rfn.stack_arrays((z,zz)) Chris@87: >>> test Chris@87: masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0) Chris@87: ('c', 30.0, 300.0)], Chris@87: mask = [(False, False, True) (False, False, True) (False, False, False) Chris@87: (False, False, False) (False, False, False)], Chris@87: fill_value = ('N/A', 1e+20, 1e+20), Chris@87: dtype = [('A', '|S3'), ('B', ' np.dtype(current_descr[-1]): Chris@87: current_descr = list(current_descr) Chris@87: current_descr[-1] = descr[1] Chris@87: newdescr[nameidx] = tuple(current_descr) Chris@87: elif descr[1] != current_descr[-1]: Chris@87: raise TypeError("Incompatible type '%s' <> '%s'" % Chris@87: (dict(newdescr)[name], descr[1])) Chris@87: # Only one field: use concatenate Chris@87: if len(newdescr) == 1: Chris@87: output = ma.concatenate(seqarrays) Chris@87: else: Chris@87: # Chris@87: output = ma.masked_all((np.sum(nrecords),), newdescr) Chris@87: offset = np.cumsum(np.r_[0, nrecords]) Chris@87: seen = [] Chris@87: for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]): Chris@87: names = a.dtype.names Chris@87: if names is None: Chris@87: output['f%i' % len(seen)][i:j] = a Chris@87: else: Chris@87: for name in n: Chris@87: output[name][i:j] = a[name] Chris@87: if name not in seen: Chris@87: seen.append(name) Chris@87: # Chris@87: return _fix_output(_fix_defaults(output, defaults), Chris@87: usemask=usemask, asrecarray=asrecarray) Chris@87: Chris@87: Chris@87: def find_duplicates(a, key=None, ignoremask=True, return_index=False): Chris@87: """ Chris@87: Find the duplicates in a structured array along a given key Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: a : array-like Chris@87: Input array Chris@87: key : {string, None}, optional Chris@87: Name of the fields along which to check the duplicates. Chris@87: If None, the search is performed by records Chris@87: ignoremask : {True, False}, optional Chris@87: Whether masked data should be discarded or considered as duplicates. Chris@87: return_index : {False, True}, optional Chris@87: Whether to return the indices of the duplicated values. Chris@87: Chris@87: Examples Chris@87: -------- Chris@87: >>> from numpy.lib import recfunctions as rfn Chris@87: >>> ndtype = [('a', int)] Chris@87: >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3], Chris@87: ... mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype) Chris@87: >>> rfn.find_duplicates(a, ignoremask=True, return_index=True) Chris@87: ... # XXX: judging by the output, the ignoremask flag has no effect Chris@87: """ Chris@87: a = np.asanyarray(a).ravel() Chris@87: # Get a dictionary of fields Chris@87: fields = get_fieldstructure(a.dtype) Chris@87: # Get the sorting data (by selecting the corresponding field) Chris@87: base = a Chris@87: if key: Chris@87: for f in fields[key]: Chris@87: base = base[f] Chris@87: base = base[key] Chris@87: # Get the sorting indices and the sorted data Chris@87: sortidx = base.argsort() Chris@87: sortedbase = base[sortidx] Chris@87: sorteddata = sortedbase.filled() Chris@87: # Compare the sorting data Chris@87: flag = (sorteddata[:-1] == sorteddata[1:]) Chris@87: # If masked data must be ignored, set the flag to false where needed Chris@87: if ignoremask: Chris@87: sortedmask = sortedbase.recordmask Chris@87: flag[sortedmask[1:]] = False Chris@87: flag = np.concatenate(([False], flag)) Chris@87: # We need to take the point on the left as well (else we're missing it) Chris@87: flag[:-1] = flag[:-1] + flag[1:] Chris@87: duplicates = a[sortidx][flag] Chris@87: if return_index: Chris@87: return (duplicates, sortidx[flag]) Chris@87: else: Chris@87: return duplicates Chris@87: Chris@87: Chris@87: def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2', Chris@87: defaults=None, usemask=True, asrecarray=False): Chris@87: """ Chris@87: Join arrays `r1` and `r2` on key `key`. Chris@87: Chris@87: The key should be either a string or a sequence of string corresponding Chris@87: to the fields used to join the array. An exception is raised if the Chris@87: `key` field cannot be found in the two input arrays. Neither `r1` nor Chris@87: `r2` should have any duplicates along `key`: the presence of duplicates Chris@87: will make the output quite unreliable. Note that duplicates are not Chris@87: looked for by the algorithm. Chris@87: Chris@87: Parameters Chris@87: ---------- Chris@87: key : {string, sequence} Chris@87: A string or a sequence of strings corresponding to the fields used Chris@87: for comparison. Chris@87: r1, r2 : arrays Chris@87: Structured arrays. Chris@87: jointype : {'inner', 'outer', 'leftouter'}, optional Chris@87: If 'inner', returns the elements common to both r1 and r2. Chris@87: If 'outer', returns the common elements as well as the elements of Chris@87: r1 not in r2 and the elements of not in r2. Chris@87: If 'leftouter', returns the common elements and the elements of r1 Chris@87: not in r2. Chris@87: r1postfix : string, optional Chris@87: String appended to the names of the fields of r1 that are present Chris@87: in r2 but absent of the key. Chris@87: r2postfix : string, optional Chris@87: String appended to the names of the fields of r2 that are present Chris@87: in r1 but absent of the key. Chris@87: defaults : {dictionary}, optional Chris@87: Dictionary mapping field names to the corresponding default values. Chris@87: usemask : {True, False}, optional Chris@87: Whether to return a MaskedArray (or MaskedRecords is Chris@87: `asrecarray==True`) or a ndarray. Chris@87: asrecarray : {False, True}, optional Chris@87: Whether to return a recarray (or MaskedRecords if `usemask==True`) Chris@87: or just a flexible-type ndarray. Chris@87: Chris@87: Notes Chris@87: ----- Chris@87: * The output is sorted along the key. Chris@87: * A temporary array is formed by dropping the fields not in the key for Chris@87: the two arrays and concatenating the result. This array is then Chris@87: sorted, and the common entries selected. The output is constructed by Chris@87: filling the fields with the selected entries. Matching is not Chris@87: preserved if there are some duplicates... Chris@87: Chris@87: """ Chris@87: # Check jointype Chris@87: if jointype not in ('inner', 'outer', 'leftouter'): Chris@87: raise ValueError( Chris@87: "The 'jointype' argument should be in 'inner', " Chris@87: "'outer' or 'leftouter' (got '%s' instead)" % jointype Chris@87: ) Chris@87: # If we have a single key, put it in a tuple Chris@87: if isinstance(key, basestring): Chris@87: key = (key,) Chris@87: Chris@87: # Check the keys Chris@87: for name in key: Chris@87: if name not in r1.dtype.names: Chris@87: raise ValueError('r1 does not have key field %s' % name) Chris@87: if name not in r2.dtype.names: Chris@87: raise ValueError('r2 does not have key field %s' % name) Chris@87: Chris@87: # Make sure we work with ravelled arrays Chris@87: r1 = r1.ravel() Chris@87: r2 = r2.ravel() Chris@87: # Fixme: nb2 below is never used. Commenting out for pyflakes. Chris@87: # (nb1, nb2) = (len(r1), len(r2)) Chris@87: nb1 = len(r1) Chris@87: (r1names, r2names) = (r1.dtype.names, r2.dtype.names) Chris@87: Chris@87: # Check the names for collision Chris@87: if (set.intersection(set(r1names), set(r2names)).difference(key) and Chris@87: not (r1postfix or r2postfix)): Chris@87: msg = "r1 and r2 contain common names, r1postfix and r2postfix " Chris@87: msg += "can't be empty" Chris@87: raise ValueError(msg) Chris@87: Chris@87: # Make temporary arrays of just the keys Chris@87: r1k = drop_fields(r1, [n for n in r1names if n not in key]) Chris@87: r2k = drop_fields(r2, [n for n in r2names if n not in key]) Chris@87: Chris@87: # Concatenate the two arrays for comparison Chris@87: aux = ma.concatenate((r1k, r2k)) Chris@87: idx_sort = aux.argsort(order=key) Chris@87: aux = aux[idx_sort] Chris@87: # Chris@87: # Get the common keys Chris@87: flag_in = ma.concatenate(([False], aux[1:] == aux[:-1])) Chris@87: flag_in[:-1] = flag_in[1:] + flag_in[:-1] Chris@87: idx_in = idx_sort[flag_in] Chris@87: idx_1 = idx_in[(idx_in < nb1)] Chris@87: idx_2 = idx_in[(idx_in >= nb1)] - nb1 Chris@87: (r1cmn, r2cmn) = (len(idx_1), len(idx_2)) Chris@87: if jointype == 'inner': Chris@87: (r1spc, r2spc) = (0, 0) Chris@87: elif jointype == 'outer': Chris@87: idx_out = idx_sort[~flag_in] Chris@87: idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)])) Chris@87: idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1)) Chris@87: (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn) Chris@87: elif jointype == 'leftouter': Chris@87: idx_out = idx_sort[~flag_in] Chris@87: idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)])) Chris@87: (r1spc, r2spc) = (len(idx_1) - r1cmn, 0) Chris@87: # Select the entries from each input Chris@87: (s1, s2) = (r1[idx_1], r2[idx_2]) Chris@87: # Chris@87: # Build the new description of the output array ....... Chris@87: # Start with the key fields Chris@87: ndtype = [list(_) for _ in r1k.dtype.descr] Chris@87: # Add the other fields Chris@87: ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key) Chris@87: # Find the new list of names (it may be different from r1names) Chris@87: names = list(_[0] for _ in ndtype) Chris@87: for desc in r2.dtype.descr: Chris@87: desc = list(desc) Chris@87: name = desc[0] Chris@87: # Have we seen the current name already ? Chris@87: if name in names: Chris@87: nameidx = ndtype.index(desc) Chris@87: current = ndtype[nameidx] Chris@87: # The current field is part of the key: take the largest dtype Chris@87: if name in key: Chris@87: current[-1] = max(desc[1], current[-1]) Chris@87: # The current field is not part of the key: add the suffixes Chris@87: else: Chris@87: current[0] += r1postfix Chris@87: desc[0] += r2postfix Chris@87: ndtype.insert(nameidx + 1, desc) Chris@87: #... we haven't: just add the description to the current list Chris@87: else: Chris@87: names.extend(desc[0]) Chris@87: ndtype.append(desc) Chris@87: # Revert the elements to tuples Chris@87: ndtype = [tuple(_) for _ in ndtype] Chris@87: # Find the largest nb of common fields : Chris@87: # r1cmn and r2cmn should be equal, but... Chris@87: cmn = max(r1cmn, r2cmn) Chris@87: # Construct an empty array Chris@87: output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype) Chris@87: names = output.dtype.names Chris@87: for f in r1names: Chris@87: selected = s1[f] Chris@87: if f not in names or (f in r2names and not r2postfix and f not in key): Chris@87: f += r1postfix Chris@87: current = output[f] Chris@87: current[:r1cmn] = selected[:r1cmn] Chris@87: if jointype in ('outer', 'leftouter'): Chris@87: current[cmn:cmn + r1spc] = selected[r1cmn:] Chris@87: for f in r2names: Chris@87: selected = s2[f] Chris@87: if f not in names or (f in r1names and not r1postfix and f not in key): Chris@87: f += r2postfix Chris@87: current = output[f] Chris@87: current[:r2cmn] = selected[:r2cmn] Chris@87: if (jointype == 'outer') and r2spc: Chris@87: current[-r2spc:] = selected[r2cmn:] Chris@87: # Sort and finalize the output Chris@87: output.sort(order=key) Chris@87: kwargs = dict(usemask=usemask, asrecarray=asrecarray) Chris@87: return _fix_output(_fix_defaults(output, defaults), **kwargs) Chris@87: Chris@87: Chris@87: def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2', Chris@87: defaults=None): Chris@87: """ Chris@87: Join arrays `r1` and `r2` on keys. Chris@87: Alternative to join_by, that always returns a np.recarray. Chris@87: Chris@87: See Also Chris@87: -------- Chris@87: join_by : equivalent function Chris@87: """ Chris@87: kwargs = dict(jointype=jointype, r1postfix=r1postfix, r2postfix=r2postfix, Chris@87: defaults=defaults, usemask=False, asrecarray=True) Chris@87: return join_by(key, r1, r2, **kwargs)