comparison DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/recfunctions.py @ 87:2a2c65a20a8b

Add Python libs and headers
author Chris Cannam
date Wed, 25 Feb 2015 14:05:22 +0000
parents
children
comparison
equal deleted inserted replaced
86:413a9d26189e 87:2a2c65a20a8b
1 """
2 Collection of utilities to manipulate structured arrays.
3
4 Most of these functions were initially implemented by John Hunter for
5 matplotlib. They have been rewritten and extended for convenience.
6
7 """
8 from __future__ import division, absolute_import, print_function
9
10 import sys
11 import itertools
12 import numpy as np
13 import numpy.ma as ma
14 from numpy import ndarray, recarray
15 from numpy.ma import MaskedArray
16 from numpy.ma.mrecords import MaskedRecords
17 from numpy.lib._iotools import _is_string_like
18 from numpy.compat import basestring
19
20 if sys.version_info[0] < 3:
21 from future_builtins import zip
22
23 _check_fill_value = np.ma.core._check_fill_value
24
25
26 __all__ = [
27 'append_fields', 'drop_fields', 'find_duplicates',
28 'get_fieldstructure', 'join_by', 'merge_arrays',
29 'rec_append_fields', 'rec_drop_fields', 'rec_join',
30 'recursive_fill_fields', 'rename_fields', 'stack_arrays',
31 ]
32
33
def recursive_fill_fields(input, output):
    """
    Copy the fields of `input` into the same-named fields of `output`.

    Nested (structured) fields are handled by recursing into them.
    Fields of `output` that cannot be read from `input` are left untouched.

    Parameters
    ----------
    input : ndarray
        Source array.
    output : ndarray
        Destination array, modified in place.

    Returns
    -------
    output : ndarray
        The very same object that was passed as `output`.

    Notes
    -----
    * `output` should be at least as long as `input`: only the first
      ``len(input)`` entries of each field are overwritten.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b)
    array([(1, 10.0), (2, 20.0), (0, 0.0)],
          dtype=[('A', '<i4'), ('B', '<f8')])

    """
    for fname in output.dtype.names:
        try:
            source = input[fname]
        except ValueError:
            # `input` has no such field: nothing to copy for this name.
            continue
        target = output[fname]
        if not source.dtype.names:
            target[:len(source)] = source
        else:
            recursive_fill_fields(source, target)
    return output
71
72
def get_names(adtype):
    """
    Returns the field names of the input datatype as a tuple.

    Nested fields are reported as ``(name, (subnames...))`` pairs.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Returns
    -------
    names : tuple or None
        The (possibly nested) field names, or None if `adtype` has no
        fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names(np.empty((1,), dtype=int).dtype) is None
    True
    >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))
    """
    names = adtype.names
    if names is None:
        # Unstructured dtype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        current = adtype[name]
        if current.names:
            listnames.append((name, tuple(get_names(current))))
        else:
            listnames.append(name)
    return tuple(listnames) or None
102
103
def get_names_flat(adtype):
    """
    Returns the field names of the input datatype as a tuple. Nested
    structures are flattened beforehand.

    The name of a nested field is listed first, immediately followed by
    the names of its sub-fields.

    Parameters
    ----------
    adtype : dtype
        Input datatype

    Returns
    -------
    names : tuple or None
        The flattened field names, or None if `adtype` has no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.get_names_flat(np.empty((1,), dtype=int).dtype) is None
    True
    >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
    ('A', 'B')
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')
    """
    names = adtype.names
    if names is None:
        # Unstructured dtype: there is nothing to iterate over.
        return None
    listnames = []
    for name in names:
        listnames.append(name)
        current = adtype[name]
        if current.names:
            listnames.extend(get_names_flat(current))
    return tuple(listnames) or None
133
134
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description.

    For a structured dtype, returns a tuple of ``(name, dtype)`` pairs for
    the leaf fields only (names of nested containers are dropped). For a
    dtype without fields, ``ndtype.descr`` is returned unchanged.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    if ndtype.names is None:
        return ndtype.descr
    flat = []
    for fname in ndtype.names:
        ftype, _offset = ndtype.fields[fname]
        if ftype.names:
            flat.extend(flatten_descr(ftype))
        else:
            flat.append((fname, ftype))
    return tuple(flat)
159
160
def zip_descr(seqarrays, flatten=False):
    """
    Combine the dtype description of a series of arrays.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays
    flatten : {boolean}, optional
        Whether to collapse nested descriptions.

    Returns
    -------
    descr : list
        The ``descr`` of the dtype built from the combined descriptions.
        When `flatten` is False, an array with more than one field
        contributes a single unnamed nested entry.
    """
    combined = []
    for arr in seqarrays:
        if flatten:
            combined.extend(flatten_descr(arr.dtype))
            continue
        adtype = arr.dtype
        if len(adtype.names or ()) > 1:
            # Keep multi-field arrays grouped as one nested field.
            combined.append(('', adtype.descr))
        else:
            combined.extend(adtype.descr)
    return np.dtype(combined).descr
185
186
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other fields.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).

    Returns
    -------
    parents : dict or None
        Maps every field name (at any nesting level) to the list of the
        names of the fields that enclose it, outermost first. None is
        returned when `adtype` has no fields.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype =  np.dtype([('A', int),
    ...                     ('B', [('BA', int),
    ...                            ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}

    """
    if parents is None:
        parents = {}
    # Every field at this level shares the same lineage: the lineage of the
    # enclosing field followed by the enclosing field itself.
    if lastname:
        lineage = list(parents.get(lastname, []) or []) + [lastname]
    else:
        lineage = []
    # `names` is None for an unstructured dtype: treat it as "no fields".
    for name in (adtype.names or ()):
        current = adtype[name]
        # Each entry gets its own list so callers can mutate one safely.
        parents[name] = list(lineage)
        if current.names:
            # The recursive call fills (and returns) the same dictionary.
            parents.update(get_fieldstructure(current, name, parents))
    return parents or None
232
233
234 def _izip_fields_flat(iterable):
235 """
236 Returns an iterator of concatenated fields from a sequence of arrays,
237 collapsing any nested structure.
238
239 """
240 for element in iterable:
241 if isinstance(element, np.void):
242 for f in _izip_fields_flat(tuple(element)):
243 yield f
244 else:
245 yield element
246
247
248 def _izip_fields(iterable):
249 """
250 Returns an iterator of concatenated fields from a sequence of arrays.
251
252 """
253 for element in iterable:
254 if (hasattr(element, '__iter__') and
255 not isinstance(element, basestring)):
256 for f in _izip_fields(element):
257 yield f
258 elif isinstance(element, np.void) and len(tuple(element)) == 1:
259 for f in _izip_fields(element):
260 yield f
261 else:
262 yield element
263
264
def izip_records(seqarrays, fill_value=None, flatten=True):
    """
    Returns an iterator of concatenated items from a sequence of arrays.

    Iteration continues until the longest input is exhausted; shorter
    inputs are padded with `fill_value`.

    Parameters
    ----------
    seqarrays : sequence of arrays
        Sequence of arrays.
    fill_value : {None, integer}
        Value used to pad shorter iterables.
    flatten : {True, False},
        Whether to collapse nested records into their individual fields
        (True) or to expand only iterable items and one-field records
        (False).
    """
    # Same trick as the Python 2.6 itertools.izip_longest recipe: each input
    # is followed by a one-shot sentinel that consumes one token. Once all
    # ``len(seqarrays) - 1`` tokens are gone, the next exhausted input makes
    # ``pop`` raise IndexError, which ends the iteration.
    tokens = [fill_value] * (len(seqarrays) - 1)

    def sentinel(counter=tokens.pop):
        "Yields the fill_value or raises IndexError"
        yield counter()

    fillers = itertools.repeat(fill_value)
    padded = [itertools.chain(seq, sentinel(), fillers) for seq in seqarrays]
    # Should we flatten the items, or just use a nested approach
    zipfunc = _izip_fields_flat if flatten else _izip_fields
    try:
        for row in zip(*padded):
            yield tuple(zipfunc(row))
    except IndexError:
        pass
296
297
298 def _fix_output(output, usemask=True, asrecarray=False):
299 """
300 Private function: return a recarray, a ndarray, a MaskedArray
301 or a MaskedRecords depending on the input parameters
302 """
303 if not isinstance(output, MaskedArray):
304 usemask = False
305 if usemask:
306 if asrecarray:
307 output = output.view(MaskedRecords)
308 else:
309 output = ma.filled(output)
310 if asrecarray:
311 output = output.view(recarray)
312 return output
313
314
315 def _fix_defaults(output, defaults=None):
316 """
317 Update the fill_value and masked data of `output`
318 from the default given in a dictionary defaults.
319 """
320 names = output.dtype.names
321 (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
322 for (k, v) in (defaults or {}).items():
323 if k in names:
324 fill_value[k] = v
325 data[k][mask[k]] = v
326 return output
327
328
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
    ...                  usemask=True)
    masked_array(data = [(1, 10.0) (2, 20.0) (--, 30.0)],
                 mask = [(False, False) (False, False) (True, False)],
           fill_value = (999999, 1e+20),
                dtype = [('f0', '<i4'), ('f1', '<f8')])

    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])),
    ...              usemask=False)
    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
          dtype=[('f0', '<i4'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]),
    ...               np.array([10., 20., 30.])),
    ...              usemask=False, asrecarray=True)
    rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)],
              dtype=[('a', '<i4'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
    * depending on what its corresponding type:
            -1      for integers
            -1.0    for floating point numbers
            '-'     for characters
            '-1'    for strings
            True    for boolean values
    * XXX: I just obtained these values empirically
    """
    # A one-item sequence is unwrapped and handled as a single array.
    if len(seqarrays) == 1:
        seqarrays = np.asanyarray(seqarrays[0])
    if isinstance(seqarrays, (ndarray, np.void)):
        # Single array as input.
        seqdtype = seqarrays.dtype
        if (not flatten) or \
                (zip_descr((seqarrays,), flatten=True) == seqdtype.descr):
            # Nothing to merge: a view with the requested type is enough.
            seqarrays = seqarrays.ravel()
            if not seqdtype.names:
                # Make sure the result has named fields.
                seqdtype = [('', seqdtype)]
            if usemask:
                seqtype = MaskedRecords if asrecarray else MaskedArray
            else:
                seqtype = recarray if asrecarray else ndarray
            return seqarrays.view(dtype=seqdtype, type=seqtype)
        seqarrays = (seqarrays,)
    else:
        # Make sure we have arrays in the input sequence
        seqarrays = [np.asanyarray(_m) for _m in seqarrays]
    # Sizes of the inputs; everything is padded up to the largest one.
    sizes = tuple(a.size for a in seqarrays)
    maxlength = max(sizes)
    # Dtype of the output (flattened if requested)
    newdtype = zip_descr(seqarrays, flatten=flatten)
    seqdata = []
    seqmask = []
    if usemask:
        # Masked output: track a padding mask alongside the padding value.
        for (arr, size) in zip(seqarrays, sizes):
            npad = maxlength - size
            data = arr.ravel().__array__()
            mask = ma.getmaskarray(arr).ravel()
            if npad:
                fval = _check_fill_value(fill_value, arr.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                        fmsk = True
                    else:
                        fval = np.array(fval, dtype=arr.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=mask.dtype)
            else:
                fval = None
                fmsk = True
            # Iterators padding the input up to the expected length
            seqdata.append(itertools.chain(data, [fval] * npad))
            seqmask.append(itertools.chain(mask, [fmsk] * npad))
        records = tuple(izip_records(seqdata, flatten=flatten))
        output = ma.array(np.fromiter(records, dtype=newdtype,
                                      count=maxlength),
                          mask=list(izip_records(seqmask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
    else:
        # Same as above, minus the mask bookkeeping.
        for (arr, size) in zip(seqarrays, sizes):
            npad = maxlength - size
            data = arr.ravel().__array__()
            if npad:
                fval = _check_fill_value(fill_value, arr.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                    else:
                        fval = np.array(fval, dtype=arr.dtype, ndmin=1)
            else:
                fval = None
            seqdata.append(itertools.chain(data, [fval] * npad))
        records = tuple(izip_records(seqdata, flatten=flatten))
        output = np.fromiter(records, dtype=newdtype, count=maxlength)
        if asrecarray:
            output = output.view(recarray)
    return output
464
465
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a new array with fields in `drop_names` dropped.

    Nested fields are supported: a nested field whose sub-fields are all
    dropped is removed as well.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Returns
    -------
    output : array or None
        The new array, or None when no field is left after dropping.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', int), ('b', [('ba', float), ('bb', int)])])
    >>> rfn.drop_fields(a, 'a')
    array([((2.0, 3),), ((5.0, 6),)],
          dtype=[('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.drop_fields(a, 'ba')
    array([(1, (3,)), (4, (6,))],
          dtype=[('a', '<i4'), ('b', [('bb', '<i4')])])
    >>> rfn.drop_fields(a, ['ba', 'bb'])
    array([(1,), (4,)],
          dtype=[('a', '<i4')])
    """
    drop_names = ([drop_names, ] if _is_string_like(drop_names)
                  else set(drop_names))

    def _drop_descr(ndtype, drop_names):
        # Build the description of `ndtype` without the dropped fields.
        kept = []
        for name in ndtype.names:
            if name in drop_names:
                continue
            current = ndtype[name]
            if not current.names:
                kept.append((name, current))
                continue
            descr = _drop_descr(current, drop_names)
            if descr:
                kept.append((name, descr))
        return kept

    newdtype = _drop_descr(base.dtype, drop_names)
    if not newdtype:
        return None

    output = recursive_fill_fields(base, np.empty(base.shape, dtype=newdtype))
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
528
529
def rec_drop_fields(base, drop_names):
    """
    Returns a new numpy.recarray with fields in `drop_names` dropped.

    Thin wrapper around `drop_fields` called with ``usemask=False`` and
    ``asrecarray=True``.
    """
    options = dict(usemask=False, asrecarray=True)
    return drop_fields(base, drop_names, **options)
535
536
def rename_fields(base, namemapper):
    """
    Rename the fields from a flexible-datatype ndarray or recarray.

    Nested fields are supported. The result is a view of `base` with the
    renamed dtype.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version.
        Names missing from the dictionary are kept as they are.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
    array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))],
          dtype=[('A', '<i4'), ('b', [('ba', '<f8'), ('BB', '<f8', 2)])])

    """
    def _renamed(ndtype):
        # Rebuild the description of `ndtype`, level by level.
        return [
            (namemapper.get(name, name),
             _renamed(ndtype[name]) if ndtype[name].names else ndtype[name])
            for name in ndtype.names
        ]

    return base.view(_renamed(base.dtype))
574
575
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Raises
    ------
    ValueError
        If `names` is a list/tuple whose length differs from `data`, or if
        `dtypes` holds more than one entry but not one per data array.

    """
    # Normalise `names` / `data` to sequences of matching length.
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, basestring):
        names = [names, ]
        data = [data, ]
    # Turn every new column into a one-field structured array.
    if dtypes is not None:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            if len(dtypes) != 1:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
            # A single dtype applies to every data array.
            dtypes = dtypes * len(data)
        data = [np.array(a, copy=False, subok=True, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    else:
        data = [np.array(a, copy=False, subok=True) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    #
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    # Fully masked container long enough for both, then fill it in.
    nrows = max(len(base), len(data))
    output = ma.masked_all(nrows,
                           dtype=base.dtype.descr + data.dtype.descr)
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    #
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
643
644
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Thin wrapper around `append_fields` called with ``asrecarray=True``
    and ``usemask=False``.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray
    """
    options = dict(data=data, dtypes=dtypes, asrecarray=True, usemask=False)
    return append_fields(base, names, **options)
677
678
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays. A single ndarray, or a sequence holding
        a single item, is returned as is.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Raises
    ------
    TypeError
        If a field present in several inputs has different types and
        `autoconvert` is False.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', float), ('C', float)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0)
     ('c', 30.0, 300.0)],
                 mask = [(False, False, True) (False, False, True) (False, False, False)
     (False, False, False) (False, False, False)],
           fill_value = ('N/A', 1e+20, 1e+20),
                dtype = [('A', '|S3'), ('B', '<f8'), ('C', '<f8')])

    """
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]
    # Start from the description of the first array and merge the others in.
    newdescr = ndtype[0].descr
    names = [_[0] for _ in newdescr]
    for dtype_n in ndtype[1:]:
        for descr in dtype_n.descr:
            name = descr[0] or ''
            if name not in names:
                newdescr.append(descr)
                names.append(name)
                continue
            nameidx = names.index(name)
            current_descr = newdescr[nameidx]
            # A descr entry is (name, type) or (name, type, shape): the type
            # is always at index 1, whereas the last item may be a shape.
            if autoconvert:
                if np.dtype(descr[1]) > np.dtype(current_descr[1]):
                    current_descr = list(current_descr)
                    current_descr[1] = descr[1]
                    newdescr[nameidx] = tuple(current_descr)
            elif descr[1] != current_descr[1]:
                raise TypeError("Incompatible type '%s' <> '%s'" %
                                (current_descr[1], descr[1]))
    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Fully masked container; each input fills its own slice of rows.
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                # Unstructured input: stored in the 'f<k>' field, where k is
                # the number of distinct named fields filled so far.
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)
    #
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
767
768
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array. The implementation calls ``filled`` and reads
        ``recordmask`` on the sorted data, so a masked array is expected.
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Returns
    -------
    duplicates : array
        The duplicated entries of `a`, in sorted order.
    index : array
        Indices of the duplicates in the ravelled input; only returned
        (as the second item of a tuple) when `return_index` is True.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    ... # XXX: judging by the output, the ignoremask flag has no effect
    """
    a = np.asanyarray(a).ravel()
    # Map of every field to the list of its enclosing fields
    parents = get_fieldstructure(a.dtype)
    # Walk down to the requested field to get the data to sort on
    target = a
    if key:
        for parent in parents[key]:
            target = target[parent]
        target = target[key]
    # Sort, so that duplicates end up next to each other
    order = target.argsort()
    ordered = target[order]
    values = ordered.filled()
    # An entry is flagged when it equals its left neighbour
    same = (values[:-1] == values[1:])
    if ignoremask:
        # Masked entries must not count as duplicates
        hidden = ordered.recordmask
        same[hidden[1:]] = False
    same = np.concatenate(([False], same))
    # We need to take the point on the left as well (else we're missing it)
    same[:-1] = same[:-1] + same[1:]
    duplicates = a[order][same]
    if not return_index:
        return duplicates
    return (duplicates, order[same])
821
822
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Raises
    ------
    ValueError
        If `jointype` is not one of the supported values, if a key field is
        missing from `r1` or `r2`, or if the inputs share non-key field
        names while both postfixes are empty.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
            "The 'jointype' argument should be in 'inner', "
            "'outer' or 'leftouter' (got '%s' instead)" % jointype
        )
    # If we have a single key, put it in a tuple
    if isinstance(key, basestring):
        key = (key,)

    # Check the keys
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %s' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %s' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    if (set.intersection(set(r1names), set(r2names)).difference(key) and
            not (r1postfix or r2postfix)):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    r1k = drop_fields(r1, [n for n in r1names if n not in key])
    r2k = drop_fields(r2, [n for n in r2names if n not in key])

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: an entry is common when it equals a neighbour
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    # Indices below nb1 refer to r1, the others (shifted by nb1) to r2
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = [list(_) for _ in r1k.dtype.descr]
    # Add the other fields
    ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key)
    # Add the fields from r2
    for desc in r2.dtype.descr:
        desc = list(desc)
        name = desc[0]
        # The list of names is rebuilt on every pass because postfixing and
        # inserting entries below changes both its content and its order.
        names = [_[0] for _ in ndtype]
        try:
            # Look the field up by name, so that a clash is detected even
            # when the two inputs disagree on the field's type.
            nameidx = names.index(name)
        except ValueError:
            # ... we haven't seen it: just add the description
            ndtype.append(desc)
            continue
        current = ndtype[nameidx]
        if name in key:
            # The current field is part of the key: take the largest dtype
            # (compared as dtypes, not as type strings)
            if np.dtype(desc[1]) > np.dtype(current[1]):
                current[1] = desc[1]
        else:
            # The current field is not part of the key: add the suffixes
            # and keep the r2 version right after the r1 one
            current[0] += r1postfix
            desc[0] += r2postfix
            ndtype.insert(nameidx + 1, desc)
    # Revert the elements to tuples
    ndtype = [tuple(_) for _ in ndtype]
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
989
990
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.
    Alternative to join_by, that always returns a np.recarray.

    Thin wrapper around `join_by` called with ``usemask=False`` and
    ``asrecarray=True``.

    See Also
    --------
    join_by : equivalent function
    """
    return join_by(key, r1, r2,
                   jointype=jointype,
                   r1postfix=r1postfix,
                   r2postfix=r2postfix,
                   defaults=defaults,
                   usemask=False,
                   asrecarray=True)