vamp-build-and-test: comparison of DEPENDENCIES/mingw32/Python27/Lib/site-packages/numpy/lib/recfunctions.py @ 87:2a2c65a20a8b

Add Python libs and headers

author | Chris Cannam |
---|---|
date | Wed, 25 Feb 2015 14:05:22 +0000 |
comparison: 86:413a9d26189e | 87:2a2c65a20a8b
1 """ | |
2 Collection of utilities to manipulate structured arrays. | |
3 | |
4 Most of these functions were initially implemented by John Hunter for | |
5 matplotlib. They have been rewritten and extended for convenience. | |
6 | |
7 """ | |
8 from __future__ import division, absolute_import, print_function | |
9 | |
10 import sys | |
11 import itertools | |
12 import numpy as np | |
13 import numpy.ma as ma | |
14 from numpy import ndarray, recarray | |
15 from numpy.ma import MaskedArray | |
16 from numpy.ma.mrecords import MaskedRecords | |
17 from numpy.lib._iotools import _is_string_like | |
18 from numpy.compat import basestring | |
19 | |
20 if sys.version_info[0] < 3: | |
21 from future_builtins import zip | |
22 | |
23 _check_fill_value = np.ma.core._check_fill_value | |
24 | |
25 | |
26 __all__ = [ | |
27 'append_fields', 'drop_fields', 'find_duplicates', | |
28 'get_fieldstructure', 'join_by', 'merge_arrays', | |
29 'rec_append_fields', 'rec_drop_fields', 'rec_join', | |
30 'recursive_fill_fields', 'rename_fields', 'stack_arrays', | |
31 ] | |
32 | |
33 | |
34 def recursive_fill_fields(input, output): | |
35 """ | |
36 Fills fields from output with fields from input, | |
37 with support for nested structures. | |
38 | |
39 Parameters | |
40 ---------- | |
41 input : ndarray | |
42 Input array. | |
43 output : ndarray | |
44 Output array. | |
45 | |
46 Notes | |
47 ----- | |
48 * `output` should be at least the same size as `input` | |
49 | |
50 Examples | |
51 -------- | |
52 >>> from numpy.lib import recfunctions as rfn | |
53 >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', int), ('B', float)]) | |
54 >>> b = np.zeros((3,), dtype=a.dtype) | |
55 >>> rfn.recursive_fill_fields(a, b) | |
56 array([(1, 10.0), (2, 20.0), (0, 0.0)], | |
57 dtype=[('A', '<i4'), ('B', '<f8')]) | |
58 | |
59 """ | |
60 newdtype = output.dtype | |
61 for field in newdtype.names: | |
62 try: | |
63 current = input[field] | |
64 except ValueError: | |
65 continue | |
66 if current.dtype.names: | |
67 recursive_fill_fields(current, output[field]) | |
68 else: | |
69 output[field][:len(current)] = current | |
70 return output | |
71 | |
72 | |
73 def get_names(adtype): | |
74 """ | |
75 Returns the field names of the input datatype as a tuple. | |
76 | |
77 Parameters | |
78 ---------- | |
79 adtype : dtype | |
80 Input datatype | |
81 | |
82 Examples | |
83 -------- | |
84 >>> from numpy.lib import recfunctions as rfn | |
85 >>> rfn.get_names(np.empty((1,), dtype=int)) is None | |
86 True | |
87 >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)])) | |
88 ('A', 'B') | |
89 >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])]) | |
90 >>> rfn.get_names(adtype) | |
91 ('a', ('b', ('ba', 'bb'))) | |
92 """ | |
93 listnames = [] | |
94 names = adtype.names | |
95 for name in names: | |
96 current = adtype[name] | |
97 if current.names: | |
98 listnames.append((name, tuple(get_names(current)))) | |
99 else: | |
100 listnames.append(name) | |
101 return tuple(listnames) or None | |
102 | |
103 | |
104 def get_names_flat(adtype): | |
105 """ | |
106 Returns the field names of the input datatype as a tuple. Nested structures | |
107 are flattened beforehand. | |
108 | |
109 Parameters | |
110 ---------- | |
111 adtype : dtype | |
112 Input datatype | |
113 | |
114 Examples | |
115 -------- | |
116 >>> from numpy.lib import recfunctions as rfn | |
117 >>> rfn.get_names_flat(np.empty((1,), dtype=int)) is None | |
118 True | |
119 >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', float)])) | |
120 ('A', 'B') | |
121 >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])]) | |
122 >>> rfn.get_names_flat(adtype) | |
123 ('a', 'b', 'ba', 'bb') | |
124 """ | |
125 listnames = [] | |
126 names = adtype.names | |
127 for name in names: | |
128 listnames.append(name) | |
129 current = adtype[name] | |
130 if current.names: | |
131 listnames.extend(get_names_flat(current)) | |
132 return tuple(listnames) or None | |
133 | |
134 | |
135 def flatten_descr(ndtype): | |
136 """ | |
137 Flatten a structured data-type description. | |
138 | |
139 Examples | |
140 -------- | |
141 >>> from numpy.lib import recfunctions as rfn | |
142 >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])]) | |
143 >>> rfn.flatten_descr(ndtype) | |
144 (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32'))) | |
145 | |
146 """ | |
147 names = ndtype.names | |
148 if names is None: | |
149 return ndtype.descr | |
150 else: | |
151 descr = [] | |
152 for field in names: | |
153 (typ, _) = ndtype.fields[field] | |
154 if typ.names: | |
155 descr.extend(flatten_descr(typ)) | |
156 else: | |
157 descr.append((field, typ)) | |
158 return tuple(descr) | |
159 | |
160 | |
161 def zip_descr(seqarrays, flatten=False): | |
162 """ | |
163 Combine the dtype description of a series of arrays. | |
164 | |
165 Parameters | |
166 ---------- | |
167 seqarrays : sequence of arrays | |
168 Sequence of arrays | |
169 flatten : {boolean}, optional | |
170 Whether to collapse nested descriptions. | |
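
Examples
--------
A minimal sketch (the arrays below are arbitrary illustrations; the
integer width in the result depends on the platform):

>>> from numpy.lib import recfunctions as rfn
>>> w = np.zeros(3, dtype=[('a', int)])
>>> x = np.zeros(3, dtype=float)
>>> rfn.zip_descr((w, x), flatten=True)
[('a', '<i4'), ('f1', '<f8')]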
171 """ | |
172 newdtype = [] | |
173 if flatten: | |
174 for a in seqarrays: | |
175 newdtype.extend(flatten_descr(a.dtype)) | |
176 else: | |
177 for a in seqarrays: | |
178 current = a.dtype | |
179 names = current.names or () | |
180 if len(names) > 1: | |
181 newdtype.append(('', current.descr)) | |
182 else: | |
183 newdtype.extend(current.descr) | |
184 return np.dtype(newdtype).descr | |
185 | |
186 | |
187 def get_fieldstructure(adtype, lastname=None, parents=None,): | |
188 """ | |
189 Returns a dictionary with fields indexing lists of their parent fields. | |
190 | |
191 This function is used to simplify access to fields nested in other fields. | |
192 | |
193 Parameters | |
194 ---------- | |
195 adtype : np.dtype | |
196 Input datatype | |
197 lastname : optional | |
198 Last processed field name (used internally during recursion). | |
199 parents : dictionary | |
200 Dictionary of parent fields (used internally during recursion). | |
201 | |
202 Examples | |
203 -------- | |
204 >>> from numpy.lib import recfunctions as rfn | |
205 >>> ndtype = np.dtype([('A', int), | |
206 ... ('B', [('BA', int), | |
207 ... ('BB', [('BBA', int), ('BBB', int)])])]) | |
208 >>> rfn.get_fieldstructure(ndtype) | |
209 ... # XXX: possible regression, order of BBA and BBB is swapped | |
210 {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']} | |
211 | |
212 """ | |
213 if parents is None: | |
214 parents = {} | |
215 names = adtype.names | |
216 for name in names: | |
217 current = adtype[name] | |
218 if current.names: | |
219 if lastname: | |
220 parents[name] = [lastname, ] | |
221 else: | |
222 parents[name] = [] | |
223 parents.update(get_fieldstructure(current, name, parents)) | |
224 else: | |
225 lastparent = [_ for _ in (parents.get(lastname, []) or [])] | |
226 if lastparent: | |
227 lastparent.append(lastname) | |
228 elif lastname: | |
229 lastparent = [lastname, ] | |
230 parents[name] = lastparent or [] | |
231 return parents or None | |
232 | |
233 | |
234 def _izip_fields_flat(iterable): | |
235 """ | |
236 Returns an iterator of concatenated fields from a sequence of arrays, | |
237 collapsing any nested structure. | |
238 | |
239 """ | |
240 for element in iterable: | |
241 if isinstance(element, np.void): | |
242 for f in _izip_fields_flat(tuple(element)): | |
243 yield f | |
244 else: | |
245 yield element | |
246 | |
247 | |
248 def _izip_fields(iterable): | |
249 """ | |
250 Returns an iterator of concatenated fields from a sequence of arrays. | |
251 | |
252 """ | |
253 for element in iterable: | |
254 if (hasattr(element, '__iter__') and | |
255 not isinstance(element, basestring)): | |
256 for f in _izip_fields(element): | |
257 yield f | |
258 elif isinstance(element, np.void) and len(tuple(element)) == 1: | |
259 for f in _izip_fields(element): | |
260 yield f | |
261 else: | |
262 yield element | |
263 | |
264 | |
265 def izip_records(seqarrays, fill_value=None, flatten=True): | |
266 """ | |
267 Returns an iterator of concatenated items from a sequence of arrays. | |
268 | |
269 Parameters | |
270 ---------- | |
271 seqarrays : sequence of arrays | |
272 Sequence of arrays. | |
273 fill_value : {None, integer} | |
274 Value used to pad shorter iterables. | |
275 flatten : {True, False}, optional | |
276 Whether to flatten the fields of each item, collapsing any nested structure. | |
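
Examples
--------
A minimal sketch (the input arrays and fill value are arbitrary
illustrations; items are yielded as numpy scalars):

>>> from numpy.lib import recfunctions as rfn
>>> list(rfn.izip_records([np.array([1, 2]), np.array([10., 20., 30.])],
...                       fill_value=-1))
[(1, 10.0), (2, 20.0), (-1, 30.0)]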
277 """ | |
278 # OK, that's a complete ripoff from Python2.6 itertools.izip_longest | |
279 def sentinel(counter=([fill_value] * (len(seqarrays) - 1)).pop): | |
280 "Yields the fill_value or raises IndexError" | |
281 yield counter() | |
282 # | |
283 fillers = itertools.repeat(fill_value) | |
284 iters = [itertools.chain(it, sentinel(), fillers) for it in seqarrays] | |
285 # Should we flatten the items, or just use a nested approach | |
286 if flatten: | |
287 zipfunc = _izip_fields_flat | |
288 else: | |
289 zipfunc = _izip_fields | |
290 # | |
291 try: | |
292 for tup in zip(*iters): | |
293 yield tuple(zipfunc(tup)) | |
294 except IndexError: | |
295 pass | |
296 | |
297 | |
298 def _fix_output(output, usemask=True, asrecarray=False): | |
299 """ | |
300 Private function: return a recarray, a ndarray, a MaskedArray | |
301 or a MaskedRecords depending on the input parameters | |
302 """ | |
303 if not isinstance(output, MaskedArray): | |
304 usemask = False | |
305 if usemask: | |
306 if asrecarray: | |
307 output = output.view(MaskedRecords) | |
308 else: | |
309 output = ma.filled(output) | |
310 if asrecarray: | |
311 output = output.view(recarray) | |
312 return output | |
313 | |
314 | |
315 def _fix_defaults(output, defaults=None): | |
316 """ | |
317 Update the fill_value and masked data of `output` | |
318 from the default given in a dictionary defaults. | |
319 """ | |
320 names = output.dtype.names | |
321 (data, mask, fill_value) = (output.data, output.mask, output.fill_value) | |
322 for (k, v) in (defaults or {}).items(): | |
323 if k in names: | |
324 fill_value[k] = v | |
325 data[k][mask[k]] = v | |
326 return output | |
327 | |
328 | |
329 def merge_arrays(seqarrays, fill_value=-1, flatten=False, | |
330 usemask=False, asrecarray=False): | |
331 """ | |
332 Merge arrays field by field. | |
333 | |
334 Parameters | |
335 ---------- | |
336 seqarrays : sequence of ndarrays | |
337 Sequence of arrays | |
338 fill_value : {float}, optional | |
339 Filling value used to pad missing data on the shorter arrays. | |
340 flatten : {False, True}, optional | |
341 Whether to collapse nested fields. | |
342 usemask : {False, True}, optional | |
343 Whether to return a masked array or not. | |
344 asrecarray : {False, True}, optional | |
345 Whether to return a recarray (MaskedRecords) or not. | |
346 | |
347 Examples | |
348 -------- | |
349 >>> from numpy.lib import recfunctions as rfn | |
350 >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.]))) | |
351 masked_array(data = [(1, 10.0) (2, 20.0) (--, 30.0)], | |
352 mask = [(False, False) (False, False) (True, False)], | |
353 fill_value = (999999, 1e+20), | |
354 dtype = [('f0', '<i4'), ('f1', '<f8')]) | |
355 | |
356 >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])), | |
357 ... usemask=False) | |
358 array([(1, 10.0), (2, 20.0), (-1, 30.0)], | |
359 dtype=[('f0', '<i4'), ('f1', '<f8')]) | |
360 >>> rfn.merge_arrays((np.array([1, 2]).view([('a', int)]), | |
361 ... np.array([10., 20., 30.])), | |
362 ... usemask=False, asrecarray=True) | |
363 rec.array([(1, 10.0), (2, 20.0), (-1, 30.0)], | |
364 dtype=[('a', '<i4'), ('f1', '<f8')]) | |
365 | |
366 Notes | |
367 ----- | |
368 * Without a mask, the missing value will be filled with something | |
369 depending on its corresponding type: | |
370 -1 for integers | |
371 -1.0 for floating point numbers | |
372 '-' for characters | |
373 '-1' for strings | |
374 True for boolean values | |
375 * XXX: I just obtained these values empirically | |
376 """ | |
377 # Only one item in the input sequence ? | |
378 if (len(seqarrays) == 1): | |
379 seqarrays = np.asanyarray(seqarrays[0]) | |
380 # Do we have a single ndarray as input ? | |
381 if isinstance(seqarrays, (ndarray, np.void)): | |
382 seqdtype = seqarrays.dtype | |
383 if (not flatten) or \ | |
384 (zip_descr((seqarrays,), flatten=True) == seqdtype.descr): | |
385 # Minimal processing needed: just make sure everything's a-ok | |
386 seqarrays = seqarrays.ravel() | |
387 # Make sure we have named fields | |
388 if not seqdtype.names: | |
389 seqdtype = [('', seqdtype)] | |
390 # Find what type of array we must return | |
391 if usemask: | |
392 if asrecarray: | |
393 seqtype = MaskedRecords | |
394 else: | |
395 seqtype = MaskedArray | |
396 elif asrecarray: | |
397 seqtype = recarray | |
398 else: | |
399 seqtype = ndarray | |
400 return seqarrays.view(dtype=seqdtype, type=seqtype) | |
401 else: | |
402 seqarrays = (seqarrays,) | |
403 else: | |
404 # Make sure we have arrays in the input sequence | |
405 seqarrays = [np.asanyarray(_m) for _m in seqarrays] | |
406 # Find the sizes of the inputs and their maximum | |
407 sizes = tuple(a.size for a in seqarrays) | |
408 maxlength = max(sizes) | |
409 # Get the dtype of the output (flattening if needed) | |
410 newdtype = zip_descr(seqarrays, flatten=flatten) | |
411 # Initialize the sequences for data and mask | |
412 seqdata = [] | |
413 seqmask = [] | |
414 # If we expect some kind of MaskedArray, make a special loop. | |
415 if usemask: | |
416 for (a, n) in zip(seqarrays, sizes): | |
417 nbmissing = (maxlength - n) | |
418 # Get the data and mask | |
419 data = a.ravel().__array__() | |
420 mask = ma.getmaskarray(a).ravel() | |
421 # Get the filling value (if needed) | |
422 if nbmissing: | |
423 fval = _check_fill_value(fill_value, a.dtype) | |
424 if isinstance(fval, (ndarray, np.void)): | |
425 if len(fval.dtype) == 1: | |
426 fval = fval.item()[0] | |
427 fmsk = True | |
428 else: | |
429 fval = np.array(fval, dtype=a.dtype, ndmin=1) | |
430 fmsk = np.ones((1,), dtype=mask.dtype) | |
431 else: | |
432 fval = None | |
433 fmsk = True | |
434 # Store an iterator padding the input to the expected length | |
435 seqdata.append(itertools.chain(data, [fval] * nbmissing)) | |
436 seqmask.append(itertools.chain(mask, [fmsk] * nbmissing)) | |
437 # Create an iterator for the data | |
438 data = tuple(izip_records(seqdata, flatten=flatten)) | |
439 output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength), | |
440 mask=list(izip_records(seqmask, flatten=flatten))) | |
441 if asrecarray: | |
442 output = output.view(MaskedRecords) | |
443 else: | |
444 # Same as before, without the mask we don't need... | |
445 for (a, n) in zip(seqarrays, sizes): | |
446 nbmissing = (maxlength - n) | |
447 data = a.ravel().__array__() | |
448 if nbmissing: | |
449 fval = _check_fill_value(fill_value, a.dtype) | |
450 if isinstance(fval, (ndarray, np.void)): | |
451 if len(fval.dtype) == 1: | |
452 fval = fval.item()[0] | |
453 else: | |
454 fval = np.array(fval, dtype=a.dtype, ndmin=1) | |
455 else: | |
456 fval = None | |
457 seqdata.append(itertools.chain(data, [fval] * nbmissing)) | |
458 output = np.fromiter(tuple(izip_records(seqdata, flatten=flatten)), | |
459 dtype=newdtype, count=maxlength) | |
460 if asrecarray: | |
461 output = output.view(recarray) | |
462 # And we're done... | |
463 return output | |
464 | |
465 | |
466 def drop_fields(base, drop_names, usemask=True, asrecarray=False): | |
467 """ | |
468 Return a new array with fields in `drop_names` dropped. | |
469 | |
470 Nested fields are supported. | |
471 | |
472 Parameters | |
473 ---------- | |
474 base : array | |
475 Input array | |
476 drop_names : string or sequence | |
477 String or sequence of strings corresponding to the names of the | |
478 fields to drop. | |
479 usemask : {False, True}, optional | |
480 Whether to return a masked array or not. | |
481 asrecarray : {False, True}, optional | |
482 Whether to return a recarray or a mrecarray (`asrecarray=True`) or | |
483 a plain ndarray or masked array with flexible dtype. The default | |
484 is False. | |
485 | |
486 Examples | |
487 -------- | |
488 >>> from numpy.lib import recfunctions as rfn | |
489 >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))], | |
490 ... dtype=[('a', int), ('b', [('ba', float), ('bb', int)])]) | |
491 >>> rfn.drop_fields(a, 'a') | |
492 array([((2.0, 3),), ((5.0, 6),)], | |
493 dtype=[('b', [('ba', '<f8'), ('bb', '<i4')])]) | |
494 >>> rfn.drop_fields(a, 'ba') | |
495 array([(1, (3,)), (4, (6,))], | |
496 dtype=[('a', '<i4'), ('b', [('bb', '<i4')])]) | |
497 >>> rfn.drop_fields(a, ['ba', 'bb']) | |
498 array([(1,), (4,)], | |
499 dtype=[('a', '<i4')]) | |
500 """ | |
501 if _is_string_like(drop_names): | |
502 drop_names = [drop_names, ] | |
503 else: | |
504 drop_names = set(drop_names) | |
505 | |
506 def _drop_descr(ndtype, drop_names): | |
507 names = ndtype.names | |
508 newdtype = [] | |
509 for name in names: | |
510 current = ndtype[name] | |
511 if name in drop_names: | |
512 continue | |
513 if current.names: | |
514 descr = _drop_descr(current, drop_names) | |
515 if descr: | |
516 newdtype.append((name, descr)) | |
517 else: | |
518 newdtype.append((name, current)) | |
519 return newdtype | |
520 | |
521 newdtype = _drop_descr(base.dtype, drop_names) | |
522 if not newdtype: | |
523 return None | |
524 | |
525 output = np.empty(base.shape, dtype=newdtype) | |
526 output = recursive_fill_fields(base, output) | |
527 return _fix_output(output, usemask=usemask, asrecarray=asrecarray) | |
528 | |
529 | |
530 def rec_drop_fields(base, drop_names): | |
531 """ | |
532 Returns a new numpy.recarray with fields in `drop_names` dropped. | |
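
Examples
--------
A minimal sketch (array contents are arbitrary illustrations; the integer
width in the dtype depends on the platform):

>>> from numpy.lib import recfunctions as rfn
>>> a = np.array([(1, 2.0)], dtype=[('a', int), ('b', float)])
>>> rfn.rec_drop_fields(a, 'b')
rec.array([(1,)],
dtype=[('a', '<i4')])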
533 """ | |
534 return drop_fields(base, drop_names, usemask=False, asrecarray=True) | |
535 | |
536 | |
537 def rename_fields(base, namemapper): | |
538 """ | |
539 Rename the fields from a flexible-datatype ndarray or recarray. | |
540 | |
541 Nested fields are supported. | |
542 | |
543 Parameters | |
544 ---------- | |
545 base : ndarray | |
546 Input array whose fields must be modified. | |
547 namemapper : dictionary | |
548 Dictionary mapping old field names to their new version. | |
549 | |
550 Examples | |
551 -------- | |
552 >>> from numpy.lib import recfunctions as rfn | |
553 >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))], | |
554 ... dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])]) | |
555 >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'}) | |
556 array([(1, (2.0, [3.0, 30.0])), (4, (5.0, [6.0, 60.0]))], | |
557 dtype=[('A', '<i4'), ('b', [('ba', '<f8'), ('BB', '<f8', 2)])]) | |
558 | |
559 """ | |
560 def _recursive_rename_fields(ndtype, namemapper): | |
561 newdtype = [] | |
562 for name in ndtype.names: | |
563 newname = namemapper.get(name, name) | |
564 current = ndtype[name] | |
565 if current.names: | |
566 newdtype.append( | |
567 (newname, _recursive_rename_fields(current, namemapper)) | |
568 ) | |
569 else: | |
570 newdtype.append((newname, current)) | |
571 return newdtype | |
572 newdtype = _recursive_rename_fields(base.dtype, namemapper) | |
573 return base.view(newdtype) | |
574 | |
575 | |
576 def append_fields(base, names, data, dtypes=None, | |
577 fill_value=-1, usemask=True, asrecarray=False): | |
578 """ | |
579 Add new fields to an existing array. | |
580 | |
581 The names of the fields are given with the `names` arguments, | |
582 the corresponding values with the `data` arguments. | |
583 If a single field is appended, `names`, `data` and `dtypes` do not have | |
584 to be lists but just values. | |
585 | |
586 Parameters | |
587 ---------- | |
588 base : array | |
589 Input array to extend. | |
590 names : string, sequence | |
591 String or sequence of strings corresponding to the names | |
592 of the new fields. | |
593 data : array or sequence of arrays | |
594 Array or sequence of arrays storing the fields to add to the base. | |
595 dtypes : sequence of datatypes, optional | |
596 Datatype or sequence of datatypes. | |
597 If None, the datatypes are estimated from the `data`. | |
598 fill_value : {float}, optional | |
599 Filling value used to pad missing data on the shorter arrays. | |
600 usemask : {False, True}, optional | |
601 Whether to return a masked array or not. | |
602 asrecarray : {False, True}, optional | |
603 Whether to return a recarray (MaskedRecords) or not. | |
604 | |
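Examples
--------
A minimal sketch (the field name 'b' and its values are arbitrary
illustrations; the integer width in the output dtype depends on the
platform):

>>> from numpy.lib import recfunctions as rfn
>>> a = np.array([(1,), (2,), (3,)], dtype=[('a', int)])
>>> rfn.append_fields(a, 'b', [10., 20., 30.], usemask=False)
array([(1, 10.0), (2, 20.0), (3, 30.0)],
dtype=[('a', '<i4'), ('b', '<f8')])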
605 """ | |
606 # Check the names | |
607 if isinstance(names, (tuple, list)): | |
608 if len(names) != len(data): | |
609 msg = "The number of arrays does not match the number of names" | |
610 raise ValueError(msg) | |
611 elif isinstance(names, basestring): | |
612 names = [names, ] | |
613 data = [data, ] | |
614 # | |
615 if dtypes is None: | |
616 data = [np.array(a, copy=False, subok=True) for a in data] | |
617 data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)] | |
618 else: | |
619 if not isinstance(dtypes, (tuple, list)): | |
620 dtypes = [dtypes, ] | |
621 if len(data) != len(dtypes): | |
622 if len(dtypes) == 1: | |
623 dtypes = dtypes * len(data) | |
624 else: | |
625 msg = "The dtypes argument must be None, a dtype, or a list." | |
626 raise ValueError(msg) | |
627 data = [np.array(a, copy=False, subok=True, dtype=d).view([(n, d)]) | |
628 for (a, n, d) in zip(data, names, dtypes)] | |
629 # | |
630 base = merge_arrays(base, usemask=usemask, fill_value=fill_value) | |
631 if len(data) > 1: | |
632 data = merge_arrays(data, flatten=True, usemask=usemask, | |
633 fill_value=fill_value) | |
634 else: | |
635 data = data.pop() | |
636 # | |
637 output = ma.masked_all(max(len(base), len(data)), | |
638 dtype=base.dtype.descr + data.dtype.descr) | |
639 output = recursive_fill_fields(base, output) | |
640 output = recursive_fill_fields(data, output) | |
641 # | |
642 return _fix_output(output, usemask=usemask, asrecarray=asrecarray) | |
643 | |
644 | |
645 def rec_append_fields(base, names, data, dtypes=None): | |
646 """ | |
647 Add new fields to an existing array. | |
648 | |
649 The names of the fields are given with the `names` arguments, | |
650 the corresponding values with the `data` arguments. | |
651 If a single field is appended, `names`, `data` and `dtypes` do not have | |
652 to be lists but just values. | |
653 | |
654 Parameters | |
655 ---------- | |
656 base : array | |
657 Input array to extend. | |
658 names : string, sequence | |
659 String or sequence of strings corresponding to the names | |
660 of the new fields. | |
661 data : array or sequence of arrays | |
662 Array or sequence of arrays storing the fields to add to the base. | |
663 dtypes : sequence of datatypes, optional | |
664 Datatype or sequence of datatypes. | |
665 If None, the datatypes are estimated from the `data`. | |
666 | |
667 See Also | |
668 -------- | |
669 append_fields | |
670 | |
671 Returns | |
672 ------- | |
673 appended_array : np.recarray | |
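
Examples
--------
A minimal sketch (the appended field and its values are arbitrary
illustrations; the integer width in the dtype depends on the platform):

>>> from numpy.lib import recfunctions as rfn
>>> a = np.array([(1,), (2,)], dtype=[('a', int)])
>>> rfn.rec_append_fields(a, 'b', [1.0, 2.0])
rec.array([(1, 1.0), (2, 2.0)],
dtype=[('a', '<i4'), ('b', '<f8')])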
674 """ | |
675 return append_fields(base, names, data=data, dtypes=dtypes, | |
676 asrecarray=True, usemask=False) | |
677 | |
678 | |
679 def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False, | |
680 autoconvert=False): | |
681 """ | |
682 Superposes arrays field by field. | |
683 | |
684 Parameters | |
685 ---------- | |
686 arrays : array or sequence | |
687 Sequence of input arrays. | |
688 defaults : dictionary, optional | |
689 Dictionary mapping field names to the corresponding default values. | |
690 usemask : {True, False}, optional | |
691 Whether to return a MaskedArray (or MaskedRecords if | |
692 `asrecarray==True`) or an ndarray. | |
693 asrecarray : {False, True}, optional | |
694 Whether to return a recarray (or MaskedRecords if `usemask==True`) | |
695 or just a flexible-type ndarray. | |
696 autoconvert : {False, True}, optional | |
697 Whether to automatically cast the type of the field to the maximum. | |
698 | |
699 Examples | |
700 -------- | |
701 >>> from numpy.lib import recfunctions as rfn | |
702 >>> x = np.array([1, 2,]) | |
703 >>> rfn.stack_arrays(x) is x | |
704 True | |
705 >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)]) | |
706 >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)], | |
707 ... dtype=[('A', '|S3'), ('B', float), ('C', float)]) | |
708 >>> test = rfn.stack_arrays((z,zz)) | |
709 >>> test | |
710 masked_array(data = [('A', 1.0, --) ('B', 2.0, --) ('a', 10.0, 100.0) ('b', 20.0, 200.0) | |
711 ('c', 30.0, 300.0)], | |
712 mask = [(False, False, True) (False, False, True) (False, False, False) | |
713 (False, False, False) (False, False, False)], | |
714 fill_value = ('N/A', 1e+20, 1e+20), | |
715 dtype = [('A', '|S3'), ('B', '<f8'), ('C', '<f8')]) | |
716 | |
717 """ | |
718 if isinstance(arrays, ndarray): | |
719 return arrays | |
720 elif len(arrays) == 1: | |
721 return arrays[0] | |
722 seqarrays = [np.asanyarray(a).ravel() for a in arrays] | |
723 nrecords = [len(a) for a in seqarrays] | |
724 ndtype = [a.dtype for a in seqarrays] | |
725 fldnames = [d.names for d in ndtype] | |
726 # | |
727 dtype_l = ndtype[0] | |
728 newdescr = dtype_l.descr | |
729 names = [_[0] for _ in newdescr] | |
730 for dtype_n in ndtype[1:]: | |
731 for descr in dtype_n.descr: | |
732 name = descr[0] or '' | |
733 if name not in names: | |
734 newdescr.append(descr) | |
735 names.append(name) | |
736 else: | |
737 nameidx = names.index(name) | |
738 current_descr = newdescr[nameidx] | |
739 if autoconvert: | |
740 if np.dtype(descr[1]) > np.dtype(current_descr[-1]): | |
741 current_descr = list(current_descr) | |
742 current_descr[-1] = descr[1] | |
743 newdescr[nameidx] = tuple(current_descr) | |
744 elif descr[1] != current_descr[-1]: | |
745 raise TypeError("Incompatible type '%s' <> '%s'" % | |
746 (dict(newdescr)[name], descr[1])) | |
747 # Only one field: use concatenate | |
748 if len(newdescr) == 1: | |
749 output = ma.concatenate(seqarrays) | |
750 else: | |
751 # | |
752 output = ma.masked_all((np.sum(nrecords),), newdescr) | |
753 offset = np.cumsum(np.r_[0, nrecords]) | |
754 seen = [] | |
755 for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]): | |
756 names = a.dtype.names | |
757 if names is None: | |
758 output['f%i' % len(seen)][i:j] = a | |
759 else: | |
760 for name in n: | |
761 output[name][i:j] = a[name] | |
762 if name not in seen: | |
763 seen.append(name) | |
764 # | |
765 return _fix_output(_fix_defaults(output, defaults), | |
766 usemask=usemask, asrecarray=asrecarray) | |
767 | |
768 | |
769 def find_duplicates(a, key=None, ignoremask=True, return_index=False): | |
770 """ | |
771 Find the duplicates in a structured array along a given key | |
772 | |
773 Parameters | |
774 ---------- | |
775 a : array-like | |
776 Input array | |
777 key : {string, None}, optional | |
778 Name of the fields along which to check the duplicates. | |
779 If None, the search is performed by records | |
780 ignoremask : {True, False}, optional | |
781 Whether masked data should be discarded or considered as duplicates. | |
782 return_index : {False, True}, optional | |
783 Whether to return the indices of the duplicated values. | |
784 | |
785 Examples | |
786 -------- | |
787 >>> from numpy.lib import recfunctions as rfn | |
788 >>> ndtype = [('a', int)] | |
789 >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3], | |
790 ... mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype) | |
791 >>> rfn.find_duplicates(a, ignoremask=True, return_index=True) | |
792 ... # XXX: judging by the output, the ignoremask flag has no effect | |
793 """ | |
794 a = np.asanyarray(a).ravel() | |
795 # Get a dictionary of fields | |
796 fields = get_fieldstructure(a.dtype) | |
797 # Get the sorting data (by selecting the corresponding field) | |
798 base = a | |
799 if key: | |
800 for f in fields[key]: | |
801 base = base[f] | |
802 base = base[key] | |
803 # Get the sorting indices and the sorted data | |
804 sortidx = base.argsort() | |
805 sortedbase = base[sortidx] | |
806 sorteddata = sortedbase.filled() | |
807 # Compare the sorting data | |
808 flag = (sorteddata[:-1] == sorteddata[1:]) | |
809 # If masked data must be ignored, set the flag to false where needed | |
810 if ignoremask: | |
811 sortedmask = sortedbase.recordmask | |
812 flag[sortedmask[1:]] = False | |
813 flag = np.concatenate(([False], flag)) | |
814 # We need to take the point on the left as well (else we're missing it) | |
815 flag[:-1] = flag[:-1] + flag[1:] | |
816 duplicates = a[sortidx][flag] | |
817 if return_index: | |
818 return (duplicates, sortidx[flag]) | |
819 else: | |
820 return duplicates | |
821 | |
822 | |
823 def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2', | |
824 defaults=None, usemask=True, asrecarray=False): | |
825 """ | |
826 Join arrays `r1` and `r2` on key `key`. | |
827 | |
828 The key should be either a string or a sequence of strings corresponding | |
829 to the fields used to join the arrays. An exception is raised if the | |
830 `key` field cannot be found in the two input arrays. Neither `r1` nor | |
831 `r2` should have any duplicates along `key`: the presence of duplicates | |
832 will make the output quite unreliable. Note that duplicates are not | |
833 looked for by the algorithm. | |
834 | |
835 Parameters | |
836 ---------- | |
837 key : {string, sequence} | |
838 A string or a sequence of strings corresponding to the fields used | |
839 for comparison. | |
840 r1, r2 : arrays | |
841 Structured arrays. | |
842 jointype : {'inner', 'outer', 'leftouter'}, optional | |
843 If 'inner', returns the elements common to both r1 and r2. | |
844 If 'outer', returns the common elements as well as the elements of | |
845 r1 not in r2 and the elements of r2 not in r1. | |
846 If 'leftouter', returns the common elements and the elements of r1 | |
847 not in r2. | |
848 r1postfix : string, optional | |
849 String appended to the names of the fields of r1 that are present | |
850 in r2 but absent from the key. | |
851 r2postfix : string, optional | |
852 String appended to the names of the fields of r2 that are present | |
853 in r1 but absent from the key. | |
854 defaults : {dictionary}, optional | |
855 Dictionary mapping field names to the corresponding default values. | |
856 usemask : {True, False}, optional | |
857 Whether to return a MaskedArray (or MaskedRecords if | |
858 `asrecarray==True`) or an ndarray. | |
859 asrecarray : {False, True}, optional | |
860 Whether to return a recarray (or MaskedRecords if `usemask==True`) | |
861 or just a flexible-type ndarray. | |
862 | |
863 Notes | |
864 ----- | |
865 * The output is sorted along the key. | |
866 * A temporary array is formed by dropping the fields not in the key for | |
867 the two arrays and concatenating the result. This array is then | |
868 sorted, and the common entries selected. The output is constructed by | |
869 filling the fields with the selected entries. Matching is not | |
870 preserved if there are some duplicates... | |
871 | |
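Examples
--------
A minimal sketch of an inner join (the arrays and field names are
arbitrary illustrations; the integer width in the output dtype depends
on the platform):

>>> from numpy.lib import recfunctions as rfn
>>> a = np.array([(1, 10.), (2, 20.), (3, 30.)],
...              dtype=[('key', int), ('x', float)])
>>> b = np.array([(2, 200.), (3, 300.), (4, 400.)],
...              dtype=[('key', int), ('y', float)])
>>> rfn.join_by('key', a, b, jointype='inner', usemask=False)
array([(2, 20.0, 200.0), (3, 30.0, 300.0)],
dtype=[('key', '<i4'), ('x', '<f8'), ('y', '<f8')])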
872 """ | |
873 # Check jointype | |
874 if jointype not in ('inner', 'outer', 'leftouter'): | |
875 raise ValueError( | |
876 "The 'jointype' argument should be in 'inner', " | |
877 "'outer' or 'leftouter' (got '%s' instead)" % jointype | |
878 ) | |
879 # If we have a single key, put it in a tuple | |
880 if isinstance(key, basestring): | |
881 key = (key,) | |
882 | |
883 # Check the keys | |
884 for name in key: | |
885 if name not in r1.dtype.names: | |
886 raise ValueError('r1 does not have key field %s' % name) | |
887 if name not in r2.dtype.names: | |
888 raise ValueError('r2 does not have key field %s' % name) | |
889 | |
890 # Make sure we work with ravelled arrays | |
891 r1 = r1.ravel() | |
892 r2 = r2.ravel() | |
893 # Fixme: nb2 below is never used. Commenting out for pyflakes. | |
894 # (nb1, nb2) = (len(r1), len(r2)) | |
895 nb1 = len(r1) | |
896 (r1names, r2names) = (r1.dtype.names, r2.dtype.names) | |
897 | |
898 # Check the names for collision | |
899 if (set.intersection(set(r1names), set(r2names)).difference(key) and | |
900 not (r1postfix or r2postfix)): | |
901 msg = "r1 and r2 contain common names, r1postfix and r2postfix " | |
902 msg += "can't be empty" | |
903 raise ValueError(msg) | |
904 | |
905 # Make temporary arrays of just the keys | |
906 r1k = drop_fields(r1, [n for n in r1names if n not in key]) | |
907 r2k = drop_fields(r2, [n for n in r2names if n not in key]) | |
908 | |
909 # Concatenate the two arrays for comparison | |
910 aux = ma.concatenate((r1k, r2k)) | |
911 idx_sort = aux.argsort(order=key) | |
912 aux = aux[idx_sort] | |
913 # | |
914 # Get the common keys | |
915 flag_in = ma.concatenate(([False], aux[1:] == aux[:-1])) | |
916 flag_in[:-1] = flag_in[1:] + flag_in[:-1] | |
917 idx_in = idx_sort[flag_in] | |
918 idx_1 = idx_in[(idx_in < nb1)] | |
919 idx_2 = idx_in[(idx_in >= nb1)] - nb1 | |
920 (r1cmn, r2cmn) = (len(idx_1), len(idx_2)) | |
921 if jointype == 'inner': | |
922 (r1spc, r2spc) = (0, 0) | |
923 elif jointype == 'outer': | |
924 idx_out = idx_sort[~flag_in] | |
925 idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)])) | |
926 idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1)) | |
927 (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn) | |
928 elif jointype == 'leftouter': | |
929 idx_out = idx_sort[~flag_in] | |
930 idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)])) | |
931 (r1spc, r2spc) = (len(idx_1) - r1cmn, 0) | |
932 # Select the entries from each input | |
933 (s1, s2) = (r1[idx_1], r2[idx_2]) | |
934 # | |
935 # Build the new description of the output array ....... | |
936 # Start with the key fields | |
937 ndtype = [list(_) for _ in r1k.dtype.descr] | |
938 # Add the other fields | |
939 ndtype.extend(list(_) for _ in r1.dtype.descr if _[0] not in key) | |
940 # Find the new list of names (it may be different from r1names) | |
941 names = list(_[0] for _ in ndtype) | |
942 for desc in r2.dtype.descr: | |
943 desc = list(desc) | |
944 name = desc[0] | |
945 # Have we seen the current name already ? | |
946 if name in names: | |
947 nameidx = ndtype.index(desc) | |
948 current = ndtype[nameidx] | |
949 # The current field is part of the key: take the largest dtype | |
950 if name in key: | |
951 current[-1] = max(desc[1], current[-1]) | |
952 # The current field is not part of the key: add the suffixes | |
953 else: | |
954 current[0] += r1postfix | |
955 desc[0] += r2postfix | |
956 ndtype.insert(nameidx + 1, desc) | |
957 #... we haven't: just add the description to the current list | |
958 else: | |
959 names.append(desc[0]) | |
960 ndtype.append(desc) | |
961 # Revert the elements to tuples | |
962 ndtype = [tuple(_) for _ in ndtype] | |
963 # Find the largest nb of common fields : | |
964 # r1cmn and r2cmn should be equal, but... | |
965 cmn = max(r1cmn, r2cmn) | |
966 # Construct an empty array | |
967 output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype) | |
968 names = output.dtype.names | |
969 for f in r1names: | |
970 selected = s1[f] | |
971 if f not in names or (f in r2names and not r2postfix and f not in key): | |
972 f += r1postfix | |
973 current = output[f] | |
974 current[:r1cmn] = selected[:r1cmn] | |
975 if jointype in ('outer', 'leftouter'): | |
976 current[cmn:cmn + r1spc] = selected[r1cmn:] | |
977 for f in r2names: | |
978 selected = s2[f] | |
979 if f not in names or (f in r1names and not r1postfix and f not in key): | |
980 f += r2postfix | |
981 current = output[f] | |
982 current[:r2cmn] = selected[:r2cmn] | |
983 if (jointype == 'outer') and r2spc: | |
984 current[-r2spc:] = selected[r2cmn:] | |
985 # Sort and finalize the output | |
986 output.sort(order=key) | |
987 kwargs = dict(usemask=usemask, asrecarray=asrecarray) | |
988 return _fix_output(_fix_defaults(output, defaults), **kwargs) | |
989 | |
990 | |
991 def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2', | |
992 defaults=None): | |
993 """ | |
994 Join arrays `r1` and `r2` on keys. | |
995 Alternative to join_by, that always returns a np.recarray. | |
996 | |
997 See Also | |
998 -------- | |
999 join_by : equivalent function | |
1000 """ | |
1001 kwargs = dict(jointype=jointype, r1postfix=r1postfix, r2postfix=r2postfix, | |
1002 defaults=defaults, usemask=False, asrecarray=True) | |
1003 return join_by(key, r1, r2, **kwargs) |