Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/numpy/lib/recfunctions.py : 13%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Collection of utilities to manipulate structured arrays.
4Most of these functions were initially implemented by John Hunter for
5matplotlib. They have been rewritten and extended for convenience.
7"""
8import itertools
9import numpy as np
10import numpy.ma as ma
11from numpy import ndarray, recarray
12from numpy.ma import MaskedArray
13from numpy.ma.mrecords import MaskedRecords
14from numpy.core.overrides import array_function_dispatch
15from numpy.lib._iotools import _is_string_like
16from numpy.testing import suppress_warnings
18_check_fill_value = np.ma.core._check_fill_value
21__all__ = [
22 'append_fields', 'apply_along_fields', 'assign_fields_by_name',
23 'drop_fields', 'find_duplicates', 'flatten_descr',
24 'get_fieldstructure', 'get_names', 'get_names_flat',
25 'join_by', 'merge_arrays', 'rec_append_fields',
26 'rec_drop_fields', 'rec_join', 'recursive_fill_fields',
27 'rename_fields', 'repack_fields', 'require_fields',
28 'stack_arrays', 'structured_to_unstructured', 'unstructured_to_structured',
29 ]
32def _recursive_fill_fields_dispatcher(input, output):
33 return (input, output)
@array_function_dispatch(_recursive_fill_fields_dispatcher)
def recursive_fill_fields(input, output):
    """
    Fill the fields of `output` with the matching fields of `input`,
    recursing into nested structured fields.

    Parameters
    ----------
    input : ndarray
        Source structured array.
    output : ndarray
        Destination structured array; should be at least as large as `input`.

    Notes
    -----
    Fields of `output` that do not exist in `input` are left untouched.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
    >>> b = np.zeros((3,), dtype=a.dtype)
    >>> rfn.recursive_fill_fields(a, b)
    array([(1, 10.), (2, 20.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

    """
    for field in output.dtype.names:
        try:
            src = input[field]
        except ValueError:
            # Field absent from `input`: keep the output values as they are.
            continue
        if src.dtype.names is None:
            # Plain field: copy over the leading len(src) entries.
            output[field][:len(src)] = src
        else:
            # Nested structure: recurse field by field.
            recursive_fill_fields(src, output[field])
    return output
75def _get_fieldspec(dtype):
76 """
77 Produce a list of name/dtype pairs corresponding to the dtype fields
79 Similar to dtype.descr, but the second item of each tuple is a dtype, not a
80 string. As a result, this handles subarray dtypes
82 Can be passed to the dtype constructor to reconstruct the dtype, noting that
83 this (deliberately) discards field offsets.
85 Examples
86 --------
87 >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
88 >>> dt.descr
89 [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
90 >>> _get_fieldspec(dt)
91 [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
93 """
94 if dtype.names is None:
95 # .descr returns a nameless field, so we should too
96 return [('', dtype)]
97 else:
98 fields = ((name, dtype.fields[name]) for name in dtype.names)
99 # keep any titles, if present
100 return [
101 (name if len(f) == 2 else (f[2], name), f[0])
102 for name, f in fields
103 ]
def get_names(adtype):
    """
    Return the field names of the input datatype as a (possibly nested) tuple.

    Parameters
    ----------
    adtype : dtype
        Input datatype.  Nested structured fields are rendered as a
        ``(name, (subnames...))`` pair.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names(adtype)
    ('a', ('b', ('ba', 'bb')))
    """
    collected = []
    for name in adtype.names:
        sub = adtype[name]
        if sub.names is None:
            collected.append(name)
        else:
            # Nested structure: pair the field name with its own name tree.
            collected.append((name, tuple(get_names(sub))))
    return tuple(collected)
def get_names_flat(adtype):
    """
    Return all field names of the input datatype as a flat tuple.

    Nested structures are flattened: a parent field name is followed by the
    names of its subfields.

    Parameters
    ----------
    adtype : dtype
        Input datatype.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
    >>> rfn.get_names_flat(adtype)
    ('a', 'b', 'ba', 'bb')
    """
    flat = []
    for name in adtype.names:
        flat.append(name)
        sub = adtype[name]
        if sub.names is not None:
            # Append the subfield names right after their parent.
            flat += get_names_flat(sub)
    return tuple(flat)
def flatten_descr(ndtype):
    """
    Flatten a structured data-type description into a tuple of
    ``(name, dtype)`` pairs, recursing into nested structures.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
    >>> rfn.flatten_descr(ndtype)
    (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))

    """
    if ndtype.names is None:
        # Plain dtype: describe it as a single nameless field.
        return (('', ndtype),)
    flat = []
    for field in ndtype.names:
        (typ, _) = ndtype.fields[field]
        if typ.names is None:
            flat.append((field, typ))
        else:
            # Replace the nested field by its flattened subfields.
            flat.extend(flatten_descr(typ))
    return tuple(flat)
203def _zip_dtype(seqarrays, flatten=False):
204 newdtype = []
205 if flatten:
206 for a in seqarrays:
207 newdtype.extend(flatten_descr(a.dtype))
208 else:
209 for a in seqarrays:
210 current = a.dtype
211 if current.names is not None and len(current.names) == 1:
212 # special case - dtypes of 1 field are flattened
213 newdtype.extend(_get_fieldspec(current))
214 else:
215 newdtype.append(('', current))
216 return np.dtype(newdtype)
219def _zip_descr(seqarrays, flatten=False):
220 """
221 Combine the dtype description of a series of arrays.
223 Parameters
224 ----------
225 seqarrays : sequence of arrays
226 Sequence of arrays
227 flatten : {boolean}, optional
228 Whether to collapse nested descriptions.
229 """
230 return _zip_dtype(seqarrays, flatten=flatten).descr
def get_fieldstructure(adtype, lastname=None, parents=None):
    """
    Return a dictionary mapping each field name to the list of its parent
    field names, simplifying access to fields nested in other fields.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('A', int),
    ...                    ('B', [('BA', int),
    ...                           ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    ... # XXX: possible regression, order of BBA and BBB is swapped
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}

    """
    if parents is None:
        parents = {}
    for name in adtype.names:
        current = adtype[name]
        if current.names is not None:
            # Structured subfield: record its (single) parent, then recurse
            # with this field as the new "last name".
            parents[name] = [lastname] if lastname else []
            parents.update(get_fieldstructure(current, name, parents))
        else:
            # Leaf field: its ancestry is the parent chain of `lastname`
            # followed by `lastname` itself (when there is one).
            ancestry = list(parents.get(lastname, []) or [])
            if ancestry:
                ancestry.append(lastname)
            elif lastname:
                ancestry = [lastname]
            parents[name] = ancestry or []
    return parents
280def _izip_fields_flat(iterable):
281 """
282 Returns an iterator of concatenated fields from a sequence of arrays,
283 collapsing any nested structure.
285 """
286 for element in iterable:
287 if isinstance(element, np.void):
288 yield from _izip_fields_flat(tuple(element))
289 else:
290 yield element
293def _izip_fields(iterable):
294 """
295 Returns an iterator of concatenated fields from a sequence of arrays.
297 """
298 for element in iterable:
299 if (hasattr(element, '__iter__') and
300 not isinstance(element, str)):
301 yield from _izip_fields(element)
302 elif isinstance(element, np.void) and len(tuple(element)) == 1:
303 # this statement is the same from the previous expression
304 yield from _izip_fields(element)
305 else:
306 yield element
309def _izip_records(seqarrays, fill_value=None, flatten=True):
310 """
311 Returns an iterator of concatenated items from a sequence of arrays.
313 Parameters
314 ----------
315 seqarrays : sequence of arrays
316 Sequence of arrays.
317 fill_value : {None, integer}
318 Value used to pad shorter iterables.
319 flatten : {True, False},
320 Whether to
321 """
323 # Should we flatten the items, or just use a nested approach
324 if flatten:
325 zipfunc = _izip_fields_flat
326 else:
327 zipfunc = _izip_fields
329 for tup in itertools.zip_longest(*seqarrays, fillvalue=fill_value):
330 yield tuple(zipfunc(tup))
333def _fix_output(output, usemask=True, asrecarray=False):
334 """
335 Private function: return a recarray, a ndarray, a MaskedArray
336 or a MaskedRecords depending on the input parameters
337 """
338 if not isinstance(output, MaskedArray):
339 usemask = False
340 if usemask:
341 if asrecarray:
342 output = output.view(MaskedRecords)
343 else:
344 output = ma.filled(output)
345 if asrecarray:
346 output = output.view(recarray)
347 return output
350def _fix_defaults(output, defaults=None):
351 """
352 Update the fill_value and masked data of `output`
353 from the default given in a dictionary defaults.
354 """
355 names = output.dtype.names
356 (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
357 for (k, v) in (defaults or {}).items():
358 if k in names:
359 fill_value[k] = v
360 data[k][mask[k]] = v
361 return output
364def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
365 usemask=None, asrecarray=None):
366 return seqarrays
@array_function_dispatch(_merge_arrays_dispatcher)
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
    array([( 1, 10.), ( 2, 20.), (-1, 30.)],
          dtype=[('f0', '<i8'), ('f1', '<f8')])

    >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
    ...                   np.array([10., 20., 30.])), usemask=False)
    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
          dtype=[('f0', '<i8'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
    ...                   np.array([10., 20., 30.])),
    ...                  usemask=False, asrecarray=True)
    rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
              dtype=[('a', '<i8'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
      depending on what its corresponding type:

      * ``-1``      for integers
      * ``-1.0``    for floating point numbers
      * ``'-'``     for characters
      * ``'-1'``    for strings
      * ``True``    for boolean values
    * XXX: I just obtained these values empirically
    """
    # Only one item in the input sequence ?
    if (len(seqarrays) == 1):
        seqarrays = np.asanyarray(seqarrays[0])
    # Do we have a single ndarray as input ?
    # (np.void covers the 0-d structured-scalar case)
    if isinstance(seqarrays, (ndarray, np.void)):
        seqdtype = seqarrays.dtype
        # Make sure we have named fields
        if seqdtype.names is None:
            seqdtype = np.dtype([('', seqdtype)])
        if not flatten or _zip_dtype((seqarrays,), flatten=True) == seqdtype:
            # Minimal processing needed: just make sure everything's a-ok
            seqarrays = seqarrays.ravel()
            # Find what type of array we must return
            if usemask:
                if asrecarray:
                    seqtype = MaskedRecords
                else:
                    seqtype = MaskedArray
            elif asrecarray:
                seqtype = recarray
            else:
                seqtype = ndarray
            return seqarrays.view(dtype=seqdtype, type=seqtype)
        else:
            # Flattening requested and the dtype is nested: fall through to
            # the general path, treating the single array as a 1-sequence.
            seqarrays = (seqarrays,)
    else:
        # Make sure we have arrays in the input sequence
        seqarrays = [np.asanyarray(_m) for _m in seqarrays]
    # Find the sizes of the inputs and their maximum
    sizes = tuple(a.size for a in seqarrays)
    maxlength = max(sizes)
    # Get the dtype of the output (flattening if needed)
    newdtype = _zip_dtype(seqarrays, flatten=flatten)
    # Initialize the sequences for data and mask
    seqdata = []
    seqmask = []
    # If we expect some kind of MaskedArray, make a special loop.
    if usemask:
        for (a, n) in zip(seqarrays, sizes):
            nbmissing = (maxlength - n)
            # Get the data and mask
            data = a.ravel().__array__()
            mask = ma.getmaskarray(a).ravel()
            # Get the filling value (if needed)
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        # Single-field fill: unwrap to the scalar value.
                        fval = fval.item()[0]
                        fmsk = True
                    else:
                        # Multi-field fill: pad with 1-element record arrays.
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=mask.dtype)
            else:
                fval = None
                fmsk = True
            # Store an iterator padding the input to the expected length
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
            seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
        # Create an iterator for the data
        data = tuple(_izip_records(seqdata, flatten=flatten))
        output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
                          mask=list(_izip_records(seqmask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
    else:
        # Same as before, without the mask we don't need...
        for (a, n) in zip(seqarrays, sizes):
            nbmissing = (maxlength - n)
            data = a.ravel().__array__()
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                    else:
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
            else:
                fval = None
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
        output = np.fromiter(tuple(_izip_records(seqdata, flatten=flatten)),
                             dtype=newdtype, count=maxlength)
        if asrecarray:
            output = output.view(recarray)
    # And we're done...
    return output
505def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
506 return (base,)
@array_function_dispatch(_drop_fields_dispatcher)
def drop_fields(base, drop_names, usemask=True, asrecarray=False):
    """
    Return a new array with fields in `drop_names` dropped.

    Nested fields are supported.

    ..versionchanged: 1.18.0
        `drop_fields` returns an array with 0 fields if all fields are dropped,
        rather than returning ``None`` as it did previously.

    Parameters
    ----------
    base : array
        Input array
    drop_names : string or sequence
        String or sequence of strings corresponding to the names of the
        fields to drop.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : string or sequence, optional
        Whether to return a recarray or a mrecarray (`asrecarray=True`) or
        a plain ndarray or masked array with flexible dtype. The default
        is False.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
    ...   dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
    >>> rfn.drop_fields(a, 'a')
    array([((2., 3),), ((5., 6),)],
          dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
    >>> rfn.drop_fields(a, 'ba')
    array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
    >>> rfn.drop_fields(a, ['ba', 'bb'])
    array([(1,), (4,)], dtype=[('a', '<i8')])
    """
    # Normalize drop_names: a single name becomes a 1-element list,
    # anything else becomes a set for O(1) membership tests.
    if _is_string_like(drop_names):
        drop_names = [drop_names]
    else:
        drop_names = set(drop_names)

    def _strip_descr(ndtype, drop_names):
        # Rebuild the dtype description without the dropped names,
        # recursing into nested structured fields.
        kept = []
        for name in ndtype.names:
            if name in drop_names:
                continue
            current = ndtype[name]
            if current.names is None:
                kept.append((name, current))
            else:
                sub = _strip_descr(current, drop_names)
                if sub:
                    # Keep the nested field only if any subfield survived.
                    kept.append((name, sub))
        return kept

    newdtype = _strip_descr(base.dtype, drop_names)

    output = np.empty(base.shape, dtype=newdtype)
    output = recursive_fill_fields(base, output)
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
574def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
575 """
576 Return a new array keeping only the fields in `keep_names`,
577 and preserving the order of those fields.
579 Parameters
580 ----------
581 base : array
582 Input array
583 keep_names : string or sequence
584 String or sequence of strings corresponding to the names of the
585 fields to keep. Order of the names will be preserved.
586 usemask : {False, True}, optional
587 Whether to return a masked array or not.
588 asrecarray : string or sequence, optional
589 Whether to return a recarray or a mrecarray (`asrecarray=True`) or
590 a plain ndarray or masked array with flexible dtype. The default
591 is False.
592 """
593 newdtype = [(n, base.dtype[n]) for n in keep_names]
594 output = np.empty(base.shape, dtype=newdtype)
595 output = recursive_fill_fields(base, output)
596 return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
599def _rec_drop_fields_dispatcher(base, drop_names):
600 return (base,)
@array_function_dispatch(_rec_drop_fields_dispatcher)
def rec_drop_fields(base, drop_names):
    """
    Return a new numpy.recarray with the fields in `drop_names` dropped.

    Convenience wrapper around `drop_fields` that always produces an
    unmasked recarray.
    """
    return drop_fields(base, drop_names, asrecarray=True, usemask=False)
611def _rename_fields_dispatcher(base, namemapper):
612 return (base,)
@array_function_dispatch(_rename_fields_dispatcher)
def rename_fields(base, namemapper):
    """
    Rename the fields from a flexible-datatype ndarray or recarray.

    Nested fields are supported.

    Parameters
    ----------
    base : ndarray
        Input array whose fields must be modified.
    namemapper : dictionary
        Dictionary mapping old field names to their new version.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
    ...   dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
    >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
    array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
          dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])

    """
    def _remap(ndtype, namemapper):
        # Rebuild the dtype description, substituting renamed fields and
        # recursing into nested structures.
        renamed = []
        for name in ndtype.names:
            newname = namemapper.get(name, name)
            current = ndtype[name]
            if current.names is None:
                renamed.append((newname, current))
            else:
                renamed.append((newname, _remap(current, namemapper)))
        return renamed

    # A view suffices: only the field labels change, not the data layout.
    return base.view(_remap(base.dtype, namemapper))
655def _append_fields_dispatcher(base, names, data, dtypes=None,
656 fill_value=None, usemask=None, asrecarray=None):
657 yield base
658 yield from data
@array_function_dispatch(_append_fields_dispatcher)
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    """
    # Check the names: a sequence of names must match the data 1:1,
    # while a single string means a single new field.
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, str):
        names = [names, ]
        data = [data, ]
    # Wrap each data array as a 1-field structured array named after it.
    if dtypes is None:
        data = [np.array(a, copy=False, subok=True) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            # A single dtype is broadcast to all data arrays.
            if len(dtypes) == 1:
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.array(a, copy=False, subok=True, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    # Normalize the base, and merge the new fields into one array.
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    # Allocate a fully-masked result wide enough for base + new fields,
    # then fill it from both sides.
    output = ma.masked_all(
        max(len(base), len(data)),
        dtype=_get_fieldspec(base.dtype) + _get_fieldspec(data.dtype))
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    # Convert to the requested output flavour.
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
732def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
733 yield base
734 yield from data
@array_function_dispatch(_rec_append_fields_dispatcher)
def rec_append_fields(base, names, data, dtypes=None):
    """
    Add new fields to an existing array, returning an unmasked recarray.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.

    See Also
    --------
    append_fields

    Returns
    -------
    appended_array : np.recarray
    """
    # Delegate to append_fields, forcing the unmasked-recarray flavour.
    return append_fields(base, names, data=data, dtypes=dtypes,
                         usemask=False, asrecarray=True)
772def _repack_fields_dispatcher(a, align=None, recurse=None):
773 return (a,)
@array_function_dispatch(_repack_fields_dispatcher)
def repack_fields(a, align=False, recurse=False):
    """
    Re-pack the fields of a structured array or dtype in memory.

    The memory layout of structured datatypes allows fields at arbitrary
    byte offsets. This means the fields can be separated by padding bytes,
    their offsets can be non-monotonically increasing, and they can overlap.

    This method removes any overlaps and reorders the fields in memory so they
    have increasing byte offsets, and adds or removes padding bytes depending
    on the `align` option, which behaves like the `align` option to `np.dtype`.

    If `align=False`, this method produces a "packed" memory layout in which
    each field starts at the byte the previous field ended, and any padding
    bytes are removed.

    If `align=True`, this methods produces an "aligned" memory layout in which
    each field's offset is a multiple of its alignment, and the total itemsize
    is a multiple of the largest alignment, by adding padding bytes as needed.

    Parameters
    ----------
    a : ndarray or dtype
        array or dtype for which to repack the fields.
    align : boolean
        If true, use an "aligned" memory layout, otherwise use a "packed" layout.
    recurse : boolean
        If True, also repack nested structures.

    Returns
    -------
    repacked : ndarray or dtype
        Copy of `a` with fields repacked, or `a` itself if no repacking was
        needed.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> def print_offsets(d):
    ...     print("offsets:", [d.fields[name][1] for name in d.names])
    ...     print("itemsize:", d.itemsize)
    ...
    >>> dt = np.dtype('u1, <i8, <f8', align=True)
    >>> print_offsets(dt)
    offsets: [0, 8, 16]
    itemsize: 24
    >>> packed_dt = rfn.repack_fields(dt)
    >>> print_offsets(packed_dt)
    offsets: [0, 1, 9]
    itemsize: 17

    """
    if not isinstance(a, np.dtype):
        # Array input: repack its dtype, then cast (no copy when possible).
        dt = repack_fields(a.dtype, align=align, recurse=recurse)
        return a.astype(dt, copy=False)

    if a.names is None:
        # Plain dtype: nothing to repack.
        return a

    fieldinfo = []
    for name in a.names:
        tup = a.fields[name]
        # tup is (dtype, offset[, title]).
        if recurse:
            fmt = repack_fields(tup[0], align=align, recurse=True)
        else:
            fmt = tup[0]
        if len(tup) == 3:
            # Preserve any field title.
            name = (tup[2], name)
        fieldinfo.append((name, fmt))

    packed = np.dtype(fieldinfo, align=align)
    # Reconstruct with the same base type (handles record dtypes).
    return np.dtype((a.type, packed))
857def _get_fields_and_offsets(dt, offset=0):
858 """
859 Returns a flat list of (dtype, count, offset) tuples of all the
860 scalar fields in the dtype "dt", including nested fields, in left
861 to right order.
862 """
864 # counts up elements in subarrays, including nested subarrays, and returns
865 # base dtype and count
866 def count_elem(dt):
867 count = 1
868 while dt.shape != ():
869 for size in dt.shape:
870 count *= size
871 dt = dt.base
872 return dt, count
874 fields = []
875 for name in dt.names:
876 field = dt.fields[name]
877 f_dt, f_offset = field[0], field[1]
878 f_dt, n = count_elem(f_dt)
880 if f_dt.names is None:
881 fields.append((np.dtype((f_dt, (n,))), n, f_offset + offset))
882 else:
883 subfields = _get_fields_and_offsets(f_dt, f_offset + offset)
884 size = f_dt.itemsize
886 for i in range(n):
887 if i == 0:
888 # optimization: avoid list comprehension if no subarray
889 fields.extend(subfields)
890 else:
891 fields.extend([(d, c, o + i*size) for d, c, o in subfields])
892 return fields
895def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
896 casting=None):
897 return (arr,)
@array_function_dispatch(_structured_to_unstructured_dispatcher)
def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
    """
    Converts an n-D structured array into an (n+1)-D unstructured array.

    The new array will have a new last dimension equal in size to the
    number of field-elements of the input array. If not supplied, the output
    datatype is determined from the numpy type promotion rules applied to all
    the field datatypes.

    Nested fields, as well as each element of any subarray fields, all count
    as a single field-elements.

    Parameters
    ----------
    arr : ndarray
        Structured array or dtype to convert. Cannot contain object datatype.
    dtype : dtype, optional
        The dtype of the output unstructured array.
    copy : bool, optional
        See copy argument to `ndarray.astype`. If true, always return a copy.
        If false, and `dtype` requirements are satisfied, a view is returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `ndarray.astype`. Controls what kind of data
        casting may occur.

    Returns
    -------
    unstructured : ndarray
        Unstructured array with one more dimension.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> rfn.structured_to_unstructured(a)
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.]])

    >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
    ...              dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
    >>> np.mean(rfn.structured_to_unstructured(b[['x', 'z']]), axis=-1)
    array([ 3. ,  5.5,  9. , 11. ])

    """
    if arr.dtype.names is None:
        raise ValueError('arr must be a structured array')

    fields = _get_fields_and_offsets(arr.dtype)
    n_fields = len(fields)
    if n_fields == 0 and dtype is None:
        raise ValueError("arr has no fields. Unable to guess dtype")
    elif n_fields == 0:
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("arr with no fields is not supported")

    dts, counts, offsets = zip(*fields)
    names = ['f{}'.format(n) for n in range(n_fields)]

    if dtype is None:
        # Promote across all field base dtypes to pick the output dtype.
        out_dtype = np.result_type(*[dt.base for dt in dts])
    else:
        out_dtype = dtype

    # Use a series of views and casts to convert to an unstructured array:

    # first view using flattened fields (doesn't work for object arrays)
    # Note: dts may include a shape for subarrays
    flattened_fields = np.dtype({'names': names,
                                 'formats': dts,
                                 'offsets': offsets,
                                 'itemsize': arr.dtype.itemsize})
    with suppress_warnings() as sup:  # until 1.16 (gh-12447)
        sup.filter(FutureWarning, "Numpy has detected")
        arr = arr.view(flattened_fields)

    # next cast to a packed format with all fields converted to new dtype
    packed_fields = np.dtype({'names': names,
                              'formats': [(out_dtype, dt.shape) for dt in dts]})
    arr = arr.astype(packed_fields, copy=copy, casting=casting)

    # finally is it safe to view the packed fields as the unstructured type
    return arr.view((out_dtype, (sum(counts),)))
def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
                                           align=None, copy=None, casting=None):
    # __array_function__ dispatcher: only the array argument matters
    # for dispatch.
    return (arr,)

@array_function_dispatch(_unstructured_to_structured_dispatcher)
def unstructured_to_structured(arr, dtype=None, names=None, align=False,
                               copy=False, casting='unsafe'):
    """
    Converts an n-D unstructured array into an (n-1)-D structured array.

    The last dimension of the input array is converted into a structure, with
    number of field-elements equal to the size of the last dimension of the
    input array. By default all output fields have the input array's dtype, but
    an output structured dtype with an equal number of fields-elements can be
    supplied instead.

    Nested fields, as well as each element of any subarray fields, all count
    towards the number of field-elements.

    Parameters
    ----------
    arr : ndarray
        Unstructured array or dtype to convert.
    dtype : dtype, optional
        The structured dtype of the output array
    names : list of strings, optional
        If dtype is not supplied, this specifies the field names for the output
        dtype, in order. The field dtypes will be the same as the input array.
    align : boolean, optional
        Whether to create an aligned memory layout.
    copy : bool, optional
        See copy argument to `ndarray.astype`. If true, always return a copy.
        If false, and `dtype` requirements are satisfied, a view is returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `ndarray.astype`. Controls what kind of data
        casting may occur.

    Returns
    -------
    structured : ndarray
        Structured array with fewer dimensions.

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> a = np.arange(20).reshape((4,5))
    >>> a
    array([[ 0,  1,  2,  3,  4],
           [ 5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14],
           [15, 16, 17, 18, 19]])
    >>> rfn.unstructured_to_structured(a, dt)
    array([( 0, ( 1.,  2), [ 3.,  4.]), ( 5, ( 6.,  7), [ 8.,  9.]),
           (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
          dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])

    """
    if arr.shape == ():
        raise ValueError('arr must have at least one dimension')
    n_elem = arr.shape[-1]
    if n_elem == 0:
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("last axis with size 0 is not supported")

    if dtype is None:
        # Build a default dtype: one field per element of the last axis,
        # each with the input array's dtype.
        if names is None:
            names = ['f{}'.format(n) for n in range(n_elem)]
        out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
        fields = _get_fields_and_offsets(out_dtype)
        dts, counts, offsets = zip(*fields)
    else:
        if names is not None:
            raise ValueError("don't supply both dtype and names")
        # sanity check of the input dtype
        fields = _get_fields_and_offsets(dtype)
        if len(fields) == 0:
            dts, counts, offsets = [], [], []
        else:
            dts, counts, offsets = zip(*fields)

        # The flattened field count (subarray elements included) must match
        # the size of the last axis.
        if n_elem != sum(counts):
            raise ValueError('The length of the last dimension of arr must '
                             'be equal to the number of fields in dtype')
        out_dtype = dtype
        if align and not out_dtype.isalignedstruct:
            raise ValueError("align was True but dtype is not aligned")

    # Auto-generated names used only for the intermediate dtypes below;
    # the user-visible names come from out_dtype.
    names = ['f{}'.format(n) for n in range(len(fields))]

    # Use a series of views and casts to convert to a structured array:

    # first view as a packed structured array of one dtype
    packed_fields = np.dtype({'names': names,
                              'formats': [(arr.dtype, dt.shape) for dt in dts]})
    arr = np.ascontiguousarray(arr).view(packed_fields)

    # next cast to an unpacked but flattened format with varied dtypes
    flattened_fields = np.dtype({'names': names,
                                 'formats': dts,
                                 'offsets': offsets,
                                 'itemsize': out_dtype.itemsize})
    arr = arr.astype(flattened_fields, copy=copy, casting=casting)

    # finally view as the final nested dtype and remove the last axis
    return arr.view(out_dtype)[..., 0]
def _apply_along_fields_dispatcher(func, arr):
    # __array_function__ dispatcher: dispatch on the array argument only.
    return (arr,)

@array_function_dispatch(_apply_along_fields_dispatcher)
def apply_along_fields(func, arr):
    """
    Apply function 'func' as a reduction across fields of a structured array.

    This is similar to `apply_along_axis`, but treats the fields of a
    structured array as an extra axis. The fields are all first cast to a
    common type following the type-promotion rules from `numpy.result_type`
    applied to the field's dtypes.

    Parameters
    ----------
    func : function
        Function to apply on the "field" dimension. This function must
        support an `axis` argument, like np.mean, np.sum, etc.
    arr : ndarray
        Structured array for which to apply func.

    Returns
    -------
    out : ndarray
        Result of the reduction operation

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
    ...              dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
    >>> rfn.apply_along_fields(np.mean, b)
    array([ 2.66666667,  5.33333333,  8.66666667, 11.        ])
    >>> rfn.apply_along_fields(np.mean, b[['x', 'z']])
    array([ 3. ,  5.5,  9. , 11. ])

    """
    if arr.dtype.names is None:
        raise ValueError('arr must be a structured array')

    # Collapse the fields into a trailing unstructured axis, then let
    # `func` reduce over that axis.
    unstructured = structured_to_unstructured(arr)
    return func(unstructured, axis=-1)
def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
    # __array_function__ dispatcher: both arrays take part in dispatch.
    return dst, src

@array_function_dispatch(_assign_fields_by_name_dispatcher)
def assign_fields_by_name(dst, src, zero_unassigned=True):
    """
    Assigns values from one structured array to another by field name.

    Normally in numpy >= 1.14, assignment of one structured array to another
    copies fields "by position", meaning that the first field from the src is
    copied to the first field of the dst, and so on, regardless of field name.

    This function instead copies "by field name", such that fields in the dst
    are assigned from the identically named field in the src. This applies
    recursively for nested structures. This is how structure assignment worked
    in numpy >= 1.6 to <= 1.13.

    Parameters
    ----------
    dst : ndarray
    src : ndarray
        The source and destination arrays during assignment.
    zero_unassigned : bool, optional
        If True, fields in the dst for which there was no matching
        field in the src are filled with the value 0 (zero). This
        was the behavior of numpy <= 1.13. If False, those fields
        are not modified.
    """
    # Non-structured destination: a plain element-wise copy suffices.
    if dst.dtype.names is None:
        dst[...] = src
        return

    for field in dst.dtype.names:
        if field in src.dtype.names:
            # Recurse so nested structured fields are also matched by
            # name rather than by position.
            assign_fields_by_name(dst[field], src[field], zero_unassigned)
        elif zero_unassigned:
            # No matching source field: zero-fill (numpy <= 1.13 behavior).
            dst[field] = 0
def _require_fields_dispatcher(array, required_dtype):
    # __array_function__ dispatcher: dispatch on the input array only.
    return (array,)

@array_function_dispatch(_require_fields_dispatcher)
def require_fields(array, required_dtype):
    """
    Casts a structured array to a new dtype using assignment by field-name.

    This function assigns from the old to the new array by name, so the
    value of a field in the output array is the value of the field with the
    same name in the source array. This has the effect of creating a new
    ndarray containing only the fields "required" by the required_dtype.

    If a field name in the required_dtype does not exist in the
    input array, that field is created and set to 0 in the output array.

    Parameters
    ----------
    array : ndarray
        array to cast
    required_dtype : dtype
        datatype for output array

    Returns
    -------
    out : ndarray
        array with the new dtype, with field values copied from the fields in
        the input array with the same name

    Examples
    --------

    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
    >>> rfn.require_fields(a, [('b', 'f4'), ('c', 'u1')])
    array([(1., 1), (1., 1), (1., 1), (1., 1)],
      dtype=[('b', '<f4'), ('c', 'u1')])
    >>> rfn.require_fields(a, [('b', 'f4'), ('newf', 'u1')])
    array([(1., 0), (1., 0), (1., 0), (1., 0)],
      dtype=[('b', '<f4'), ('newf', 'u1')])

    """
    # Allocate the target then fill it field-by-field; fields of
    # required_dtype absent from `array` are zero-filled by the
    # name-based assignment.
    result = np.empty(array.shape, dtype=required_dtype)
    assign_fields_by_name(result, array)
    return result
def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
                             asrecarray=None, autoconvert=None):
    # __array_function__ dispatcher: the sequence of input arrays is what
    # dispatch is decided on.
    return arrays

@array_function_dispatch(_stack_arrays_dispatcher)
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...   dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
                       (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
                 mask=[(False, False,  True), (False, False,  True),
                       (False, False, False), (False, False, False),
                       (False, False, False)],
           fill_value=(b'N/A', 1.e+20, 1.e+20),
                dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])

    """
    # A single ndarray (not a sequence) is returned untouched, as is the
    # sole element of a length-1 sequence.
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]
    # Build the union of all field descriptions, starting from the first
    # array's fields and appending any names not seen before.
    dtype_l = ndtype[0]
    newdescr = _get_fieldspec(dtype_l)
    names = [n for n, d in newdescr]
    for dtype_n in ndtype[1:]:
        for fname, fdtype in _get_fieldspec(dtype_n):
            if fname not in names:
                newdescr.append((fname, fdtype))
                names.append(fname)
            else:
                # Field name already present: either promote (autoconvert)
                # or insist on an exact dtype match.
                nameidx = names.index(fname)
                _, cdtype = newdescr[nameidx]
                if autoconvert:
                    # NOTE(review): max() on two dtype objects relies on
                    # dtype ordering; presumably intended to pick the
                    # "larger" type — confirm this matches the desired
                    # promotion rules.
                    newdescr[nameidx] = (fname, max(fdtype, cdtype))
                elif fdtype != cdtype:
                    raise TypeError("Incompatible type '%s' <> '%s'" %
                                    (cdtype, fdtype))
    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Allocate a fully-masked output, then copy each input's fields
        # into its own slice of rows; untouched entries remain masked.
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                # Unstructured input: assign to the next auto-named field.
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)
    # Replace masked entries with user-supplied defaults and convert to the
    # requested output flavour (masked array / plain ndarray / recarray).
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
def _find_duplicates_dispatcher(
        a, key=None, ignoremask=None, return_index=None):
    # __array_function__ dispatcher: dispatch on the input array only.
    return (a,)

@array_function_dispatch(_find_duplicates_dispatcher)
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    (masked_array(data=[(1,), (1,), (2,), (2,)],
                 mask=[(False,), (False,), (False,), (False,)],
           fill_value=(999999,),
                dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
    """
    a = np.asanyarray(a).ravel()
    # Get a dictionary of fields
    fields = get_fieldstructure(a.dtype)
    # Get the sorting data (by selecting the corresponding field)
    base = a
    if key:
        # Walk down the chain of parent fields so a nested key is reachable.
        for f in fields[key]:
            base = base[f]
        base = base[key]
    # Get the sorting indices and the sorted data
    sortidx = base.argsort()
    sortedbase = base[sortidx]
    sorteddata = sortedbase.filled()
    # Compare the sorting data: flag[i] is True when element i+1 equals
    # element i in sorted order.
    flag = (sorteddata[:-1] == sorteddata[1:])
    # If masked data must be ignored, set the flag to false where needed
    if ignoremask:
        sortedmask = sortedbase.recordmask
        flag[sortedmask[1:]] = False
    flag = np.concatenate(([False], flag))
    # We need to take the point on the left as well (else we're missing it)
    flag[:-1] = flag[:-1] + flag[1:]
    duplicates = a[sortidx][flag]
    if return_index:
        # Also return the positions of the duplicates in the original array.
        return (duplicates, sortidx[flag])
    else:
        return duplicates
def _join_by_dispatcher(
        key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
        defaults=None, usemask=None, asrecarray=None):
    # __array_function__ dispatcher: the two joined arrays drive dispatch.
    return (r1, r2)

@array_function_dispatch(_join_by_dispatcher)
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...

    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
                "The 'jointype' argument should be in 'inner', "
                "'outer' or 'leftouter' (got '%s' instead)" % jointype
                )
    # If we have a single key, put it in a tuple
    if isinstance(key, str):
        key = (key,)

    # Check the keys
    if len(set(key)) != len(key):
        dup = next(x for n,x in enumerate(key) if x in key[n+1:])
        raise ValueError("duplicate join key %r" % dup)
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %r' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %r' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    # Fixme: nb2 below is never used. Commenting out for pyflakes.
    # (nb1, nb2) = (len(r1), len(r2))
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    collisions = (set(r1names) & set(r2names)) - set(key)
    if collisions and not (r1postfix or r2postfix):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't both be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    #  (use order of keys in `r1` for back-compatibility)
    key1 = [ n for n in r1names if n in key ]
    r1k = _keep_fields(r1, key1)
    r2k = _keep_fields(r2, key1)

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: flag_in marks both members of each equal pair
    # in the sorted concatenation (the left member is added by the
    # in-place OR on the next line).
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    # Indices below nb1 belong to r1; the rest belong to r2 (shifted back).
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        # Also keep keys unique to either side.
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        # Keep keys unique to r1 only.
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = _get_fieldspec(r1k.dtype)

    # Add the fields from r1
    for fname, fdtype in _get_fieldspec(r1.dtype):
        if fname not in key:
            ndtype.append((fname, fdtype))

    # Add the fields from r2
    for fname, fdtype in _get_fieldspec(r2.dtype):
        # Have we seen the current name already ?
        # we need to rebuild this list every time
        names = list(name for name, dtype in ndtype)
        try:
            nameidx = names.index(fname)
        except ValueError:
            #... we haven't: just add the description to the current list
            ndtype.append((fname, fdtype))
        else:
            # collision
            _, cdtype = ndtype[nameidx]
            if fname in key:
                # The current field is part of the key: take the largest dtype
                ndtype[nameidx] = (fname, max(fdtype, cdtype))
            else:
                # The current field is not part of the key: add the suffixes,
                # and place the new field adjacent to the old one
                ndtype[nameidx:nameidx + 1] = [
                    (fname + r1postfix, cdtype),
                    (fname + r2postfix, fdtype)
                ]
    # Rebuild a dtype from the new fields
    ndtype = np.dtype(ndtype)
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    # Fill common rows first, then (for outer joins) the rows specific
    # to each input; suffixed names are used where a collision occurred.
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
def _rec_join_dispatcher(
        key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
        defaults=None):
    # __array_function__ dispatcher: the two joined arrays drive dispatch.
    return (r1, r2)

@array_function_dispatch(_rec_join_dispatcher)
def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
             defaults=None):
    """
    Join arrays `r1` and `r2` on keys.
    Alternative to join_by, that always returns a np.recarray.

    See Also
    --------
    join_by : equivalent function
    """
    # Delegate to join_by, forcing the plain-recarray output flavour.
    return join_by(key, r1, r2, jointype=jointype,
                   r1postfix=r1postfix, r2postfix=r2postfix,
                   defaults=defaults, usemask=False, asrecarray=True)