1"""
2Functions for preparing various inputs passed to the DataFrame or Series
3constructors before passing them to a BlockManager.
4"""
5from collections import abc
7import numpy as np
8import numpy.ma as ma
10from pandas._libs import lib
12from pandas.core.dtypes.cast import (
13 construct_1d_arraylike_from_scalar,
14 maybe_cast_to_datetime,
15 maybe_convert_platform,
16 maybe_infer_to_datetimelike,
17 maybe_upcast,
18)
19from pandas.core.dtypes.common import (
20 is_categorical_dtype,
21 is_datetime64tz_dtype,
22 is_dtype_equal,
23 is_extension_array_dtype,
24 is_integer_dtype,
25 is_list_like,
26 is_object_dtype,
27)
28from pandas.core.dtypes.generic import (
29 ABCDataFrame,
30 ABCDatetimeIndex,
31 ABCIndexClass,
32 ABCPeriodIndex,
33 ABCSeries,
34 ABCTimedeltaIndex,
35)
37from pandas.core import algorithms, common as com
38from pandas.core.arrays import Categorical
39from pandas.core.construction import sanitize_array
40from pandas.core.indexes import base as ibase
41from pandas.core.indexes.api import (
42 Index,
43 ensure_index,
44 get_objs_combined_axis,
45 union_indexes,
46)
47from pandas.core.internals import (
48 create_block_manager_from_arrays,
49 create_block_manager_from_blocks,
50)
52# ---------------------------------------------------------------------
53# BlockManager Interface
56def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
57 """
58 Segregate Series based on type and coerce into matrices.
60 Needs to handle a lot of exceptional cases.
61 """
62 # figure out the index, if necessary
63 if index is None:
64 index = extract_index(arrays)
65 else:
66 index = ensure_index(index)
68 # don't force copy because getting jammed in an ndarray anyway
69 arrays = _homogenize(arrays, index, dtype)
71 # from BlockManager perspective
72 axes = [ensure_index(columns), index]
74 return create_block_manager_from_arrays(arrays, arr_names, axes)
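

# Illustrative usage (not part of the original module): a minimal sketch of
# calling arrays_to_mgr directly, assuming this pandas version's internal
# API.  The DataFrame constructor reaches this point once the input has been
# split into per-column arrays.
#
#   >>> arrs = [np.array([1, 2, 3]), np.array([4.0, 5.0, 6.0])]
#   >>> mgr = arrays_to_mgr(arrs, ["a", "b"], index=None, columns=["a", "b"])
#   >>> mgr.shape  # BlockManager axes are (columns, index)
#   (2, 3)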


def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array, then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        #  non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
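

# Illustrative usage (not part of the original module): a sketch assuming
# numpy's masked-records API.  Masked record arrays reach this function via
# the DataFrame constructor; masked cells are replaced with the record
# array's per-field fill_value, upcasting the column dtype first whenever
# the fill value would not fit.
#
#   >>> import numpy.ma.mrecords as mrecords
#   >>> rec = mrecords.fromarrays(
#   ...     [ma.array([1, 2], mask=[False, True]), [3.0, 4.0]],
#   ...     names=["a", "b"],
#   ... )
#   >>> mgr = masked_rec_array_to_mgr(rec, None, None, None, copy=False)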


# ---------------------------------------------------------------------
# DataFrame Constructor Interface


def init_ndarray(values, index, columns, dtype=None, copy=False):
    # input must be an ndarray, list, Series, or Index

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = [values.name]
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero-len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(
        dtype
    ):

        if not hasattr(values, "dtype"):
            values = prep_ndarray(values, copy=copy)
            values = values.ravel()
        elif copy:
            values = values.copy()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
    elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513: an EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = list(range(len(values)))
        return arrays_to_mgr(values, columns, index, columns, dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None:
        if not is_dtype_equal(values.dtype, dtype):
            try:
                values = values.astype(dtype)
            except Exception as orig:
                # e.g. ValueError when trying to cast object dtype to float64
                raise ValueError(
                    f"failed to cast to '{dtype}' (Exception was: {orig})"
                ) from orig

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelikes
    # embedded in an object type
    if dtype is None and is_object_dtype(values):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
            for n in range(len(dvals_list)):
                if isinstance(dvals_list[n], np.ndarray):
                    dvals_list[n] = dvals_list[n].reshape(1, -1)

            from pandas.core.internals.blocks import make_block

            # TODO: What about re-joining object columns?
            block_values = [
                make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            block_values = [datelike_vals]
    else:
        block_values = [values]

    return create_block_manager_from_blocks(block_values, [columns, index])
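

# Illustrative usage (not part of the original module): a minimal sketch,
# assuming this pandas version's internal API.  2-D ndarrays handed to the
# DataFrame constructor end up here; note the transpose above, since blocks
# are stored one-row-per-column relative to the frame.
#
#   >>> mgr = init_ndarray(np.array([[1, 2], [3, 4]]), index=None,
#   ...                    columns=None, dtype=None)
#   >>> mgr.shape  # default RangeIndex on both axes
#   (2, 2)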


def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = np.dtype(object)
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:
        keys = list(data.keys())
        columns = data_names = Index(keys)
        arrays = (com.maybe_iterable_to_list(data[k]) for k in keys)
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
        ]
    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
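

# Illustrative usage (not part of the original module): a minimal sketch,
# assuming this pandas version's internal API.  Dict inputs to the DataFrame
# constructor land here; with an explicit `columns`, keys missing from the
# dict become all-NaN columns rather than raising.
#
#   >>> mgr = init_dict({"a": [1, 2]}, index=None, columns=["a", "b"])
#   >>> mgr.shape  # column "b" was synthesized as a NaN column
#   (2, 2)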


# ---------------------------------------------------------------------


def prep_ndarray(values, copy=True) -> np.ndarray:
    if not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)
        elif isinstance(values, range):
            arr = np.arange(values.start, values.stop, values.step, dtype="int64")
            return arr[..., np.newaxis]

        def convert(v):
            return maybe_convert_platform(v)

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        try:
            if is_list_like(values[0]) or hasattr(values[0], "len"):
                values = np.array([convert(v) for v in values])
            elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
                # GH#21861
                values = np.array([convert(v) for v in values])
            else:
                values = convert(values)
        except (ValueError, TypeError):
            values = convert(values)

    else:

        # drop subclass info, do not copy data
        values = np.asarray(values)
        if copy:
            values = values.copy()

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError("Must pass 2-d input")

    return values
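

# Illustrative behaviour (not part of the original module): 1-D inputs come
# back as 2-D column vectors, and range objects take the explicit int64 fast
# path above.
#
#   >>> prep_ndarray([1, 2, 3]).shape
#   (3, 1)
#   >>> prep_ndarray(range(3)).dtype
#   dtype('int64')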


def _homogenize(data, index, dtype=None):
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype)
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
        else:
            if isinstance(val, dict):
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
                    val = com.dict_compat(val)
                else:
                    val = dict(val)
                val = lib.fast_multiget(val, oindex.values, default=np.nan)
            val = sanitize_array(
                val, index, dtype=dtype, copy=False, raise_cast_failure=False
            )

        homogenized.append(val)

    return homogenized
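

# Illustrative behaviour (not part of the original module): Series inputs
# are realigned to the target index, so labels absent from a Series become
# NaN in its homogenized column.
#
#   >>> import pandas as pd
#   >>> idx = ensure_index(["x", "y"])
#   >>> [v.tolist() for v in _homogenize([pd.Series({"x": 1})], idx)]
#   [[1.0, nan]]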


def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))

        if not indexes and not raw_lengths:
            raise ValueError("If using all scalar values, you must pass an index")

        if have_series:
            index = union_indexes(indexes)
        elif have_dicts:
            index = union_indexes(indexes, sort=False)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("arrays must all be same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous ordering."
                )

            if have_series:
                if lengths[0] != len(index):
                    msg = (
                        f"array length {lengths[0]} does not match index "
                        f"length {len(index)}"
                    )
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
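

# Illustrative behaviour (not part of the original module): Series indexes
# are unioned, raw arrays only need consistent lengths, and all-scalar input
# raises.
#
#   >>> import pandas as pd
#   >>> extract_index([pd.Series([1], index=["a"]), pd.Series([2], index=["b"])])
#   Index(['a', 'b'], dtype='object')
#   >>> extract_index([[1, 2, 3]])
#   RangeIndex(start=0, stop=3, step=1)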


def reorder_arrays(arrays, arr_columns, columns):
    # reorder according to the columns
    if (
        columns is not None
        and len(columns)
        and arr_columns is not None
        and len(arr_columns)
    ):
        indexer = ensure_index(arr_columns).get_indexer(columns)
        arr_columns = ensure_index([arr_columns[i] for i in indexer])
        arrays = [arrays[i] for i in indexer]
    return arrays, arr_columns
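

# Illustrative behaviour (not part of the original module): arrays are
# permuted to match the requested column order.
#
#   >>> arrs, cols = reorder_arrays(
#   ...     [np.array([1]), np.array([2])], Index(["a", "b"]), Index(["b", "a"])
#   ... )
#   >>> list(cols)
#   ['b', 'a']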


def get_names_from_index(data):
    has_some_name = any(getattr(s, "name", None) is not None for s in data)
    if not has_some_name:
        return ibase.default_index(len(data))

    index = list(range(len(data)))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, "name", None)
        if n is not None:
            index[i] = n
        else:
            index[i] = f"Unnamed {count}"
            count += 1

    return index
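

# Illustrative behaviour (not part of the original module): unnamed Series
# get positional "Unnamed N" labels while named ones keep their own name.
#
#   >>> import pandas as pd
#   >>> get_names_from_index([pd.Series([1], name="a"), pd.Series([1])])
#   ['a', 'Unnamed 0']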


def _get_axes(N, K, index, columns):
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = ibase.default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = ibase.default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns


# ---------------------------------------------------------------------
# Conversion of Inputs to Arrays


def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
    elif isinstance(data[0], abc.Mapping):
        return _list_of_dict_to_arrays(
            data, columns, coerce_float=coerce_float, dtype=dtype
        )
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(
            data, columns, coerce_float=coerce_float, dtype=dtype
        )
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns
    elif (
        isinstance(data, (np.ndarray, ABCSeries, Index))
        and data.dtype.names is not None
    ):

        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
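

# Illustrative behaviour (not part of the original module): list-of-tuples
# input is transposed into per-column arrays, with a default RangeIndex for
# the column names.
#
#   >>> arrs, cols = to_arrays([(1, "x"), (2, "y")], columns=None)
#   >>> [a.tolist() for a in arrs]
#   [[1, 2], ['x', 'y']]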


def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    if len(data) > 0 and isinstance(data[0], tuple):
        content = list(lib.to_object_array_tuples(data).T)
    else:
        # list of lists
        content = list(lib.to_object_array(data).T)
    # gh-26429 do not raise user-facing AssertionError
    try:
        result = _convert_object_array(
            content, columns, dtype=dtype, coerce_float=coerce_float
        )
    except AssertionError as e:
        raise ValueError(e) from e
    return result


def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    indexer_cache = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = com.values_from_object(s)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        content = list(values.T)
        return _convert_object_array(
            content, columns, dtype=dtype, coerce_float=coerce_float
        )
    else:
        return values.T, columns
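

# Illustrative behaviour (not part of the original module): rows that are
# Series are aligned on the union of their indexes, filling gaps with NaN.
#
#   >>> import pandas as pd
#   >>> vals, cols = _list_of_series_to_arrays(
#   ...     [pd.Series({"a": 1}), pd.Series({"b": 2})], columns=None
#   ... )
#   >>> list(cols)
#   ['a', 'b']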


def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Convert a list of dicts to numpy arrays.

    If `columns` is not passed, column names are inferred from the records:
    - for OrderedDicts and dicts, the column names match
      the key insertion order from the first record to the last;
    - for other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns : iterable or None
    coerce_float : bool
    dtype : np.dtype

    Returns
    -------
    tuple
        arrays, columns
    """
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, dict) for d in data)
        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [(type(d) is dict) and d or dict(d) for d in data]

    content = list(lib.dicts_to_array(data, list(columns)).T)
    return _convert_object_array(
        content, columns, dtype=dtype, coerce_float=coerce_float
    )
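

# Illustrative behaviour (not part of the original module): with plain
# dicts, the inferred column order follows key insertion order across the
# records; missing keys become NaN cells.
#
#   >>> arrs, cols = _list_of_dict_to_arrays([{"b": 1}, {"a": 2}], columns=None)
#   >>> list(cols)
#   ['b', 'a']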


def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = ibase.default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns
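

# Illustrative behaviour (not part of the original module): object columns
# receive a soft dtype-inference pass unless an object dtype was requested.
#
#   >>> arrs, _ = _convert_object_array([np.array([1, 2], dtype=object)], None)
#   >>> arrs[0].dtype
#   dtype('int64')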


# ---------------------------------------------------------------------
# Series-Based


def sanitize_index(data, index, copy=False):
    """
    Sanitize an index type to return an ndarray of the underlying; pass
    through a non-Index.
    """
    if index is None:
        return data

    if len(data) != len(index):
        raise ValueError("Length of values does not match length of index")

    if isinstance(data, ABCIndexClass) and not copy:
        pass
    elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):
        data = data._values
        if copy:
            data = data.copy()

    elif isinstance(data, np.ndarray):

        # coerce datetimelike types
        if data.dtype.kind in ["M", "m"]:
            data = sanitize_array(data, index, copy=copy)

    return data
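

# Illustrative behaviour (not part of the original module): the length check
# rejects mismatched inputs up front.
#
#   >>> sanitize_index([1, 2], ensure_index(["a"]))
#   Traceback (most recent call last):
#       ...
#   ValueError: Length of values does not match length of index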