Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Functions for preparing various inputs passed to the DataFrame or Series 

3constructors before passing them to a BlockManager. 

4""" 

5from collections import abc 

6 

7import numpy as np 

8import numpy.ma as ma 

9 

10from pandas._libs import lib 

11 

12from pandas.core.dtypes.cast import ( 

13 construct_1d_arraylike_from_scalar, 

14 maybe_cast_to_datetime, 

15 maybe_convert_platform, 

16 maybe_infer_to_datetimelike, 

17 maybe_upcast, 

18) 

19from pandas.core.dtypes.common import ( 

20 is_categorical_dtype, 

21 is_datetime64tz_dtype, 

22 is_dtype_equal, 

23 is_extension_array_dtype, 

24 is_integer_dtype, 

25 is_list_like, 

26 is_object_dtype, 

27) 

28from pandas.core.dtypes.generic import ( 

29 ABCDataFrame, 

30 ABCDatetimeIndex, 

31 ABCIndexClass, 

32 ABCPeriodIndex, 

33 ABCSeries, 

34 ABCTimedeltaIndex, 

35) 

36 

37from pandas.core import algorithms, common as com 

38from pandas.core.arrays import Categorical 

39from pandas.core.construction import sanitize_array 

40from pandas.core.indexes import base as ibase 

41from pandas.core.indexes.api import ( 

42 Index, 

43 ensure_index, 

44 get_objs_combined_axis, 

45 union_indexes, 

46) 

47from pandas.core.internals import ( 

48 create_block_manager_from_arrays, 

49 create_block_manager_from_blocks, 

50) 

51 

52# --------------------------------------------------------------------- 

53# BlockManager Interface 

54 

55 

def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.

    Needs to handle a lot of exceptional cases.
    """
    # Resolve the row index: infer it from the data when the caller did
    # not supply one, otherwise just normalize to an Index.
    index = extract_index(arrays) if index is None else ensure_index(index)

    # Conform every array to the shared index/dtype; no forced copy since
    # the values get jammed into an ndarray downstream anyway.
    homogenized = _homogenize(arrays, index, dtype)

    # From the BlockManager's perspective the axes are [columns, index].
    return create_block_manager_from_arrays(
        homogenized, arr_names, [ensure_index(columns), index]
    )

75 

76 

def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.

    Parameters
    ----------
    data : numpy MaskedArray with a structured (record) dtype
    index : index-like or None
        When None, derived from the record names or a default RangeIndex.
    columns : index-like or None
    dtype : dtype or None
        Forwarded to arrays_to_mgr.
    copy : bool
        If True, the resulting manager is a copy.
    """

    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed: write each column's fill value into its masked slots
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        # non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            # upcast first so the fill value fits (e.g. int -> float for NaN)
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager, reordering arrays to the requested column order
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr

117 

118 

119# --------------------------------------------------------------------- 

120# DataFrame Constructor Interface 

121 

122 

def init_ndarray(values, index, columns, dtype=None, copy=False):
    """
    Build a BlockManager from ndarray-like input.

    Parameters
    ----------
    values : ndarray, list, Series or Index
    index : index-like or None
        Row labels; defaults are generated when None.
    columns : index-like or None
        Column labels; defaults are generated when None.
    dtype : dtype or None
        If given and not already the values' dtype, values are cast;
        a failed cast raises ValueError.
    copy : bool, default False
        Whether to copy the input values.

    Returns
    -------
    BlockManager

    Raises
    ------
    ValueError
        If casting to ``dtype`` fails.
    """
    # input must be a ndarray, list, Series, index

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = [values.name]
        if index is None:
            index = values.index
        else:
            # align the Series to the requested index
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(
        dtype
    ):

        if not hasattr(values, "dtype"):
            # raw list-like: coerce to ndarray, then flatten to 1-D
            values = prep_ndarray(values, copy=copy)
            values = values.ravel()
        elif copy:
            values = values.copy()

        # a categorical is always a single column
        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
    elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            # multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = list(range(len(values)))
        return arrays_to_mgr(values, columns, index, columns, dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = prep_ndarray(values, copy=copy)

    if dtype is not None:
        if not is_dtype_equal(values.dtype, dtype):
            try:
                values = values.astype(dtype)
            except Exception as orig:
                # e.g. ValueError when trying to cast object dtype to float64
                raise ValueError(
                    f"failed to cast to '{dtype}' (Exception was: {orig})"
                ) from orig

    # transpose: the BlockManager stores columns along the first axis
    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
            for n in range(len(dvals_list)):
                if isinstance(dvals_list[n], np.ndarray):
                    # each row becomes its own 2-D single-row block
                    dvals_list[n] = dvals_list[n].reshape(1, -1)

            from pandas.core.internals.blocks import make_block

            # TODO: What about re-joining object columns?
            block_values = [
                make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            block_values = [datelike_vals]
    else:
        block_values = [values]

    return create_block_manager_from_blocks(block_values, [columns, index])

211 

212 

def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Parameters
    ----------
    data : dict mapping column label -> values
    index : index-like or None
    columns : index-like or None
        When given, restricts/reorders the columns; missing keys become
        all-NaN columns.
    dtype : dtype or None
    """
    if columns is not None:
        from pandas.core.series import Series

        # Reindex the dict against the requested columns; keys absent from
        # ``data`` show up as NaN placeholders in this object-dtype Series.
        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = np.dtype(object)
            else:
                nan_dtype = dtype
            # fill each missing column with an all-NaN array of index length
            val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:
        keys = list(data.keys())
        columns = data_names = Index(keys)
        arrays = (com.maybe_iterable_to_list(data[k]) for k in keys)
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndexClass) else arr._data for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
        ]
    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)

255 

256 

257# --------------------------------------------------------------------- 

258 

259 

def prep_ndarray(values, copy=True) -> np.ndarray:
    """
    Coerce list-like input to a 2-D ndarray.

    1-D input is promoted to a single-column 2-D array; more than two
    dimensions raises ValueError.

    Parameters
    ----------
    values : ndarray, Series, Index, range, or other list-like
    copy : bool, default True
        Only honored for ndarray/Series/Index input.

    Raises
    ------
    ValueError
        If the result would have more than 2 dimensions.
    """
    if not isinstance(values, (np.ndarray, ABCSeries, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)
        elif isinstance(values, range):
            # materialize a range as an int64 column vector
            arr = np.arange(values.start, values.stop, values.step, dtype="int64")
            return arr[..., np.newaxis]

        def convert(v):
            return maybe_convert_platform(v)

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        try:
            # NOTE(review): this tests for an attribute literally named
            # "len", not "__len__" -- presumably intentional legacy
            # behavior, but worth confirming
            if is_list_like(values[0]) or hasattr(values[0], "len"):
                values = np.array([convert(v) for v in values])
            elif isinstance(values[0], np.ndarray) and values[0].ndim == 0:
                # GH#21861
                values = np.array([convert(v) for v in values])
            else:
                values = convert(values)
        except (ValueError, TypeError):
            # fall back to converting the whole thing at once
            values = convert(values)

    else:

        # drop subclass info, do not copy data
        values = np.asarray(values)
        if copy:
            values = values.copy()

    if values.ndim == 1:
        # promote 1-D input to a single-column 2-D array
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError("Must pass 2-d input")

    return values

298 

299 

def _homogenize(data, index, dtype=None):
    """
    Conform each element of ``data`` (Series, dict, or array-like) to the
    given ``index`` and optional ``dtype``, returning a list of arrays.
    """
    # lazily-built object-dtype version of ``index``, used for dict lookups
    oindex = None
    homogenized = []

    for val in data:
        if isinstance(val, ABCSeries):
            if dtype is not None:
                val = val.astype(dtype)
            # identity check: skip reindexing when it is literally the
            # same index object
            if val.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                val = val.reindex(index, copy=False)
        else:
            if isinstance(val, dict):
                if oindex is None:
                    oindex = index.astype("O")

                if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)):
                    # normalize datetime-like dict keys to match the index
                    val = com.dict_compat(val)
                else:
                    val = dict(val)
                # pull the value for each index label, defaulting to NaN
                val = lib.fast_multiget(val, oindex.values, default=np.nan)
            val = sanitize_array(
                val, index, dtype=dtype, copy=False, raise_cast_failure=False
            )

        homogenized.append(val)

    return homogenized

329 

330 

def extract_index(data):
    """
    Infer a row index from a collection of Series / dicts / array-likes.

    Raises
    ------
    ValueError
        If only scalars were supplied, if raw array lengths disagree, if
        dicts are mixed with raw arrays, or if a raw array's length does
        not match the Series-derived index.
    """
    if len(data) == 0:
        return ensure_index(Index([]))

    indexes = []
    raw_lengths = []
    have_raw_arrays = have_series = have_dicts = False

    # Classify every element and collect the candidate indexes/lengths.
    for val in data:
        if isinstance(val, ABCSeries):
            have_series = True
            indexes.append(val.index)
        elif isinstance(val, dict):
            have_dicts = True
            indexes.append(list(val.keys()))
        elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
            have_raw_arrays = True
            raw_lengths.append(len(val))

    if not indexes and not raw_lengths:
        raise ValueError("If using all scalar values, you must pass an index")

    index = None
    if have_series:
        index = union_indexes(indexes)
    elif have_dicts:
        # dict keys keep their encounter order
        index = union_indexes(indexes, sort=False)

    if have_raw_arrays:
        unique_lengths = set(raw_lengths)
        if len(unique_lengths) > 1:
            raise ValueError("arrays must all be same length")

        if have_dicts:
            raise ValueError(
                "Mixing dicts with non-Series may lead to ambiguous ordering."
            )

        common_length = unique_lengths.pop()
        if have_series:
            # raw arrays must agree with the Series-derived index
            if common_length != len(index):
                raise ValueError(
                    f"array length {common_length} does not match index "
                    f"length {len(index)}"
                )
        else:
            index = ibase.default_index(common_length)

    return ensure_index(index)

383 

384 

def reorder_arrays(arrays, arr_columns, columns):
    """
    Reorder ``arrays``/``arr_columns`` to match the requested ``columns``.

    A no-op when either column listing is None or empty.
    """
    have_requested = columns is not None and len(columns)
    have_actual = arr_columns is not None and len(arr_columns)
    if have_requested and have_actual:
        indexer = ensure_index(arr_columns).get_indexer(columns)
        arr_columns = ensure_index([arr_columns[i] for i in indexer])
        arrays = [arrays[i] for i in indexer]
    return arrays, arr_columns

397 

398 

def get_names_from_index(data):
    """
    Derive index labels from the ``name`` attributes of the given objects.

    Returns a default RangeIndex when nothing is named; otherwise a list
    where unnamed entries become "Unnamed {i}" with a running counter.
    """
    if all(getattr(s, "name", None) is None for s in data):
        return ibase.default_index(len(data))

    index = []
    unnamed_count = 0
    for s in data:
        name = getattr(s, "name", None)
        if name is None:
            name = f"Unnamed {unnamed_count}"
            unnamed_count += 1
        index.append(name)

    return index

415 

416 

def _get_axes(N, K, index, columns):
    """
    Return ``(index, columns)`` as Index objects.

    ``N`` is the number of rows, ``K`` the number of columns; a default
    RangeIndex is substituted for any axis passed as None.
    """
    index = ibase.default_index(N) if index is None else ensure_index(index)
    columns = ibase.default_index(K) if columns is None else ensure_index(columns)
    return index, columns

431 

432 

433# --------------------------------------------------------------------- 

434# Conversion of Inputs to Arrays 

435 

436 

def to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.

    Dispatches on the type of ``data``: DataFrame, empty input, list of
    tuples/lists, list of dicts, list of Series, list of Categoricals,
    structured (record) ndarray, or a last-ditch tuple conversion.
    """
    if isinstance(data, ABCDataFrame):
        if columns is not None:
            # keep only the requested columns, in the frame's own order
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            # a structured ndarray still carries its field names even when empty
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
    elif isinstance(data[0], abc.Mapping):
        return _list_of_dict_to_arrays(
            data, columns, coerce_float=coerce_float, dtype=dtype
        )
    elif isinstance(data[0], ABCSeries):
        return _list_of_series_to_arrays(
            data, columns, coerce_float=coerce_float, dtype=dtype
        )
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = ibase.default_index(len(data))
        return data, columns
    elif (
        isinstance(data, (np.ndarray, ABCSeries, Index))
        and data.dtype.names is not None
    ):
        # record array / structured dtype: one array per named field
        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)

486 

487 

def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Convert a list of tuples or lists into ``(arrays, columns)``.
    """
    if len(data) > 0 and isinstance(data[0], tuple):
        rows = lib.to_object_array_tuples(data)
    else:
        # list of lists
        rows = lib.to_object_array(data)
    content = list(rows.T)

    # gh-26429 do not raise user-facing AssertionError
    try:
        return _convert_object_array(
            content, columns, dtype=dtype, coerce_float=coerce_float
        )
    except AssertionError as e:
        raise ValueError(e) from e

502 

503 

def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Align a list of Series-like objects to common columns and convert
    to ``(arrays, columns)``.
    """
    if columns is None:
        # We know pass_data is non-empty because data[0] is a Series
        pass_data = [x for x in data if isinstance(x, (ABCSeries, ABCDataFrame))]
        columns = get_objs_combined_axis(pass_data, sort=False)

    # cache indexers keyed on index identity, so a shared index object
    # is only resolved against ``columns`` once
    indexer_cache = {}

    aligned_values = []
    for s in data:
        index = getattr(s, "index", None)
        if index is None:
            index = ibase.default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = com.values_from_object(s)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        # soft-convert object columns (e.g. embedded datetimelikes)
        content = list(values.T)
        return _convert_object_array(
            content, columns, dtype=dtype, coerce_float=coerce_float
        )
    else:
        return values.T, columns

535 

536 

def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    """Convert list of dicts to numpy arrays

    if `columns` is not passed, column names are inferred from the records
    - for OrderedDict and dicts, the column names match
      the key insertion-order from the first record to the last.
    - For other kinds of dict-likes, the keys are lexically sorted.

    Parameters
    ----------
    data : iterable
        collection of records (OrderedDict, dict)
    columns: iterables or None
    coerce_float : bool
    dtype : np.dtype

    Returns
    -------
    tuple
        arrays, columns
    """

    if columns is None:
        gen = (list(x.keys()) for x in data)
        # only sort the union of keys when no plain dicts are present
        sort = not any(isinstance(d, dict) for d in data)
        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

    # assure that they are of the base dict class and not of derived
    # classes
    # NOTE: the previous ``(type(d) is dict) and d or dict(d)`` and/or idiom
    # misfired on falsy values (an empty dict fell through to the copy
    # branch); the conditional expression below avoids that.
    data = [d if type(d) is dict else dict(d) for d in data]

    content = list(lib.dicts_to_array(data, list(columns)).T)
    return _convert_object_array(
        content, columns, dtype=dtype, coerce_float=coerce_float
    )

572 

573 

def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    """
    Soft-convert a list of object-dtype column arrays, returning
    ``(arrays, columns)``.

    Parameters
    ----------
    content : list of 1-D object ndarrays, one per column
    columns : index-like or None
        When None, a default RangeIndex is generated; otherwise its
        length must match ``len(content)``.
    coerce_float : bool
        Attempt conversion of object values to float.
    dtype : dtype or None
        When object dtype is requested, conversion is skipped entirely.

    Raises
    ------
    AssertionError
        If ``columns`` is passed with a mismatched length (it is the
        caller's responsibility to validate this).
    """
    if columns is None:
        columns = ibase.default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError(
                f"{len(columns)} columns passed, passed data had "
                f"{len(content)} columns"
            )

    # provide soft conversion of object dtypes
    def convert(arr):
        # ``np.object`` was merely a deprecated alias of the builtin
        # ``object`` (removed in NumPy >= 1.24), so the single comparison
        # against ``object`` is equivalent to the old double check and
        # keeps this working on modern NumPy.
        if dtype != object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns

595 

596 

597# --------------------------------------------------------------------- 

598# Series-Based 

599 

600 

def sanitize_index(data, index, copy=False):
    """
    Sanitize an index type to return an ndarray of the underlying, pass
    through a non-Index.

    Parameters
    ----------
    data : array-like, Index, or arbitrary object (passed through)
    index : index-like or None
        When None, ``data`` is returned unchanged.
    copy : bool, default False

    Raises
    ------
    ValueError
        If ``len(data)`` does not match ``len(index)``.
    """

    if index is None:
        return data

    if len(data) != len(index):
        raise ValueError("Length of values does not match length of index")

    if isinstance(data, ABCIndexClass) and not copy:
        # Index without a copy requested: pass through unchanged
        pass
    elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):
        # unwrap to the underlying array
        data = data._values
        if copy:
            data = data.copy()

    elif isinstance(data, np.ndarray):

        # coerce datetimelike types
        if data.dtype.kind in ["M", "m"]:
            data = sanitize_array(data, index, copy=copy)

    return data