Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/tools/tools.py : 10%

1"""
2Utility functions models code
3"""
4import numpy as np
5import numpy.lib.recfunctions as nprf
6import pandas as pd
8from statsmodels.compat.python import lzip, lmap
10from statsmodels.tools.data import _is_using_pandas, _is_recarray
11from statsmodels.tools.validation import array_like


def asstr2(s):
    if isinstance(s, str):
        return s
    elif isinstance(s, bytes):
        return s.decode('latin1')
    else:
        return str(s)


def _make_dictnames(tmp_arr, offset=0):
    """
    Helper function to create a dictionary mapping a column number
    to the name in tmp_arr.
    """
    col_map = {}
    for i, col_name in enumerate(tmp_arr):
        col_map[i + offset] = col_name
    return col_map


def drop_missing(Y, X=None, axis=1):
    """
    Returns views on the arrays Y and X where missing observations are dropped.

    Parameters
    ----------
    Y : array_like
    X : array_like, optional
    axis : int
        Axis along which to look for missing observations. Default is 1, ie.,
        observations in rows.

    Returns
    -------
    Y : ndarray
        The rows of Y that contain no missing observations (in Y or, when
        given, in X).
    X : ndarray
        The corresponding rows of X, returned only when X is provided.

    Notes
    -----
    If either Y or X is 1d, it is reshaped to be 2d.
    """
    Y = np.asarray(Y)
    if Y.ndim == 1:
        Y = Y[:, None]
    if X is not None:
        X = np.array(X)
        if X.ndim == 1:
            X = X[:, None]
        keepidx = np.logical_and(~np.isnan(Y).any(axis),
                                 ~np.isnan(X).any(axis))
        return Y[keepidx], X[keepidx]
    else:
        keepidx = ~np.isnan(Y).any(axis)
        return Y[keepidx]
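

# Illustrative usage sketch (not part of the original module), using the
# module-level numpy import; the sample arrays below are made up. Rows with a
# NaN in either Y or X are dropped from both.
#   >>> Y = np.array([[1.0], [np.nan], [3.0]])
#   >>> X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, np.nan]])
#   >>> drop_missing(Y, X)  # -> (array([[1.]]), array([[1., 2.]]))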


# TODO: needs to better preserve dtype and be more flexible
# ie., if you still have a string variable in your array you do not
# want to cast it to float
# TODO: add name validator (ie., bad names for datasets.grunfeld)
def categorical(data, col=None, dictnames=False, drop=False):
    """
    Construct a dummy matrix from categorical variables

    Parameters
    ----------
    data : array_like
        A structured array, recarray, array, Series or DataFrame. This can be
        either a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a DataFrame, col must be a column of data. If data is a
        Series, col must be either the name of the Series or None. If data is
        a structured array or a recarray, `col` can be a string that is the
        name of the column that contains the variable. For all other
        arrays `col` can be an int that is the (zero-based) column index
        number. `col` can only be None for a 1d array. The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned. Useful for retaining information about plain
        arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned
        matrix.

    Returns
    -------
    dummy_matrix : array_like
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.
    dictnames : dict[int, str], optional
        Mapping between column numbers and categorical names.

    Notes
    -----
    This returns a dummy variable for *each* distinct value of the
    categorical variable. If a structured array or recarray is provided, the
    names of the new variables are the old variable name, an underscore, and
    the category name. So if a variable 'vote' had answers 'yes' or 'no', the
    returned array would have two new variables -- 'vote_yes' and 'vote_no'.
    There is currently no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5],
    ...               string.ascii_lowercase[5:10],
    ...               string.ascii_lowercase[10:15],
    ...               string.ascii_lowercase[15:20],
    ...               string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10, 60, step=2)/10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25, 2)
    >>> struct_ar = np.zeros((25, 1),
    ...                      dtype=[('var1', 'f4'), ('var2', 'f4'),
    ...                             ('instrument', 'f4'), ('str_instr', 'a5')])
    >>> struct_ar['var1'] = num[:, 0][:, None]
    >>> struct_ar['var2'] = num[:, 1][:, None]
    >>> struct_ar['instrument'] = instr[:, None]
    >>> struct_ar['str_instr'] = string_var[:, None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    """
    # TODO: add a NameValidator function
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")
    if (not isinstance(data, (pd.DataFrame, pd.Series)) and
            not isinstance(col, (str, int)) and
            col is not None):
        raise TypeError('col must be a str, int or None')

    # Pull out a Series from a DataFrame if provided
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise TypeError('col must be a str or int when using a DataFrame')
        elif col not in data:
            raise ValueError('Column \'{0}\' not found in data'.format(col))
        data = data[col]
        # Set col to None since we now have a Series
        col = None

    if isinstance(data, pd.Series):
        if col is not None and data.name != col:
            raise ValueError('data.name does not match col '
                             '\'{0}\''.format(col))
        data_cat = pd.Categorical(data)
        dummies = pd.get_dummies(data_cat)
        col_map = {i: cat for i, cat in enumerate(data_cat.categories) if
                   cat in dummies}
        if not drop:
            dummies.columns = list(dummies.columns)
            dummies = pd.concat([dummies, data], axis=1)
        if dictnames:
            return dummies, col_map
        return dummies
    # catch recarrays and structured arrays
    elif data.dtype.names or data.__class__ is np.recarray:
        # deprecated: remove path after 0.12
        import warnings
        from statsmodels.tools.sm_exceptions import recarray_warning
        warnings.warn(recarray_warning, FutureWarning)
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, int):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore, if col is numeric attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except:
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # Catch array_like for an error
    elif not isinstance(data, np.ndarray):
        raise NotImplementedError("array_like objects are not supported")
    else:
        if isinstance(col, int):
            offset = data.shape[1]  # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)
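

# Illustrative usage sketch (not part of the original module): the docstring
# above covers structured arrays; this shows the plain-ndarray path with
# ``dictnames=True``, using made-up data. The returned mapping links dummy
# column positions to category values.
#   >>> x = np.array([1.0, 1.0, 2.0, 2.0, 3.0])
#   >>> dummies, names = categorical(x, drop=True, dictnames=True)
#   >>> dummies.shape  # -> (5, 3)
#   >>> names          # -> {0: 1.0, 1: 2.0, 2: 3.0}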


# TODO: add an axis argument to this for sysreg
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If True, the constant is in the first column. Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant. The default will return
        data without adding another constant. If 'raise', will raise an
        error if any column has a constant value. Using 'add' will add a
        column of 1s if a constant column is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column. Returned value type depends on input type.

    Notes
    -----
    When the input is a recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        if _is_recarray(data):
            # deprecated: remove recarray support after 0.12
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
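

# Illustrative usage sketch (not part of the original module), with a made-up
# design matrix: the constant is prepended by default and appended when
# ``prepend=False``; a column that is already constant is left alone under the
# default ``has_constant='skip'``.
#   >>> x = np.arange(6.0).reshape(3, 2)
#   >>> add_constant(x)[:, 0]                  # -> array([1., 1., 1.])
#   >>> add_constant(x, prepend=False)[:, -1]  # -> array([1., 1., 1.])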


def isestimable(c, d):
    """
    True if (Q, P) contrast `c` is estimable for (N, P) design `d`.

    From a Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
    the contrast `C` is estimable by looking at the rank of ``vstack([C, D])``
    and verifying it is the same as the rank of `D`.

    Parameters
    ----------
    c : array_like
        A contrast matrix with shape (Q, P). If 1 dimensional assume shape is
        (1, P).
    d : array_like
        The design matrix, (N, P).

    Returns
    -------
    bool
        True if the contrast `c` is estimable on design `d`.

    Examples
    --------
    >>> d = np.array([[1, 1, 1, 0, 0, 0],
    ...               [0, 0, 0, 1, 1, 1],
    ...               [1, 1, 1, 1, 1, 1]]).T
    >>> isestimable([1, 0, 0], d)
    False
    >>> isestimable([1, -1, 0], d)
    True
    """
    c = array_like(c, 'c', maxdim=2)
    d = array_like(d, 'd', ndim=2)
    c = c[None, :] if c.ndim == 1 else c
    if c.shape[1] != d.shape[1]:
        raise ValueError('Contrast should have %d columns' % d.shape[1])
    new = np.vstack([c, d])
    if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
        return False
    return True


def pinv_extended(X, rcond=1e-15):
    """
    Return the pinv of an array X as well as the singular values
    used in computation.

    Code adapted from numpy.
    """
    X = np.asarray(X)
    X = X.conjugate()
    u, s, vt = np.linalg.svd(X, 0)
    s_orig = np.copy(s)
    m = u.shape[0]
    n = vt.shape[1]
    cutoff = rcond * np.maximum.reduce(s)
    for i in range(min(n, m)):
        if s[i] > cutoff:
            s[i] = 1. / s[i]
        else:
            s[i] = 0.
    res = np.dot(np.transpose(vt), np.multiply(s[:, np.newaxis],
                                               np.transpose(u)))
    return res, s_orig
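

# Illustrative check (not part of the original module): on a made-up matrix
# the result matches numpy's pinv, and the second return value holds the
# original singular values of X.
#   >>> X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
#   >>> pinv, sv = pinv_extended(X)
#   >>> np.allclose(pinv, np.linalg.pinv(X))  # -> True
#   >>> sv.shape                              # -> (2,)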


def recipr(x):
    """
    Reciprocal of an array with entries less than or equal to 0 set to 0.

    Parameters
    ----------
    x : array_like
        The input array.

    Returns
    -------
    ndarray
        The array with 0-filled reciprocals.
    """
    x = np.asarray(x)
    out = np.zeros_like(x, dtype=np.float64)
    nans = np.isnan(x.flat)
    pos = ~nans
    pos[pos] = pos[pos] & (x.flat[pos] > 0)
    out.flat[pos] = 1.0 / x.flat[pos]
    out.flat[nans] = np.nan
    return out
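

# Illustrative usage sketch (not part of the original module): positive
# entries are inverted, non-positive entries map to 0, and NaNs are kept.
# recipr0 below differs in that negative entries keep their reciprocal; only
# exact zeros map to 0.
#   >>> recipr(np.array([2.0, -2.0, 0.0, np.nan]))  # -> array([0.5, 0., 0., nan])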


def recipr0(x):
    """
    Reciprocal of an array, with entries equal to 0 set to 0 in the result.

    Parameters
    ----------
    x : array_like
        The input array.

    Returns
    -------
    ndarray
        The array with 0-filled reciprocals.
    """
    x = np.asarray(x)
    out = np.zeros_like(x, dtype=np.float64)
    nans = np.isnan(x.flat)
    non_zero = ~nans
    non_zero[non_zero] = non_zero[non_zero] & (x.flat[non_zero] != 0)
    out.flat[non_zero] = 1.0 / x.flat[non_zero]
    out.flat[nans] = np.nan
    return out


def clean0(matrix):
    """
    Erase columns of zeros: can save some time in pseudoinverse.

    Parameters
    ----------
    matrix : ndarray
        The array to clean.

    Returns
    -------
    ndarray
        The cleaned array.
    """
    colsum = np.add.reduce(matrix**2, 0)
    val = [matrix[:, i] for i in np.flatnonzero(colsum)]
    return np.array(np.transpose(val))
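

# Illustrative usage sketch (not part of the original module): the all-zero
# middle column of a made-up matrix is dropped; other columns keep their
# order.
#   >>> m = np.array([[1.0, 0.0, 2.0], [3.0, 0.0, 4.0]])
#   >>> clean0(m)  # -> array([[1., 2.], [3., 4.]])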


def fullrank(x, r=None):
    """
    Return an array whose column span is the same as x.

    Parameters
    ----------
    x : ndarray
        The array to adjust, 2d.
    r : int, optional
        The rank of x. If not provided, determined by `np.linalg.matrix_rank`.

    Returns
    -------
    ndarray
        The array adjusted to have full rank.

    Notes
    -----
    If the rank of x is known it can be specified as r -- no check
    is made to ensure that this really is the rank of x.
    """
    if r is None:
        r = np.linalg.matrix_rank(x)

    v, d, u = np.linalg.svd(x, full_matrices=False)
    order = np.argsort(d)
    order = order[::-1]
    value = []
    for i in range(r):
        value.append(v[:, order[i]])
    return np.asarray(np.transpose(value)).astype(np.float64)
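

# Illustrative usage sketch (not part of the original module): a made-up
# rank-deficient matrix (third column equals the sum of the first two) is
# reduced to two columns spanning the same column space.
#   >>> x = np.array([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0],
#   ...               [1.0, 1.0, 2.0], [2.0, 0.0, 2.0]])
#   >>> fullrank(x).shape                   # -> (4, 2)
#   >>> np.linalg.matrix_rank(fullrank(x))  # -> 2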


def unsqueeze(data, axis, oldshape):
    """
    Unsqueeze a collapsed array.

    Parameters
    ----------
    data : ndarray
        The data to unsqueeze.
    axis : int
        The axis to unsqueeze.
    oldshape : tuple[int]
        The original shape before the squeeze or reduce operation.

    Returns
    -------
    ndarray
        The unsqueezed array.

    Examples
    --------
    >>> from numpy import mean
    >>> from numpy.random import standard_normal
    >>> x = standard_normal((3, 4, 5))
    >>> m = mean(x, axis=1)
    >>> m.shape
    (3, 5)
    >>> m = unsqueeze(m, 1, x.shape)
    >>> m.shape
    (3, 1, 5)
    """
    newshape = list(oldshape)
    newshape[axis] = 1
    return data.reshape(newshape)


def nan_dot(A, B):
    """
    Returns np.dot(A, B) with the convention that
    nan * 0 = 0 and nan * x = nan if x != 0.

    Parameters
    ----------
    A, B : ndarray
    """
    # Find out which entries should be nan due to nan * nonzero
    should_be_nan_1 = np.dot(np.isnan(A), (B != 0))
    should_be_nan_2 = np.dot((A != 0), np.isnan(B))
    should_be_nan = should_be_nan_1 + should_be_nan_2

    # Multiply after setting all nan to 0
    # This is what happens if there were no nan * nonzero conflicts
    C = np.dot(np.nan_to_num(A), np.nan_to_num(B))

    C[should_be_nan] = np.nan

    return C
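

# Illustrative usage sketch (not part of the original module), with made-up
# matrices: the NaN in A only propagates where it is multiplied by a nonzero
# entry of B.
#   >>> A = np.array([[np.nan, 0.0], [1.0, 2.0]])
#   >>> B = np.array([[0.0, 1.0], [1.0, 1.0]])
#   >>> nan_dot(A, B)  # -> array([[0., nan], [2., 3.]])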


def maybe_unwrap_results(results):
    """
    Gets raw results back from wrapped results.

    Can be used in plotting functions or other post-estimation type
    routines.
    """
    return getattr(results, '_results', results)


class Bunch(dict):
    """
    Returns a dict-like object with keys accessible via attribute lookup.

    Parameters
    ----------
    *args
        Arguments passed to dict constructor, tuples (key, value).
    **kwargs
        Keyword arguments passed to dict constructor, key=value.
    """
    def __init__(self, *args, **kwargs):
        super(Bunch, self).__init__(*args, **kwargs)
        self.__dict__ = self
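

# Illustrative usage sketch (not part of the original module): attribute
# access and item access hit the same underlying dict.
#   >>> b = Bunch(a=1, b=2)
#   >>> b.a        # -> 1
#   >>> b['c'] = 3
#   >>> b.c        # -> 3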


def _ensure_2d(x, ndarray=False):
    """
    Parameters
    ----------
    x : ndarray, Series, DataFrame or None
        Input to verify dimensions, and to transform as necessary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting False
        will return a pandas DataFrame when the input is a Series or a
        DataFrame.

    Returns
    -------
    out : ndarray, DataFrame or None
        array or DataFrame with 2 dimensions. One dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variable names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    """
    if x is None:
        return x
    is_pandas = _is_using_pandas(x, None)
    if x.ndim == 2:
        if is_pandas:
            return x, x.columns
        else:
            return x, None
    elif x.ndim > 2:
        raise ValueError('x must be 1 or 2-dimensional.')

    name = x.name if is_pandas else None
    if ndarray:
        return np.asarray(x)[:, None], name
    else:
        return pd.DataFrame(x), name
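

# Illustrative usage sketch (not part of the original module), using the
# module-level pandas import: a named Series gains a second dimension and its
# name is returned alongside it.
#   >>> s = pd.Series([1.0, 2.0, 3.0], name='y')
#   >>> out, name = _ensure_2d(s)
#   >>> out.shape, name  # -> ((3, 1), 'y')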