Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/tools/grouputils.py : 19%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""Tools for working with groups
4This provides several functions to work with groups and a Group class that
5keeps track of the different representations and has methods to work more
6easily with groups.
9Author: Josef Perktold,
10Author: Nathaniel Smith, recipe for sparse_dummies on scipy user mailing list
12Created on Tue Nov 29 15:44:53 2011 : sparse_dummies
13Created on Wed Nov 30 14:28:24 2011 : combine_indices
14changes: add Group class
16Notes
17~~~~~
19This reverses the class I used before, where the class was for the data and
20the group was auxiliary. Here, it is only the group, no data is kept.
22sparse_dummies needs checking for corner cases, e.g.
23what if a category level has zero elements? This can happen with subset
24 selection even if the original groups were defined as arange.
26Not all methods and options have been tried out yet after refactoring
28need more efficient loop if groups are sorted -> see GroupSorted.group_iter
29"""
30from statsmodels.compat.python import lrange, lzip
31import numpy as np
32import pandas as pd
34import statsmodels.tools.data as data_util
35from pandas import Index, MultiIndex
def combine_indices(groups, prefix='', sep='.', return_labels=False):
    """Map one or several group arrays to integer codes via np.unique.

    Returns (codes, first_occurrence_index, unique_groups) and, when
    ``return_labels`` is true, additionally a list of string labels built
    as ``prefix + sep.join(columns)`` for each unique (multi-column) group.
    """
    if isinstance(groups, tuple):
        groups = np.column_stack(groups)
    else:
        groups = np.asarray(groups)

    original_dtype = groups.dtype
    is2d = groups.ndim == 2  # remember so the unique rows can be unpacked

    if is2d:
        ncols = groups.shape[1]
        # the structured view below requires C-contiguous memory
        if not groups.flags.c_contiguous:
            groups = np.array(groups, order='C')
        # view each row as one structured scalar so np.unique compares rows
        keys = groups.view([('', groups.dtype)] * ncols)
    else:
        keys = groups

    uni, uni_idx, uni_inv = np.unique(keys, return_index=True,
                                      return_inverse=True)

    if is2d:
        # undo the structured view: back to a (n_unique, ncols) array
        uni = uni.view(original_dtype).reshape(-1, ncols)

    if not return_labels:
        return uni_inv, uni_idx, uni

    fmt = prefix + sep.join(['%s'] * len(uni[0]))
    labels = [fmt % tuple(row) for row in uni]
    return uni_inv, uni_idx, uni, labels
80# written for and used in try_covariance_grouploop.py
def group_sums(x, group, use_bincount=True):
    """Sum the rows of ``x`` within each group.

    group : ndarray, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    With ``use_bincount`` the sum is computed column by column with
    np.bincount; otherwise a plain python loop over the unique group
    values is used (kept for comparison).
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if not use_bincount:
        levels = np.unique(group)
        out = np.zeros([len(levels)] + list(x.shape[1:]))
        for pos, level in enumerate(levels):
            out[pos] = x[group == level].sum(0)
        return out

    # re-label groups or bincount takes too much memory
    if np.max(group) > 2 * x.shape[0]:
        group = pd.factorize(group)[0]
    return np.array([np.bincount(group, weights=x[:, col])
                     for col in range(x.shape[1])])
def group_sums_dummy(x, group_dummy):
    """sum by groups given group dummy variable

    group_dummy can be either ndarray or sparse matrix
    """
    if not data_util._is_using_ndarray_type(group_dummy, None):
        # sparse matrix: `*` is matrix multiplication here
        return x.T * group_dummy
    return np.dot(x.T, group_dummy)
126# TODO: See if this can be entirely replaced by Grouping.dummy_sparse;
127# see GH#5687
def dummy_sparse(groups):
    """create a sparse indicator from a group array with integer labels

    Parameters
    ----------
    groups: ndarray, int, 1d (nobs,)
        an array of group indicators for each observation. Group levels are
        assumed to be defined as consecutive integers, i.e. range(n_groups)
        where n_groups is the number of group levels. A group level with no
        observations for it will still produce a column of zeros.

    Returns
    -------
    indi : ndarray, int8, 2d (nobs, n_groups)
        an indicator array with one row per observation, that has 1 in the
        column of the group level for that observation

    Examples
    --------

    >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi
    <7x3 sparse matrix of type '<type 'numpy.int8'>'
        with 7 stored elements in Compressed Sparse Row format>
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)

    current behavior with missing groups
    >>> g = np.array([0, 0, 2, 0, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)
    """
    from scipy import sparse

    nobs = len(groups)
    # one stored entry per row: value 1 in column `groups[i]` of row i
    values = np.ones(nobs, dtype=np.int8)
    row_pointer = np.arange(nobs + 1)
    return sparse.csr_matrix((values, groups, row_pointer))
class Group(object):
    """Represent a grouping variable.

    Keeps the original group values (``group``), the integer codes
    (``group_int``), the unique group values (``uni``) and the index of
    the first occurrence of each unique value (``uni_idx``), plus helper
    methods for counts, labels, dummy variables and group-wise sums.
    """

    def __init__(self, group, name=''):
        # store the original values; dummy() and interaction() need them
        # (previously commented out, which made dummy() raise AttributeError)
        self.group = np.asarray(group)  # TODO: use checks in combine_indices
        self.name = name
        # combine_indices returns (codes, first_occurrence_idx, unique_values)
        uni_inv, uni_idx, uni = combine_indices(group)

        # TODO: rename these to something easier to remember
        self.group_int, self.uni_idx, self.uni = uni_inv, uni_idx, uni

        self.n_groups = len(self.uni)

        # put this here so they can be overwritten before calling labels
        self.separator = '.'
        self.prefix = self.name
        if self.prefix:
            self.prefix = self.prefix + '='

    # cache decorator
    def counts(self):
        """Return the number of observations in each group."""
        return np.bincount(self.group_int)

    # cache_decorator
    def labels(self):
        """Return a string label for each unique group."""
        # is this only needed for product of groups (intersection)?
        prefix = self.prefix
        uni = self.uni
        sep = self.separator

        if uni.ndim > 1:
            label = [(prefix+sep.join(['%s']*len(uni[0]))) % tuple(ii)
                     for ii in uni]
        else:
            label = [prefix + '%s' % ii for ii in uni]
        return label

    def dummy(self, drop_idx=None, sparse=False, dtype=int):
        """Create a dummy (indicator) matrix for the groups.

        drop_idx is only available if sparse=False

        drop_idx is supposed to index into uni
        """
        uni = self.uni
        if drop_idx is not None:
            idx = lrange(len(uni))
            del idx[drop_idx]
            uni = uni[idx]

        group = self.group

        if not sparse:
            return (group[:, None] == uni[None, :]).astype(dtype)
        else:
            return dummy_sparse(self.group_int)

    def interaction(self, other):
        """Return a new Group for the product (interaction) with another group."""
        if isinstance(other, self.__class__):
            other = other.group
        # pass the underlying arrays, not the Group instance itself, so
        # combine_indices can column_stack them
        return self.__class__((self.group, other))

    def group_sums(self, x, use_bincount=True):
        """Sum ``x`` within each group; see module-level ``group_sums``."""
        return group_sums(x, self.group_int, use_bincount=use_bincount)

    def group_demean(self, x, use_bincount=True):
        """Subtract per-group values from ``x``; returns (demeaned, means_g).

        NOTE(review): ``means_g`` divides the group sums by the total number
        of observations, not by each group's size — confirm this is the
        intended notion of "mean" for the callers.
        """
        nobs = float(len(x))
        means_g = group_sums(x / nobs, self.group_int,
                             use_bincount=use_bincount)
        x_demeaned = x - means_g[self.group_int]  # check reverse_index?
        return x_demeaned, means_g
class GroupSorted(Group):
    """Group whose observations are assumed pre-sorted by group, so each
    group occupies one contiguous slice of the data.
    """

    def __init__(self, group, name=''):
        # name the class explicitly in super(): super(self.__class__, ...)
        # recurses forever when called from a subclass instance
        super(GroupSorted, self).__init__(group, name=name)

        # positions where the group value changes; assumes sorted groups
        idx = (np.nonzero(np.diff(group))[0]+1).tolist()
        # (start, stop) pairs, one per contiguous group
        self.groupidx = lzip([0] + idx, idx + [len(group)])

    def group_iter(self):
        """Yield a slice object selecting each contiguous group in turn."""
        for low, upp in self.groupidx:
            yield slice(low, upp)

    def lag_indices(self, lag):
        """return the index array for lagged values

        Warning: if k is larger then the number of observations for an
        individual, then no values for that individual are returned.

        TODO: for the unbalanced case, I should get the same truncation for
        the array with lag=0. From the return of lag_idx we would not know
        which individual is missing.

        TODO: do I want the full equivalent of lagmat in tsa?
        maxlag or lag or lags.

        not tested yet
        """
        lag_idx = np.asarray(self.groupidx)[:, 1] - lag  # asarray or already?
        # NOTE(review): the mask compares against `lag`, not each group's
        # start index — verify this keeps only in-group observations for
        # unequal group sizes.
        mask_ok = (lag <= lag_idx)
        # still an observation that belongs to the same individual
        return lag_idx[mask_ok]
288def _is_hierarchical(x):
289 """
290 Checks if the first item of an array-like object is also array-like
291 If so, we have a MultiIndex and returns True. Else returns False.
292 """
293 item = x[0]
294 # is there a better way to do this?
295 if isinstance(item, (list, tuple, np.ndarray, pd.Series, pd.DataFrame)):
296 return True
297 else:
298 return False
301def _make_hierarchical_index(index, names):
302 return MultiIndex.from_tuples(*[index], names=names)
305def _make_generic_names(index):
306 n_names = len(index.names)
307 pad = str(len(str(n_names))) # number of digits
308 return [("group{0:0"+pad+"}").format(i) for i in range(n_names)]
class Grouping(object):
    """Hold a (possibly hierarchical) pandas index and provide group-wise
    helpers: integer labels per level, per-group slices, sorting, group-wise
    transforms and sparse dummy variables.
    """

    def __init__(self, index, names=None):
        """
        index : index-like
            Can be pandas MultiIndex or Index or array-like. If array-like
            and is a MultipleIndex (more than one grouping variable),
            groups are expected to be in each row. E.g., [('red', 1),
            ('red', 2), ('green', 1), ('green', 2)]
        names : list or str, optional
            The names to use for the groups. Should be a str if only
            one grouping variable is used.

        Notes
        -----
        If index is already a pandas Index then there is no copy.
        """
        if isinstance(index, (Index, MultiIndex)):
            if names is not None:
                if hasattr(index, 'set_names'):  # newer pandas
                    index.set_names(names, inplace=True)
                else:
                    index.names = names
            self.index = index
        else:  # array_like
            if _is_hierarchical(index):
                self.index = _make_hierarchical_index(index, names)
            else:
                self.index = Index(index, name=names)
            if names is None:
                names = _make_generic_names(self.index)
                if hasattr(self.index, 'set_names'):
                    self.index.set_names(names, inplace=True)
                else:
                    self.index.names = names

        self.nobs = len(self.index)
        self.nlevels = len(self.index.names)
        self.slices = None  # populated lazily by get_slices

    @property
    def index_shape(self):
        """Shape of the index: ``levshape`` for a MultiIndex, else ``shape``."""
        if hasattr(self.index, 'levshape'):
            return self.index.levshape
        else:
            return self.index.shape

    @property
    def levels(self):
        """Unique values per level of the index."""
        if hasattr(self.index, 'levels'):
            return self.index.levels
        else:
            return pd.Categorical(self.index).levels

    @property
    def labels(self):
        """Integer codes per level, shape (nlevels, nobs)."""
        # this was index_int, but that's not a very good name...
        codes = getattr(self.index, 'codes', None)
        if codes is None:
            if hasattr(self.index, 'labels'):  # pandas < 0.24 MultiIndex
                codes = self.index.labels
            else:
                # plain Index: a single level of categorical codes
                codes = pd.Categorical(self.index).codes[None]
        return codes

    @property
    def group_names(self):
        """Names of the index levels."""
        return self.index.names

    def reindex(self, index=None, names=None):
        """
        Resets the index in-place.
        """
        # NOTE: this is not of much use if the rest of the data does not change
        # This needs to reset cache
        if names is None:
            names = self.group_names
        # re-run __init__ so the reset actually affects this instance
        # (the previous `self = Grouping(index, names)` only rebound a
        # local name and was a no-op)
        self.__init__(index, names)

    def get_slices(self, level=0):
        """
        Sets the slices attribute to be a list of indices of the sorted
        groups for the first index level. I.e., self.slices[0] is the
        index where each observation is in the first (sorted) group.
        """
        # TODO: refactor this
        groups = self.index.get_level_values(level).unique()
        groups = np.array(groups)
        groups.sort()
        if isinstance(self.index, MultiIndex):
            self.slices = [self.index.get_loc_level(x, level=level)[0]
                           for x in groups]
        else:
            self.slices = [self.index.get_loc(x) for x in groups]

    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the (integer-valued)
        labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

    def check_index(self, is_sorted=True, unique=True, index=None):
        """Sanity checks"""
        # `if not index:` raised on a non-empty pandas Index (ambiguous
        # truth value); test for None explicitly instead
        if index is None:
            index = self.index
        if is_sorted:
            test = pd.DataFrame(list(range(len(index))), index=index)
            # DataFrame.sort() was removed from pandas; sort_index is the
            # equivalent for sorting by the index
            test_sorted = test.sort_index()
            if not test.index.equals(test_sorted.index):
                raise Exception('Data is not be sorted')
        if unique:
            if len(index) != len(index.unique()):
                raise Exception('Duplicate index entries')

    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index. Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.
        """
        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)

    def transform_dataframe(self, dataframe, function, level=0, **kwargs):
        """Apply function to each column, by group
        Assumes that the dataframe already has a proper index"""
        if dataframe.shape[0] != self.nobs:
            raise Exception('dataframe does not have the same shape as index')
        out = dataframe.groupby(level=level).apply(function, **kwargs)
        if 1 in out.shape:
            return np.ravel(out)
        else:
            return np.array(out)

    def transform_array(self, array, function, level=0, **kwargs):
        """Apply function to each column, by group
        """
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        dataframe = pd.DataFrame(array, index=self.index)
        return self.transform_dataframe(dataframe, function, level=level,
                                        **kwargs)

    def transform_slices(self, array, function, level=0, **kwargs):
        """Apply function to each group. Similar to transform_array but does
        not coerce array to a DataFrame and back and only works on a 1D or 2D
        numpy array. function is called function(group, group_idx, **kwargs).
        """
        array = np.asarray(array)
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        # always reset because level is given. need to refactor this.
        self.get_slices(level=level)
        processed = []
        for s in self.slices:
            if array.ndim == 2:
                subset = array[s, :]
            elif array.ndim == 1:
                subset = array[s]
            else:
                # previously fell through and raised NameError on `subset`
                raise ValueError('array must be 1d or 2d')
            processed.append(function(subset, s, **kwargs))
        processed = np.array(processed)
        return processed.reshape(-1, processed.shape[-1])

    # TODO: this is not general needs to be a PanelGrouping object
    def dummies_time(self):
        """Sparse dummies for the time level (level 1) of the index."""
        self.dummy_sparse(level=1)
        return self._dummies

    def dummies_groups(self, level=0):
        """Sparse dummies for the given index level."""
        self.dummy_sparse(level=level)
        return self._dummies

    def dummy_sparse(self, level=0):
        """Store a sparse indicator matrix for the given index level.

        Builds a CSR matrix of shape (nobs, n_groups) with a 1 in the
        column of each observation's group (see module-level
        ``dummy_sparse``) and caches it on ``self._dummies``.
        """
        indi = dummy_sparse(self.labels[level])
        self._dummies = indi