Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2"""Tools for working with groups 

3 

4This provides several functions to work with groups and a Group class that 

5keeps track of the different representations and has methods to work more 

6easily with groups. 

7 

8 

9Author: Josef Perktold, 

10Author: Nathaniel Smith, recipe for sparse_dummies on scipy user mailing list 

11 

12Created on Tue Nov 29 15:44:53 2011 : sparse_dummies 

13Created on Wed Nov 30 14:28:24 2011 : combine_indices 

14changes: add Group class 

15 

16Notes 

17~~~~~ 

18 

19This reverses the class I used before, where the class was for the data and 

20the group was auxiliary. Here, it is only the group, no data is kept. 

21 

22sparse_dummies needs checking for corner cases, e.g. 

23what if a category level has zero elements? This can happen with subset 

24 selection even if the original groups where defined as arange. 

25 

26Not all methods and options have been tried out yet after refactoring 

27 

28need more efficient loop if groups are sorted -> see GroupSorted.group_iter 

29""" 

30from statsmodels.compat.python import lrange, lzip 

31import numpy as np 

32import pandas as pd 

33 

34import statsmodels.tools.data as data_util 

35from pandas import Index, MultiIndex 

36 

37 

def combine_indices(groups, prefix='', sep='.', return_labels=False):
    """Map (possibly multi-column) group identifiers to integer codes.

    Uses ``np.unique`` on the (stacked) group array, so the integer codes
    identify the product/intersection of the grouping variables.

    Parameters
    ----------
    groups : array_like or tuple of array_like
        One grouping array, a 2-d array with one grouping variable per
        column, or a tuple of 1-d grouping arrays that are stacked
        column-wise.
    prefix, sep : str
        Used to build string labels when ``return_labels`` is true.
    return_labels : bool
        If true, also return a list of string labels for the unique groups.

    Returns
    -------
    codes : ndarray of int
        Integer group code for each observation.
    first_idx : ndarray of int
        Index of the first occurrence of each unique group.
    uniques : ndarray
        The unique group values (2-d when the input had several columns).
    labels : list of str, only if ``return_labels``
    """
    if isinstance(groups, tuple):
        stacked = np.column_stack(groups)
    else:
        stacked = np.asarray(groups)

    elem_dtype = stacked.dtype
    multi_col = stacked.ndim == 2  # remembered to undo the view below

    if multi_col:
        n_cols = stacked.shape[1]
        if not stacked.flags.c_contiguous:
            # the row-wise structured view below requires C order
            stacked = np.array(stacked, order='C')
        # view each row as one structured record so np.unique works row-wise
        rows = stacked.view([('', elem_dtype)] * n_cols)
    else:
        rows = stacked

    uniques, first_idx, codes = np.unique(rows, return_index=True,
                                          return_inverse=True)

    if multi_col:
        # undo the structured view: back to a (n_unique, n_cols) array
        uniques = uniques.view(elem_dtype).reshape(-1, n_cols)

    if not return_labels:
        return codes, first_idx, uniques

    # one '%s' slot per column, joined by ``sep`` (2-d uniques only)
    fmt = prefix + sep.join(['%s'] * len(uniques[0]))
    labels = [fmt % tuple(row) for row in uniques]
    return codes, first_idx, uniques, labels

78 

79 

80# written for and used in try_covariance_grouploop.py 

81def group_sums(x, group, use_bincount=True): 

82 """simple bincount version, again 

83 

84 group : ndarray, integer 

85 assumed to be consecutive integers 

86 

87 no dtype checking because I want to raise in that case 

88 

89 uses loop over columns of x 

90 

91 for comparison, simple python loop 

92 """ 

93 x = np.asarray(x) 

94 if x.ndim == 1: 

95 x = x[:, None] 

96 elif x.ndim > 2 and use_bincount: 

97 raise ValueError('not implemented yet') 

98 

99 if use_bincount: 

100 

101 # re-label groups or bincount takes too much memory 

102 if np.max(group) > 2 * x.shape[0]: 

103 group = pd.factorize(group)[0] 

104 

105 return np.array([np.bincount(group, weights=x[:, col]) 

106 for col in range(x.shape[1])]) 

107 else: 

108 uniques = np.unique(group) 

109 result = np.zeros([len(uniques)] + list(x.shape[1:])) 

110 for ii, cat in enumerate(uniques): 

111 result[ii] = x[group == cat].sum(0) 

112 return result 

113 

114 

def group_sums_dummy(x, group_dummy):
    """Compute group sums as a dot product with a dummy (indicator) matrix.

    group_dummy can be either ndarray or sparse matrix
    """
    is_ndarray = data_util._is_using_ndarray_type(group_dummy, None)
    if is_ndarray:
        return np.dot(x.T, group_dummy)
    # otherwise assume a scipy sparse matrix: ``*`` is matrix multiply
    return x.T * group_dummy

124 

125 

126# TODO: See if this can be entirely replaced by Grouping.dummy_sparse; 

127# see GH#5687 

def dummy_sparse(groups):
    """Create a sparse CSR indicator matrix from integer group labels.

    Parameters
    ----------
    groups : ndarray, int, 1d (nobs,)
        An array of group indicators for each observation. Group levels
        are assumed to be defined as consecutive integers, i.e.
        range(n_groups) where n_groups is the number of group levels.
        A group level with no observations for it will still produce a
        column of zeros.

    Returns
    -------
    indi : sparse matrix, int8, 2d (nobs, n_groups)
        An indicator matrix with one row per observation, that has 1 in
        the column of the group level for that observation.

    Examples
    --------

    >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi
    <7x3 sparse matrix of type '<type 'numpy.int8'>'
        with 7 stored elements in Compressed Sparse Row format>
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)


    current behavior with missing groups
    >>> g = np.array([0, 0, 2, 0, 2, 0])
    >>> indi = dummy_sparse(g)
    >>> indi.todense()
    matrix([[1, 0, 0],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0],
            [0, 0, 1],
            [1, 0, 0]], dtype=int8)
    """
    from scipy import sparse

    nobs = len(groups)
    # CSR triple: exactly one stored entry per row; the data is all ones,
    # the column index of row i is the group label of observation i
    values = np.ones(nobs, dtype=np.int8)
    row_ptr = np.arange(nobs + 1)
    return sparse.csr_matrix((values, groups, row_ptr))

181 

182 

class Group(object):
    """Keep track of a grouping variable in several representations.

    Stores the original group array, the integer group codes, and the
    unique group values, and provides helpers to build dummy variables,
    interactions, group sums and group demeaning.
    """

    def __init__(self, group, name=''):
        # the original labels are needed by ``dummy`` and ``interaction``;
        # this assignment was commented out, leaving self.group undefined
        # and making dummy(sparse=False) raise AttributeError
        self.group = np.asarray(group)  # TODO: use checks in combine_indices
        self.name = name
        # combine_indices returns (codes, first_idx, uniques); the original
        # unpacked these into misleadingly swapped local names
        codes, first_idx, uniques = combine_indices(group)

        # TODO: rename these to something easier to remember
        self.group_int, self.uni_idx, self.uni = codes, first_idx, uniques

        self.n_groups = len(self.uni)

        # put these here so they can be overwritten before calling labels
        self.separator = '.'
        self.prefix = self.name
        if self.prefix:
            self.prefix = self.prefix + '='

    # cache decorator
    def counts(self):
        """Return the number of observations in each group."""
        return np.bincount(self.group_int)

    # cache_decorator
    def labels(self):
        """Return a list of string labels, one per unique group level."""
        # is this only needed for product of groups (intersection)?
        prefix = self.prefix
        uni = self.uni
        sep = self.separator

        if uni.ndim > 1:
            label = [(prefix + sep.join(['%s'] * len(uni[0]))) % tuple(ii)
                     for ii in uni]
        else:
            label = [prefix + '%s' % ii for ii in uni]
        return label

    def dummy(self, drop_idx=None, sparse=False, dtype=int):
        """Create a dummy (indicator) matrix for the group levels.

        drop_idx is only available if sparse=False

        drop_idx is supposed to index into uni
        """
        uni = self.uni
        if drop_idx is not None:
            idx = lrange(len(uni))
            del idx[drop_idx]
            uni = uni[idx]

        group = self.group

        if not sparse:
            # broadcast comparison of observations against group levels
            return (group[:, None] == uni[None, :]).astype(dtype)
        else:
            # note: drop_idx is ignored on the sparse path (see docstring)
            return dummy_sparse(self.group_int)

    def interaction(self, other):
        """Return a new Group for the product of self's and other's groups."""
        if isinstance(other, self.__class__):
            other = other.group
        # was ``(self, other)``, which handed the Group instance itself to
        # combine_indices/np.column_stack; use the underlying array instead
        return self.__class__((self.group, other))

    def group_sums(self, x, use_bincount=True):
        """Sum x within each group (wrapper around module-level group_sums)."""
        return group_sums(x, self.group_int, use_bincount=use_bincount)

    def group_demean(self, x, use_bincount=True):
        """Subtract group means from x.

        Returns the demeaned data and the per-group means.
        """
        nobs = float(len(x))
        means_g = group_sums(x / nobs, self.group_int,
                             use_bincount=use_bincount)
        x_demeaned = x - means_g[self.group_int]  # check reverse_index?
        return x_demeaned, means_g

253 

254 

class GroupSorted(Group):
    """Group class for data where equal group labels are contiguous.

    Because observations of a group are adjacent, each group can be
    represented by a (start, stop) index pair, enabling efficient
    iteration via slices.
    """

    def __init__(self, group, name=''):
        # was ``super(self.__class__, self)`` which recurses infinitely
        # when GroupSorted is subclassed; name the class explicitly
        super(GroupSorted, self).__init__(group, name=name)

        # positions where the group label changes -> group boundaries
        idx = (np.nonzero(np.diff(group))[0] + 1).tolist()
        # (start, stop) pairs usable directly as slice bounds per group
        self.groupidx = lzip([0] + idx, idx + [len(group)])

    def group_iter(self):
        """Yield a slice object selecting each group's observations."""
        for low, upp in self.groupidx:
            yield slice(low, upp)

    def lag_indices(self, lag):
        """return the index array for lagged values

        Warning: if k is larger then the number of observations for an
        individual, then no values for that individual are returned.

        TODO: for the unbalanced case, I should get the same truncation for
        the array with lag=0. From the return of lag_idx we would not know
        which individual is missing.

        TODO: do I want the full equivalent of lagmat in tsa?
        maxlag or lag or lags.

        not tested yet
        """
        lag_idx = np.asarray(self.groupidx)[:, 1] - lag  # asarray or already?
        mask_ok = (lag <= lag_idx)
        # still an observation that belongs to the same individual

        return lag_idx[mask_ok]

286 

287 

288def _is_hierarchical(x): 

289 """ 

290 Checks if the first item of an array-like object is also array-like 

291 If so, we have a MultiIndex and returns True. Else returns False. 

292 """ 

293 item = x[0] 

294 # is there a better way to do this? 

295 if isinstance(item, (list, tuple, np.ndarray, pd.Series, pd.DataFrame)): 

296 return True 

297 else: 

298 return False 

299 

300 

301def _make_hierarchical_index(index, names): 

302 return MultiIndex.from_tuples(*[index], names=names) 

303 

304 

305def _make_generic_names(index): 

306 n_names = len(index.names) 

307 pad = str(len(str(n_names))) # number of digits 

308 return [("group{0:0"+pad+"}").format(i) for i in range(n_names)] 

309 

310 

class Grouping(object):
    """Holds a (possibly hierarchical) pandas index describing group
    membership and provides sorting, slicing and transformation helpers
    keyed on that index.
    """

    def __init__(self, index, names=None):
        """
        index : index-like
            Can be pandas MultiIndex or Index or array-like. If array-like
            and is a MultipleIndex (more than one grouping variable),
            groups are expected to be in each row. E.g., [('red', 1),
            ('red', 2), ('green', 1), ('green', 2)]
        names : list or str, optional
            The names to use for the groups. Should be a str if only
            one grouping variable is used.

        Notes
        -----
        If index is already a pandas Index then there is no copy.
        """
        if isinstance(index, (Index, MultiIndex)):
            if names is not None:
                if hasattr(index, 'set_names'):  # newer pandas
                    index.set_names(names, inplace=True)
                else:
                    index.names = names
            self.index = index
        else:  # array_like
            if _is_hierarchical(index):
                self.index = _make_hierarchical_index(index, names)
            else:
                self.index = Index(index, name=names)
            if names is None:
                # fall back to generated names 'group0', 'group1', ...
                names = _make_generic_names(self.index)
                if hasattr(self.index, 'set_names'):
                    self.index.set_names(names, inplace=True)
                else:
                    self.index.names = names

        self.nobs = len(self.index)
        self.nlevels = len(self.index.names)
        self.slices = None  # set lazily by get_slices

    @property
    def index_shape(self):
        """Shape of the index: per-level sizes for a MultiIndex."""
        if hasattr(self.index, 'levshape'):
            return self.index.levshape
        else:
            return self.index.shape

    @property
    def levels(self):
        """Unique values per level (MultiIndex levels or Categorical levels)."""
        if hasattr(self.index, 'levels'):
            return self.index.levels
        else:
            return pd.Categorical(self.index).levels

    @property
    def labels(self):
        """Integer codes per level, always 2-d (n_levels, nobs)."""
        # this was index_int, but that's not a very good name...
        codes = getattr(self.index, 'codes', None)
        if codes is None:
            if hasattr(self.index, 'labels'):
                # older pandas MultiIndex API
                codes = self.index.labels
            else:
                # plain Index: wrap codes in an extra dim to match the
                # MultiIndex layout
                codes = pd.Categorical(self.index).codes[None]
        return codes

    @property
    def group_names(self):
        """Names of the grouping levels."""
        return self.index.names

    def reindex(self, index=None, names=None):
        """
        Resets the index in-place.
        """
        # NOTE: this is not of much use if the rest of the data does not change
        # This needs to reset cache
        if names is None:
            names = self.group_names
        # the original rebound the local name ``self`` (a no-op that never
        # touched the instance); re-run __init__ to actually reset state
        self.__init__(index, names=names)

    def get_slices(self, level=0):
        """
        Sets the slices attribute to be a list of indices of the sorted
        groups for the first index level. I.e., self.slices[0] is the
        index where each observation is in the first (sorted) group.
        """
        # TODO: refactor this
        groups = self.index.get_level_values(level).unique()
        groups = np.array(groups)
        groups.sort()
        if isinstance(self.index, MultiIndex):
            self.slices = [self.index.get_loc_level(x, level=level)[0]
                           for x in groups]
        else:
            self.slices = [self.index.get_loc(x) for x in groups]

    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the
        (integer-valued) labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

    def check_index(self, is_sorted=True, unique=True, index=None):
        """Sanity checks: raise if the index is unsorted or has duplicates."""
        if not index:
            index = self.index
        if is_sorted:
            test = pd.DataFrame(lrange(len(index)), index=index)
            # DataFrame.sort() was removed from pandas; sort_index() is the
            # equivalent call for sorting by the index
            test_sorted = test.sort_index()
            if not test.index.equals(test_sorted.index):
                raise ValueError('Data is not sorted')
        if unique:
            if len(index) != len(index.unique()):
                raise ValueError('Duplicate index entries')

    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index. Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.
        """
        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)

    def transform_dataframe(self, dataframe, function, level=0, **kwargs):
        """Apply function to each column, by group.

        Assumes that the dataframe already has a proper index.
        """
        if dataframe.shape[0] != self.nobs:
            raise ValueError('dataframe does not have the same shape as index')
        out = dataframe.groupby(level=level).apply(function, **kwargs)
        if 1 in out.shape:
            return np.ravel(out)
        else:
            return np.array(out)

    def transform_array(self, array, function, level=0, **kwargs):
        """Apply function to each column, by group.

        Wraps the array in a DataFrame indexed by self.index and delegates
        to transform_dataframe.
        """
        if array.shape[0] != self.nobs:
            raise ValueError('array does not have the same shape as index')
        dataframe = pd.DataFrame(array, index=self.index)
        return self.transform_dataframe(dataframe, function, level=level,
                                        **kwargs)

    def transform_slices(self, array, function, level=0, **kwargs):
        """Apply function to each group. Similar to transform_array but does
        not coerce array to a DataFrame and back and only works on a 1D or 2D
        numpy array. function is called function(group, group_idx, **kwargs).
        """
        array = np.asarray(array)
        if array.shape[0] != self.nobs:
            raise ValueError('array does not have the same shape as index')
        # always reset because level is given. need to refactor this.
        self.get_slices(level=level)
        processed = []
        for s in self.slices:
            if array.ndim == 2:
                subset = array[s, :]
            elif array.ndim == 1:
                subset = array[s]
            processed.append(function(subset, s, **kwargs))
        processed = np.array(processed)
        return processed.reshape(-1, processed.shape[-1])

    # TODO: this is not general needs to be a PanelGrouping object
    def dummies_time(self):
        """Sparse dummies for the time level (level 1) of a panel index."""
        self.dummy_sparse(level=1)
        return self._dummies

    def dummies_groups(self, level=0):
        """Sparse dummies for the given grouping level."""
        self.dummy_sparse(level=level)
        return self._dummies

    def dummy_sparse(self, level=0):
        """Create a sparse indicator matrix from the labels at ``level``.

        Delegates to the module-level ``dummy_sparse`` (see its docstring
        for details and examples) and stores the result in
        ``self._dummies`` rather than returning it.

        Parameters
        ----------
        level : int
            Which index level to build the indicator matrix for.
        """
        indi = dummy_sparse(self.labels[level])
        self._dummies = indi