Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

# Names exported by ``from <this module> import *``.
__all__ = ["C", "guess_categorical", "CategoricalSniffer",
           "categorical_to_int"]

7 

8# How we handle categorical data: the big picture 

9# ----------------------------------------------- 

10# 

11# There is no Python/NumPy standard for how to represent categorical data. 

12# There is no Python/NumPy standard for how to represent missing data. 

13# 

14# Together, these facts mean that when we receive some data object, we must be 

15# able to heuristically infer what levels it has -- and this process must be 

16# sensitive to the current missing data handling, because maybe 'None' is a 

17# level and maybe it is missing data. 

18# 

19# We don't know how missing data is represented until we get into the actual 

20# builder code, so anything which runs before this -- e.g., the 'C()' builtin 

21# -- cannot actually do *anything* meaningful with the data. 

22# 

23# Therefore, C() simply takes some data and arguments, and boxes them all up 

24# together into an object called (appropriately enough) _CategoricalBox. All 

25# the actual work of handling the various different sorts of categorical data 

26# (lists, string arrays, bool arrays, pandas.Categorical, etc.) happens inside 

27# the builder code, and we just extend this so that it also accepts 

28# _CategoricalBox objects as yet another categorical type. 

29# 

30# Originally this file contained a container type (called 'Categorical'), and 

31# the various sniffing, conversion, etc., functions were written as methods on 

32# that type. But we had to get rid of that type, so now this file just 

33# provides a set of plain old functions which are used by patsy.build to 

34# handle the different stages of categorical data munging. 

35 

36import numpy as np 

37import six 

38from patsy import PatsyError 

39from patsy.util import (SortAnythingKey, 

40 safe_scalar_isnan, 

41 iterable, 

42 have_pandas, have_pandas_categorical, 

43 have_pandas_categorical_dtype, 

44 safe_is_pandas_categorical, 

45 pandas_Categorical_from_codes, 

46 pandas_Categorical_categories, 

47 pandas_Categorical_codes, 

48 safe_issubdtype, 

49 no_pickling, assert_no_pickling) 

50 

51if have_pandas: 

52 import pandas 

53 

54# Objects of this type will always be treated as categorical, with the 

55# specified levels and contrast (if given). 

class _CategoricalBox(object):
    """Inert container created by C().

    Stores the wrapped ``data`` together with the ``contrast`` and ``levels``
    it was given (either of which may be None). Nothing is interpreted or
    validated here.
    """

    # Taken from the no_pickling helper imported at the top of the file.
    __getstate__ = no_pickling

    def __init__(self, data, contrast, levels):
        self.data, self.contrast, self.levels = data, contrast, levels

63 

def C(data, contrast=None, levels=None):
    """
    Marks some `data` as being categorical, and specifies how to interpret
    it.

    This is used for three reasons:

    * To explicitly mark some data as categorical. For instance, integer data
      is by default treated as numerical. If you have data that is stored
      using an integer type, but where you want patsy to treat each different
      value as a different level of a categorical factor, you can wrap it in a
      call to `C` to accomplish this. E.g., compare::

        dmatrix("a", {"a": [1, 2, 3]})
        dmatrix("C(a)", {"a": [1, 2, 3]})

    * To explicitly set the levels or override the default level ordering for
      categorical data, e.g.::

        dmatrix("C(a, levels=["a2", "a1"])", balanced(a=2))

    * To override the default coding scheme for categorical data. The
      `contrast` argument can be any of:

      * A :class:`ContrastMatrix` object
      * A simple 2d ndarray (which is treated the same as a ContrastMatrix
        object except that you can't specify column names)
      * An object with methods called `code_with_intercept` and
        `code_without_intercept`, like the built-in contrasts
        (:class:`Treatment`, :class:`Diff`, :class:`Poly`, etc.). See
        :ref:`categorical-coding` for more details.
      * A callable that returns one of the above.

    If `data` is itself the result of an earlier call to `C`, it is unwrapped
    first; any `contrast` or `levels` left as None is inherited from it.
    """
    if not isinstance(data, _CategoricalBox):
        return _CategoricalBox(data, contrast, levels)
    # Re-boxing: explicitly passed arguments win, everything else is
    # inherited from the box we were handed.
    effective_contrast = data.contrast if contrast is None else contrast
    effective_levels = data.levels if levels is None else levels
    return _CategoricalBox(data.data, effective_contrast, effective_levels)

103 

def test_C():
    # Small helper so each case can be compared as one (data, contrast,
    # levels) triple.
    def unpack(box):
        return (box.data, box.contrast, box.levels)

    # Bare call: defaults stay None.
    plain = C("asdf")
    assert isinstance(plain, _CategoricalBox)
    assert plain.data == "asdf"
    assert plain.levels is None
    assert plain.contrast is None

    # Everything given explicitly.
    full = C("DATA", "CONTRAST", "LEVELS")
    assert unpack(full) == ("DATA", "CONTRAST", "LEVELS")

    # Re-boxing overrides only what is passed and inherits the rest.
    relevelled = C(full, levels="NEW LEVELS")
    assert unpack(relevelled) == ("DATA", "CONTRAST", "NEW LEVELS")

    recontrasted = C(full, "NEW CONTRAST")
    assert unpack(recontrasted) == ("DATA", "NEW CONTRAST", "LEVELS")

    assert_no_pickling(recontrasted)

124 

def guess_categorical(data):
    """Return True if `data` should be treated as categorical.

    pandas categoricals and objects produced by C() always count as
    categorical. Anything else is converted with ``np.asarray`` and counts
    as categorical unless the resulting dtype is numeric.
    """
    if safe_is_pandas_categorical(data) or isinstance(data, _CategoricalBox):
        return True
    as_array = np.asarray(data)
    return not safe_issubdtype(as_array.dtype, np.number)

134 

def test_guess_categorical():
    if have_pandas_categorical:
        cat = pandas.Categorical([1, 2, 3])
        assert guess_categorical(cat)
        if have_pandas_categorical_dtype:
            assert guess_categorical(pandas.Series(cat))
    assert guess_categorical(C([1, 2, 3]))

    # Non-numeric inputs are guessed to be categorical...
    categorical_samples = [
        [True, False],
        ["a", "b"],
        ["a", "b", np.nan],
        ["a", "b", None],
    ]
    for sample in categorical_samples:
        assert guess_categorical(sample)

    # ...while purely numeric inputs (with or without NaN) are not.
    numeric_samples = [
        [1, 2, 3],
        [1, 2, 3, np.nan],
        [1.0, 2.0, 3.0],
        [1.0, 2.0, 3.0, np.nan],
    ]
    for sample in numeric_samples:
        assert not guess_categorical(sample)

150 

def _categorical_shape_fix(data):
    """Normalise the shape of raw categorical data.

    `data` is expected to be the actual values (not a _CategoricalBox or a
    pandas Categorical), possibly with the wrong shape. Anything reporting
    ``ndim > 1`` is rejected with PatsyError; a scalar (including a single
    text or bytes string) is wrapped in a one-element list; everything else
    is returned unchanged.
    """
    has_extra_dims = hasattr(data, "ndim") and data.ndim > 1
    if has_extra_dims:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # Scalars are coerced to 1d, which is consistent with what is done for
    # numeric factors. (See statsmodels/statsmodels#1881)
    string_types = (six.text_type, six.binary_type)
    is_scalar = not iterable(data) or isinstance(data, string_types)
    return [data] if is_scalar else data

164 

class CategoricalSniffer(object):
    """Incrementally determine the levels (and contrast) of a factor.

    Pass successive pieces of data to :meth:`sniff`. Once it returns True, or
    once there is no more data, call :meth:`levels_contrast` for the result.
    """

    def __init__(self, NA_action, origin=None):
        # NA_action must provide is_categorical_NA(value); origin is handed
        # to PatsyError when reporting problems.
        self._NA_action = NA_action
        self._origin = origin
        self._contrast = None
        # Explicit levels (from a C() box or a pandas categorical), if any.
        self._levels = None
        # Levels collected by looking at the values themselves.
        self._level_set = set()

    def levels_contrast(self):
        """Return ``(levels, contrast)`` where levels is a tuple.

        When no explicit levels were found, the collected values are sorted
        with SortAnythingKey and the result is cached in ``self._levels``.
        """
        if self._levels is None:
            levels = list(self._level_set)
            levels.sort(key=SortAnythingKey)
            self._levels = levels
        return tuple(self._levels), self._contrast

    def sniff(self, data):
        """Examine one piece of data and record what it says about levels.

        Returns a bool: are we confident that we found all the levels?

        Raises PatsyError if a value is unhashable (or, via
        _categorical_shape_fix, if the data has more than one dimension).
        """
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            # Merge rather than rebind, so levels collected from earlier
            # pieces of data are kept, exactly as the item-by-item path
            # below does when it meets a boolean value.
            self._level_set.update([True, False])
            return self._level_set == set([True, False])

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                # Seeing either boolean implies both levels exist.
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError("Error interpreting categorical data: "
                                     "all items must be hashable",
                                     self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])

    __getstate__ = no_pickling

221 

def test_CategoricalSniffer():
    from patsy.missing import NAAction

    def check(NA_types, chunks, expect_early_stop, expect_levels,
              expect_contrast=None):
        # Feed the chunks one by one. Sniffing must stop early exactly when
        # expect_early_stop says so, and the final levels/contrast must match.
        sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
        for chunk in chunks:
            if sniffer.sniff(chunk):
                assert expect_early_stop
                break
        else:
            assert not expect_early_stop
        assert sniffer.levels_contrast() == (expect_levels, expect_contrast)

    if have_pandas_categorical:
        # We make sure to test with both boxed and unboxed pandas objects,
        # because we used to have a bug where boxed pandas objects would be
        # treated as categorical, but their levels would be lost...
        preps = [lambda x: x, C]
        if have_pandas_categorical_dtype:
            preps.extend([pandas.Series,
                          lambda x: C(pandas.Series(x))])
        for prep in preps:
            check([], [prep(pandas.Categorical([1, 2, None]))],
                  True, (1, 2))
            # check order preservation
            check([],
                  [prep(pandas_Categorical_from_codes([1, 0], ["a", "b"]))],
                  True, ("a", "b"))
            check([],
                  [prep(pandas_Categorical_from_codes([1, 0], ["b", "a"]))],
                  True, ("b", "a"))
            # check that if someone sticks a .contrast field onto our object
            obj = prep(pandas.Categorical(["a", "b"]))
            obj.contrast = "CONTRAST"
            check([], [obj], True, ("a", "b"), "CONTRAST")

    check([], [C([1, 2]), C([3, 2])], False, (1, 2, 3))
    # check order preservation
    check([], [C([1, 2], levels=[1, 2, 3]), C([4, 2])], True, (1, 2, 3))
    check([], [C([1, 2], levels=[3, 2, 1]), C([4, 2])], True, (3, 2, 1))

    # do some actual sniffing with NAs in
    check(["None", "NaN"], [C([1, np.nan]), C([10, None])],
          False, (1, 10))
    # But 'None' can be a type if we don't make it represent NA:
    sniffer = CategoricalSniffer(NAAction(NA_types=["NaN"]))
    sniffer.sniff(C([1, np.nan, None]))
    # The level order here is different on py2 and py3 :-( Because there's no
    # consistent way to sort mixed-type values on both py2 and py3. Honestly
    # people probably shouldn't use this, but I don't know how to give a
    # sensible error.
    levels, _ = sniffer.levels_contrast()
    assert set(levels) == set([None, 1])

    # bool special cases
    check(["None", "NaN"], [C([True, np.nan, None])],
          True, (False, True))
    check([], [C([10, 20]), C([False]), C([30, 40])],
          False, (False, True, 10, 20, 30, 40))
    # exercise the fast-path
    check([], [np.asarray([True, False]), ["foo"]],
          True, (False, True))

    # check tuples too
    check(["None", "NaN"],
          [C([("b", 2), None, ("a", 1), np.nan, ("c", None)])],
          False, (("a", 1), ("b", 2), ("c", None)))

    # contrasts
    check([], [C([10, 20], contrast="FOO")], False, (10, 20), "FOO")

    # no box
    check([], [[10, 30], [20]], False, (10, 20, 30))
    check([], [["b", "a"], ["a"]], False, ("a", "b"))

    # 0d
    check([], ["b"], False, ("b",))

    from nose.tools import assert_raises

    # unhashable level error:
    sniffer = CategoricalSniffer(NAAction())
    assert_raises(PatsyError, sniffer.sniff, [{}])

    # >1d is illegal
    assert_raises(PatsyError, sniffer.sniff, np.asarray([["b"]]))

306 

307# returns either a 1d ndarray or a pandas.Series 

def categorical_to_int(data, levels, NA_action, origin=None):
    """Convert categorical `data` to integer codes indexing into `levels`.

    Returns either a 1d ndarray or a pandas.Series. Missing values are
    always mapped to -1.

    :arg data: A pandas categorical, an object produced by C(), or plain
      values (a scalar is treated as a single observation).
    :arg levels: Tuple of expected levels; position in the tuple is the code.
    :arg NA_action: Object whose ``is_categorical_NA(value)`` decides which
      values count as missing.
    :arg origin: Passed through to PatsyError for error reporting.

    Raises PatsyError when the levels carried by `data` disagree with
    `levels`, when a level or value is unhashable, when a value is not one
    of `levels`, or (via _categorical_shape_fix) when `data` has more than
    one dimension.
    """
    assert isinstance(levels, tuple)

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        # Use .get() so that levels lacking False/True simply skip the
        # fastpath; the loop below then reports the mismatch as a PatsyError
        # instead of a bare KeyError escaping from here.
        if level_to_int.get(False) == 0 and level_to_int.get(True) == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                # Build a short description of the expected levels: all of
                # them if there are few, otherwise the first and last couple
                # with an ellipsis in between.
                SHOW_LEVELS = 4
                level_strs = []
                if len(levels) <= SHOW_LEVELS:
                    level_strs += [repr(level) for level in levels]
                else:
                    level_strs += [repr(level)
                                   for level in levels[:SHOW_LEVELS//2]]
                    level_strs.append("...")
                    level_strs += [repr(level)
                                   for level in levels[-SHOW_LEVELS//2:]]
                level_str = "[%s]" % (", ".join(level_strs))
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    # Keep the caller's index when the input was a pandas Series.
    if have_pandas and isinstance(data, pandas.Series):
        out = pandas.Series(out, index=data.index)
    return out

370 

def test_categorical_to_int():
    from nose.tools import assert_raises
    from patsy.missing import NAAction
    if have_pandas:
        s = pandas.Series(["a", "b", "c"], index=[10, 20, 30])
        c_pandas = categorical_to_int(s, ("a", "b", "c"), NAAction())
        assert np.all(c_pandas == [0, 1, 2])
        assert np.all(c_pandas.index == [10, 20, 30])
        # Input must be 1-dimensional
        assert_raises(PatsyError,
                      categorical_to_int,
                      pandas.DataFrame({10: s}), ("a", "b", "c"), NAAction())
    if have_pandas_categorical:
        constructors = [pandas_Categorical_from_codes]
        if have_pandas_categorical_dtype:
            def Series_from_codes(codes, categories):
                c = pandas_Categorical_from_codes(codes, categories)
                return pandas.Series(c)
            constructors.append(Series_from_codes)
        for con in constructors:
            cat = con([1, 0, -1], ("a", "b"))
            conv = categorical_to_int(cat, ("a", "b"), NAAction())
            assert np.all(conv == [1, 0, -1])
            # Trust pandas NA marking: the *string* "None" is an ordinary
            # category here, and only the -1 code is missing, even though
            # the NA action treats the None object as NA. This must be run
            # on cat2 (with cat2's levels), otherwise the case is untested.
            cat2 = con([1, 0, -1], ("a", "None"))
            conv2 = categorical_to_int(cat2, ("a", "None"),
                                       NAAction(NA_types=["None"]))
            assert np.all(conv2 == [1, 0, -1])
            # But levels must match
            assert_raises(PatsyError,
                          categorical_to_int,
                          con([1, 0], ("a", "b")),
                          ("a", "c"),
                          NAAction())
            assert_raises(PatsyError,
                          categorical_to_int,
                          con([1, 0], ("a", "b")),
                          ("b", "a"),
                          NAAction())

    def t(data, levels, expected, NA_action=NAAction()):
        got = categorical_to_int(data, levels, NA_action)
        assert np.array_equal(got, expected)

    t(["a", "b", "a"], ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"], dtype=object), ("a", "b"), [0, 1, 0])
    t([0, 1, 2], (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2]), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=float), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=object), (1, 2, 0), [2, 0, 1])
    t(["a", "b", "a"], ("a", "d", "z", "b"), [0, 3, 0])
    t([("a", 1), ("b", 0), ("a", 1)], (("a", 1), ("b", 0)), [0, 1, 0])

    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", "a"], ("a", "c"), NAAction())

    t(C(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(C(["a", "b", "a"]), ("b", "a"), [1, 0, 1])
    t(C(["a", "b", "a"], levels=["b", "a"]), ("b", "a"), [1, 0, 1])
    # Mismatch between C() levels and expected levels
    assert_raises(PatsyError, categorical_to_int,
                  C(["a", "b", "a"], levels=["a", "b"]),
                  ("b", "a"), NAAction())

    # ndim == 0 is okay
    t("a", ("a", "b"), [0])
    t("b", ("a", "b"), [1])
    t(True, (False, True), [1])

    # ndim == 2 is disallowed
    assert_raises(PatsyError, categorical_to_int,
                  np.asarray([["a", "b"], ["b", "a"]]),
                  ("a", "b"), NAAction())

    # levels must be hashable
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b"], ("a", "b", {}), NAAction())
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", {}], ("a", "b"), NAAction())

    t(["b", None, np.nan, "a"], ("a", "b"), [1, -1, -1, 0],
      NAAction(NA_types=["None", "NaN"]))
    t(["b", None, np.nan, "a"], ("a", "b", None), [1, -1, -1, 0],
      NAAction(NA_types=["None", "NaN"]))
    t(["b", None, np.nan, "a"], ("a", "b", None), [1, 2, -1, 0],
      NAAction(NA_types=["NaN"]))

    # Smoke test for the branch that formats the ellipsized list of levels in
    # the error message:
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", "q"],
                  ("a", "b", "c", "d", "e", "f", "g", "h"),
                  NAAction())

462 ["a", "b", "q"], 

463 ("a", "b", "c", "d", "e", "f", "g", "h"), 

464 NAAction())