Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/categorical.py : 9%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5__all__ = ["C", "guess_categorical", "CategoricalSniffer",
6 "categorical_to_int"]
8# How we handle categorical data: the big picture
9# -----------------------------------------------
10#
11# There is no Python/NumPy standard for how to represent categorical data.
12# There is no Python/NumPy standard for how to represent missing data.
13#
14# Together, these facts mean that when we receive some data object, we must be
15# able to heuristically infer what levels it has -- and this process must be
16# sensitive to the current missing data handling, because maybe 'None' is a
17# level and maybe it is missing data.
18#
19# We don't know how missing data is represented until we get into the actual
20# builder code, so anything which runs before this -- e.g., the 'C()' builtin
21# -- cannot actually do *anything* meaningful with the data.
22#
23# Therefore, C() simply takes some data and arguments, and boxes them all up
24# together into an object called (appropriately enough) _CategoricalBox. All
25# the actual work of handling the various different sorts of categorical data
26# (lists, string arrays, bool arrays, pandas.Categorical, etc.) happens inside
27# the builder code, and we just extend this so that it also accepts
28# _CategoricalBox objects as yet another categorical type.
29#
30# Originally this file contained a container type (called 'Categorical'), and
31# the various sniffing, conversion, etc., functions were written as methods on
32# that type. But we had to get rid of that type, so now this file just
33# provides a set of plain old functions which are used by patsy.build to
34# handle the different stages of categorical data munging.
36import numpy as np
37import six
38from patsy import PatsyError
39from patsy.util import (SortAnythingKey,
40 safe_scalar_isnan,
41 iterable,
42 have_pandas, have_pandas_categorical,
43 have_pandas_categorical_dtype,
44 safe_is_pandas_categorical,
45 pandas_Categorical_from_codes,
46 pandas_Categorical_categories,
47 pandas_Categorical_codes,
48 safe_issubdtype,
49 no_pickling, assert_no_pickling)
51if have_pandas:
52 import pandas
54# Objects of this type will always be treated as categorical, with the
55# specified levels and contrast (if given).
class _CategoricalBox(object):
    """Inert container for the arguments that were handed to C().

    Objects of this type will always be treated as categorical, with the
    specified levels and contrast (if given). The box performs no
    interpretation of its own; it only carries `data`, `contrast` and
    `levels` until other code unpacks them.
    """

    # Installed as the pickle hook; presumably makes pickling fail (test_C
    # verifies this via assert_no_pickling).
    __getstate__ = no_pickling

    def __init__(self, data, contrast, levels):
        self.data, self.contrast, self.levels = data, contrast, levels
def C(data, contrast=None, levels=None):
    """
    Marks some `data` as being categorical, and specifies how to interpret
    it.

    There are three situations where this is useful:

    * Forcing data to be treated as categorical. Integer data, for example,
      is numerical by default. If your values are stored in an integer type
      but each distinct value should become its own level of a categorical
      factor, wrap the data in a call to `C`. E.g., compare::

        dmatrix("a", {"a": [1, 2, 3]})
        dmatrix("C(a)", {"a": [1, 2, 3]})

    * Setting the levels explicitly, or overriding the default level
      ordering of categorical data, e.g.::

        dmatrix('C(a, levels=["a2", "a1"])', balanced(a=2))

    * Overriding the default coding scheme for categorical data. The
      `contrast` argument can be any of:

      * A :class:`ContrastMatrix` object
      * A simple 2d ndarray (which is treated the same as a ContrastMatrix
        object except that you can't specify column names)
      * An object with methods called `code_with_intercept` and
        `code_without_intercept`, like the built-in contrasts
        (:class:`Treatment`, :class:`Diff`, :class:`Poly`, etc.). See
        :ref:`categorical-coding` for more details.
      * A callable that returns one of the above.

    If `data` is itself the result of an earlier call to `C`, it is
    re-boxed: any of `contrast` / `levels` left as None here is inherited
    from the earlier call.
    """
    if not isinstance(data, _CategoricalBox):
        return _CategoricalBox(data, contrast, levels)
    # Re-boxing: explicitly passed arguments win, the rest are inherited.
    effective_contrast = data.contrast if contrast is None else contrast
    effective_levels = data.levels if levels is None else levels
    return _CategoricalBox(data.data, effective_contrast, effective_levels)
def test_C():
    """C() boxes its arguments, and re-boxing keeps what is not overridden."""
    def fields(box):
        return (box.data, box.contrast, box.levels)

    bare = C("asdf")
    assert isinstance(bare, _CategoricalBox)
    assert bare.data == "asdf"
    assert bare.levels is None
    assert bare.contrast is None

    full = C("DATA", "CONTRAST", "LEVELS")
    assert fields(full) == ("DATA", "CONTRAST", "LEVELS")

    # Wrapping an existing box replaces only the explicitly passed argument.
    relevelled = C(full, levels="NEW LEVELS")
    assert fields(relevelled) == ("DATA", "CONTRAST", "NEW LEVELS")
    recontrasted = C(full, "NEW CONTRAST")
    assert fields(recontrasted) == ("DATA", "NEW CONTRAST", "LEVELS")

    assert_no_pickling(recontrasted)
def guess_categorical(data):
    """Return True if `data` should be treated as categorical.

    pandas categoricals and objects produced by C() always count as
    categorical. Anything else is converted with ``np.asarray`` and counts
    as categorical exactly when the resulting dtype is not a numeric one.
    """
    if safe_is_pandas_categorical(data) or isinstance(data, _CategoricalBox):
        return True
    return not safe_issubdtype(np.asarray(data).dtype, np.number)
def test_guess_categorical():
    """guess_categorical() accepts boxed/pandas/non-numeric data only."""
    if have_pandas_categorical:
        cat = pandas.Categorical([1, 2, 3])
        assert guess_categorical(cat)
        if have_pandas_categorical_dtype:
            assert guess_categorical(pandas.Series(cat))

    should_be_categorical = [
        C([1, 2, 3]),
        [True, False],
        ["a", "b"],
        ["a", "b", np.nan],
        ["a", "b", None],
    ]
    for sample in should_be_categorical:
        assert guess_categorical(sample)

    should_be_numeric = [
        [1, 2, 3],
        [1, 2, 3, np.nan],
        [1.0, 2.0, 3.0],
        [1.0, 2.0, 3.0, np.nan],
    ]
    for sample in should_be_numeric:
        assert not guess_categorical(sample)
def _categorical_shape_fix(data):
    """Normalize the shape of raw categorical data.

    `data` is expected to be an actual iterable of values (not a
    _CategoricalBox or pandas Categorical), though possibly with the wrong
    shape. Anything with ``ndim > 1`` is rejected with a PatsyError; scalars
    and strings are wrapped in a one-element list; everything else is
    returned unchanged.
    """
    if hasattr(data, "ndim") and data.ndim > 1:
        raise PatsyError("categorical data cannot be >1-dimensional")
    # Strings are iterable, but here they count as a single scalar value.
    is_string = isinstance(data, (six.text_type, six.binary_type))
    if is_string or not iterable(data):
        # Scalars become 1d, consistent with what is done for numeric
        # factors. (See statsmodels/statsmodels#1881)
        return [data]
    return data
class CategoricalSniffer(object):
    """Accumulates the levels and contrast of one categorical factor.

    Data is fed in chunk by chunk through :meth:`sniff`; afterwards
    :meth:`levels_contrast` reports what was found.
    """

    def __init__(self, NA_action, origin=None):
        """
        :arg NA_action: Object whose ``is_categorical_NA(value)`` method
          decides which values are skipped as missing while scanning.
        :arg origin: Passed to :class:`PatsyError` when an error is raised.
        """
        self._NA_action = NA_action
        self._origin = origin
        self._contrast = None
        # Explicit, ordered levels (taken from C(..., levels=...) or from a
        # pandas categorical). None until known or until levels_contrast()
        # derives them from _level_set.
        self._levels = None
        # Unordered levels collected by scanning values.
        self._level_set = set()

    def levels_contrast(self):
        """Return ``(levels, contrast)``, with `levels` as a tuple.

        If no explicit levels were found while sniffing, the levels are the
        scanned values sorted with SortAnythingKey; the result is cached.
        """
        if self._levels is None:
            levels = list(self._level_set)
            levels.sort(key=SortAnythingKey)
            self._levels = levels
        return tuple(self._levels), self._contrast

    def sniff(self, data):
        """Examine one chunk of `data`.

        Returns a bool: are we confident that we found all the levels?

        Raises :class:`PatsyError` if a value is unhashable, or if the data
        has more than one dimension.
        """
        if hasattr(data, "contrast"):
            self._contrast = data.contrast
        if isinstance(data, _CategoricalBox):
            if data.levels is not None:
                self._levels = tuple(data.levels)
                return True
            else:
                # unbox and fall through
                data = data.data
        if safe_is_pandas_categorical(data):
            # pandas.Categorical has its own NA detection, so don't try to
            # second-guess it.
            self._levels = tuple(pandas_Categorical_categories(data))
            return True
        # fastpath to avoid doing an item-by-item iteration over boolean
        # arrays, as requested by #44
        if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
            # Merge rather than overwrite, so that levels collected from
            # earlier chunks are not silently discarded. This mirrors what
            # the item-by-item loop below does for individual booleans.
            self._level_set.update([True, False])
            return self._level_set == set([True, False])

        data = _categorical_shape_fix(data)

        for value in data:
            if self._NA_action.is_categorical_NA(value):
                continue
            if value is True or value is False:
                # Seeing either boolean implies both levels exist.
                self._level_set.update([True, False])
            else:
                try:
                    self._level_set.add(value)
                except TypeError:
                    raise PatsyError("Error interpreting categorical data: "
                                     "all items must be hashable",
                                     self._origin)
        # If everything we've seen is boolean, assume that everything else
        # would be too. Otherwise we need to keep looking.
        return self._level_set == set([True, False])

    __getstate__ = no_pickling
def test_CategoricalSniffer():
    """Exercise CategoricalSniffer over boxed, pandas, bool and plain data."""
    from patsy.missing import NAAction

    def check(NA_types, chunks, want_fast_finish, want_levels,
              want_contrast=None):
        # Feed the chunks in order, stopping at the first one for which the
        # sniffer reports that it is done.
        sniffer = CategoricalSniffer(NAAction(NA_types=NA_types))
        finished_fast = any(sniffer.sniff(chunk) for chunk in chunks)
        if finished_fast:
            assert want_fast_finish
        else:
            assert not want_fast_finish
        assert sniffer.levels_contrast() == (want_levels, want_contrast)

    if have_pandas_categorical:
        # We make sure to test with both boxed and unboxed pandas objects,
        # because we used to have a bug where boxed pandas objects would be
        # treated as categorical, but their levels would be lost...
        def unboxed(x):
            return x

        def boxed_series(x):
            return C(pandas.Series(x))

        preps = [unboxed, C]
        if have_pandas_categorical_dtype:
            preps.extend([pandas.Series, boxed_series])
        for prep in preps:
            check([], [prep(pandas.Categorical([1, 2, None]))],
                  True, (1, 2))
            # check order preservation
            for categories in (["a", "b"], ["b", "a"]):
                cat = pandas_Categorical_from_codes([1, 0], categories)
                check([], [prep(cat)], True, tuple(categories))
            # check that if someone sticks a .contrast field onto our object
            tagged = prep(pandas.Categorical(["a", "b"]))
            tagged.contrast = "CONTRAST"
            check([], [tagged], True, ("a", "b"), "CONTRAST")

    check([], [C([1, 2]), C([3, 2])], False, (1, 2, 3))
    # check order preservation
    check([], [C([1, 2], levels=[1, 2, 3]), C([4, 2])], True, (1, 2, 3))
    check([], [C([1, 2], levels=[3, 2, 1]), C([4, 2])], True, (3, 2, 1))

    # do some actual sniffing with NAs in
    check(["None", "NaN"], [C([1, np.nan]), C([10, None])],
          False, (1, 10))
    # But 'None' can be a type if we don't make it represent NA:
    none_as_level = CategoricalSniffer(NAAction(NA_types=["NaN"]))
    none_as_level.sniff(C([1, np.nan, None]))
    # The level order here is different on py2 and py3 :-( Because there's no
    # consistent way to sort mixed-type values on both py2 and py3. Honestly
    # people probably shouldn't use this, but I don't know how to give a
    # sensible error.
    found_levels, _ = none_as_level.levels_contrast()
    assert set(found_levels) == set([None, 1])

    # bool special cases
    check(["None", "NaN"], [C([True, np.nan, None])],
          True, (False, True))
    check([], [C([10, 20]), C([False]), C([30, 40])],
          False, (False, True, 10, 20, 30, 40))
    # exercise the fast-path
    check([], [np.asarray([True, False]), ["foo"]],
          True, (False, True))

    # check tuples too
    check(["None", "NaN"],
          [C([("b", 2), None, ("a", 1), np.nan, ("c", None)])],
          False, (("a", 1), ("b", 2), ("c", None)))

    # contrasts
    check([], [C([10, 20], contrast="FOO")], False, (10, 20), "FOO")

    # no box
    check([], [[10, 30], [20]], False, (10, 20, 30))
    check([], [["b", "a"], ["a"]], False, ("a", "b"))

    # 0d
    check([], ["b"], False, ("b",))

    from nose.tools import assert_raises

    failing = CategoricalSniffer(NAAction())
    # unhashable level error:
    assert_raises(PatsyError, failing.sniff, [{}])
    # >1d is illegal
    assert_raises(PatsyError, failing.sniff, np.asarray([["b"]]))
def _abbreviated_levels_repr(levels, max_shown=4):
    """Format `levels` for an error message, eliding the middle if long."""
    if len(levels) <= max_shown:
        shown = [repr(level) for level in levels]
    else:
        half = max_shown // 2
        shown = [repr(level) for level in levels[:half]]
        shown.append("...")
        shown += [repr(level) for level in levels[-half:]]
    return "[%s]" % (", ".join(shown))


# returns either a 1d ndarray or a pandas.Series
def categorical_to_int(data, levels, NA_action, origin=None):
    """Convert categorical `data` into integer codes indexing into `levels`.

    :arg data: A pandas categorical, an object created by C(), or raw
      values (an iterable, or a scalar which is treated as one observation).
    :arg levels: Tuple of expected levels; a value's code is its position
      in this tuple.
    :arg NA_action: Object whose ``is_categorical_NA(value)`` method decides
      which raw values are missing.
    :arg origin: Passed to :class:`PatsyError` when an error is raised.
    :returns: The integer codes, with missing values mapped to -1. Raw data
      given as a pandas.Series yields a Series with the same index.
    :raises PatsyError: If the levels carried by `data` differ from
      `levels`, if the data has more than one dimension, if a level or
      value is unhashable, or if a value is not one of `levels`.
    """
    assert isinstance(levels, tuple)
    # In this function, missing values are always mapped to -1

    if safe_is_pandas_categorical(data):
        data_levels_tuple = tuple(pandas_Categorical_categories(data))
        if not data_levels_tuple == levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, data_levels_tuple), origin)
        # pandas.Categorical also uses -1 to indicate NA, and we don't try to
        # second-guess its NA detection, so we can just pass it back.
        return pandas_Categorical_codes(data)

    if isinstance(data, _CategoricalBox):
        if data.levels is not None and tuple(data.levels) != levels:
            raise PatsyError("mismatching levels: expected %r, got %r"
                             % (levels, tuple(data.levels)), origin)
        data = data.data

    data = _categorical_shape_fix(data)

    try:
        level_to_int = dict(zip(levels, range(len(levels))))
    except TypeError:
        raise PatsyError("Error interpreting categorical data: "
                         "all items must be hashable", origin)

    # fastpath to avoid doing an item-by-item iteration over boolean arrays,
    # as requested by #44
    if hasattr(data, "dtype") and safe_issubdtype(data.dtype, np.bool_):
        # .get() rather than indexing: when the levels do not contain
        # False/True we must fall through to the loop below, which reports
        # the mismatch as a PatsyError instead of leaking a bare KeyError.
        if level_to_int.get(False) == 0 and level_to_int.get(True) == 1:
            return data.astype(np.int_)
    out = np.empty(len(data), dtype=int)
    for i, value in enumerate(data):
        if NA_action.is_categorical_NA(value):
            out[i] = -1
        else:
            try:
                out[i] = level_to_int[value]
            except KeyError:
                level_str = _abbreviated_levels_repr(levels)
                raise PatsyError("Error converting data to categorical: "
                                 "observation with value %r does not match "
                                 "any of the expected levels (expected: %s)"
                                 % (value, level_str), origin)
            except TypeError:
                raise PatsyError("Error converting data to categorical: "
                                 "encountered unhashable value %r"
                                 % (value,), origin)
    if have_pandas and isinstance(data, pandas.Series):
        # Keep the caller's index on the converted codes.
        out = pandas.Series(out, index=data.index)
    return out
def test_categorical_to_int():
    """Exercise categorical_to_int() on pandas, boxed, raw and invalid data."""
    from nose.tools import assert_raises
    from patsy.missing import NAAction
    if have_pandas:
        s = pandas.Series(["a", "b", "c"], index=[10, 20, 30])
        c_pandas = categorical_to_int(s, ("a", "b", "c"), NAAction())
        assert np.all(c_pandas == [0, 1, 2])
        assert np.all(c_pandas.index == [10, 20, 30])
        # Input must be 1-dimensional
        assert_raises(PatsyError,
                      categorical_to_int,
                      pandas.DataFrame({10: s}), ("a", "b", "c"), NAAction())
    if have_pandas_categorical:
        constructors = [pandas_Categorical_from_codes]
        if have_pandas_categorical_dtype:
            def Series_from_codes(codes, categories):
                c = pandas_Categorical_from_codes(codes, categories)
                return pandas.Series(c)
            constructors.append(Series_from_codes)
        for con in constructors:
            cat = con([1, 0, -1], ("a", "b"))
            conv = categorical_to_int(cat, ("a", "b"), NAAction())
            assert np.all(conv == [1, 0, -1])
            # Trust pandas NA marking: a level that merely looks like an NA
            # marker must not be remapped. This has to convert cat2 against
            # its own levels; previously cat2 was built but `cat` was
            # converted a second time, so the case was never exercised.
            cat2 = con([1, 0, -1], ("a", "None"))
            conv2 = categorical_to_int(cat2, ("a", "None"),
                                       NAAction(NA_types=["None"]))
            assert np.all(conv2 == [1, 0, -1])
            # But levels must match
            assert_raises(PatsyError,
                          categorical_to_int,
                          con([1, 0], ("a", "b")),
                          ("a", "c"),
                          NAAction())
            assert_raises(PatsyError,
                          categorical_to_int,
                          con([1, 0], ("a", "b")),
                          ("b", "a"),
                          NAAction())

    def t(data, levels, expected, NA_action=NAAction()):
        got = categorical_to_int(data, levels, NA_action)
        assert np.array_equal(got, expected)

    t(["a", "b", "a"], ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(np.asarray(["a", "b", "a"], dtype=object), ("a", "b"), [0, 1, 0])
    t([0, 1, 2], (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2]), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=float), (1, 2, 0), [2, 0, 1])
    t(np.asarray([0, 1, 2], dtype=object), (1, 2, 0), [2, 0, 1])
    t(["a", "b", "a"], ("a", "d", "z", "b"), [0, 3, 0])
    t([("a", 1), ("b", 0), ("a", 1)], (("a", 1), ("b", 0)), [0, 1, 0])

    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", "a"], ("a", "c"), NAAction())

    t(C(["a", "b", "a"]), ("a", "b"), [0, 1, 0])
    t(C(["a", "b", "a"]), ("b", "a"), [1, 0, 1])
    t(C(["a", "b", "a"], levels=["b", "a"]), ("b", "a"), [1, 0, 1])
    # Mismatch between C() levels and expected levels
    assert_raises(PatsyError, categorical_to_int,
                  C(["a", "b", "a"], levels=["a", "b"]),
                  ("b", "a"), NAAction())

    # ndim == 0 is okay
    t("a", ("a", "b"), [0])
    t("b", ("a", "b"), [1])
    t(True, (False, True), [1])

    # ndim == 2 is disallowed
    assert_raises(PatsyError, categorical_to_int,
                  np.asarray([["a", "b"], ["b", "a"]]),
                  ("a", "b"), NAAction())

    # levels must be hashable
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b"], ("a", "b", {}), NAAction())
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", {}], ("a", "b"), NAAction())

    # Missing values map to -1, but only for the NA types that are enabled.
    t(["b", None, np.nan, "a"], ("a", "b"), [1, -1, -1, 0],
      NAAction(NA_types=["None", "NaN"]))
    t(["b", None, np.nan, "a"], ("a", "b", None), [1, -1, -1, 0],
      NAAction(NA_types=["None", "NaN"]))
    t(["b", None, np.nan, "a"], ("a", "b", None), [1, 2, -1, 0],
      NAAction(NA_types=["NaN"]))

    # Smoke test for the branch that formats the ellipsized list of levels in
    # the error message:
    assert_raises(PatsyError, categorical_to_int,
                  ["a", "b", "q"],
                  ("a", "b", "c", "d", "e", "f", "g", "h"),
                  NAAction())