Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/user_util.py : 18%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2012 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# Miscellaneous utilities that are useful to users (as compared to
6# patsy.util, which is misc. utilities useful for implementing patsy).
8# These are made available in the patsy.* namespace
9__all__ = ["balanced", "demo_data", "LookupFactor"]
11import itertools
12import numpy as np
13from patsy import PatsyError
14from patsy.categorical import C
15from patsy.util import no_pickling, assert_no_pickling
def balanced(**kwargs):
    """balanced(factor_name=num_levels, [factor_name=num_levels, ..., repeat=1])

    Create simple balanced factorial designs for testing.

    Given some factor names and the number of desired levels for each,
    generates a balanced factorial design in the form of a data
    dictionary. For example:

    .. ipython::

       In [1]: balanced(a=2, b=3)
       Out[1]:
       {'a': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b3', 'b1', 'b2', 'b3']}

    By default it produces exactly one instance of each combination of levels,
    but if you want multiple replicates this can be accomplished via the
    `repeat` argument:

    .. ipython::

       In [2]: balanced(a=2, b=2, repeat=2)
       Out[2]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2']}

    Factors are processed in sorted name order, with the last name varying
    fastest. The keyword ``repeat`` is reserved for the replicate count and
    therefore cannot be used as a factor name.

    If any factor has zero levels the design has no rows; in that case every
    factor still appears in the result, mapped to an empty list.
    """
    repeat = kwargs.pop("repeat", 1)
    names = sorted(kwargs)
    # One list of level labels ("a1", "a2", ...) per factor, in name order.
    levels = [
        ["%s%s" % (name, i) for i in range(1, kwargs[name] + 1)]
        for name in names
    ]
    # Seed every factor with an empty column: when some factor has zero
    # levels the product below is empty, the loop never runs, and without
    # this the factor keys would be missing from the result altogether.
    data = {name: [] for name in names}
    # zip(*...) does an "unzip": rows of the design -> one column per factor
    columns = zip(*itertools.product(*levels))
    for name, column in zip(names, columns):
        data[name] = list(column) * repeat
    return data
def test_balanced():
    """Check the columns balanced() produces, with and without replication."""
    a_column = ["a1", "a1", "a1", "a2", "a2", "a2"]
    b_column = ["b1", "b2", "b3", "b1", "b2", "b3"]

    single = balanced(a=2, b=3)
    assert single["a"] == a_column
    assert single["b"] == b_column

    # With repeat=2 each column is the single-replicate column twice over.
    doubled = balanced(a=2, b=3, repeat=2)
    assert doubled["a"] == a_column + a_column
    assert doubled["b"] == b_column + b_column
def demo_data(*names, **kwargs):
    """demo_data(*names, nlevels=2, min_rows=5)

    Create simple categorical/numerical demo data.

    Pass in a set of variable names, and this function will return a simple
    data set using those variable names.

    Names whose first letter falls in the range "a" through "n" will be made
    categorical (with `nlevels` levels). Those that start with a "p" through
    "z" are numerical. Any other name (including an empty one, or one
    starting with "o", an uppercase letter, a digit or punctuation) is
    rejected with a :class:`PatsyError`.

    We attempt to produce a balanced design on the categorical variables,
    repeating as necessary to generate at least `min_rows` data
    points. Categorical variables are returned as a list of strings.

    Numerical data is generated by sampling from a normal distribution. A
    fixed random seed is used, so that identical calls to demo_data() will
    produce identical results. Numerical data is returned in a numpy array.

    Example:

    .. ipython::

       In [1]: patsy.demo_data("a", "b", "x", "y")
       Out[1]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2'],
        'x': array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,
                     1.86755799, -0.97727788,  0.95008842, -0.15135721]),
        'y': array([-0.10321885,  0.4105985 ,  0.14404357,  1.45427351,
                     0.76103773,  0.12167502,  0.44386323,  0.33367433])}

    :raises TypeError: if any keyword other than `nlevels` or `min_rows` is
      passed.
    :raises PatsyError: if a name does not start with one of the accepted
      lowercase letters.
    """
    nlevels = kwargs.pop("nlevels", 2)
    min_rows = kwargs.pop("min_rows", 5)
    if kwargs:
        raise TypeError("unexpected keyword arguments %r" % (kwargs,))
    numerical = set()
    categorical = {}
    for name in names:
        # Slice instead of indexing so that an empty name is reported as a
        # PatsyError like every other bad name, not as a bare IndexError.
        initial = name[:1]
        # The emptiness check is required: "" is "in" every string.
        if initial and initial in "abcdefghijklmn":
            categorical[name] = nlevels
        elif initial and initial in "pqrstuvwxyz":
            numerical.add(name)
        else:
            raise PatsyError("bad name %r" % (name,))
    # Rows in one full replicate of the categorical design; the product of an
    # empty list is 1, so purely numerical requests get exactly min_rows rows.
    balanced_design_size = np.prod(list(categorical.values()), dtype=int)
    # Smallest whole number of replicates that reaches min_rows.
    repeat = int(np.ceil(min_rows * 1.0 / balanced_design_size))
    num_rows = repeat * balanced_design_size
    data = balanced(repeat=repeat, **categorical)
    # Fixed seed, and columns drawn in sorted name order, so that identical
    # calls give identical numbers.
    r = np.random.RandomState(0)
    for name in sorted(numerical):
        data[name] = r.normal(size=num_rows)
    return data
def test_demo_data():
    """Check demo_data() column types, row counts and argument validation."""
    mixed = demo_data("a", "b", "x")
    assert sorted(mixed) == ["a", "b", "x"]
    assert mixed["a"] == ["a1", "a1", "a2", "a2", "a1", "a1", "a2", "a2"]
    assert mixed["b"] == ["b1", "b2", "b1", "b2", "b1", "b2", "b1", "b2"]
    x_column = mixed["x"]
    assert x_column.dtype == np.dtype(float)
    assert x_column.shape == (8,)

    numeric_only = demo_data("x", "y")
    assert sorted(numeric_only) == ["x", "y"]
    assert len(numeric_only["x"]) == len(numeric_only["y"]) == 5

    # (positional names, keyword options, expected number of rows)
    row_count_cases = [
        (("x",), {"min_rows": 10}, 10),
        (("a", "b", "x"), {"min_rows": 10}, 12),
        (("a", "b", "x"), {"min_rows": 10, "nlevels": 3}, 18),
    ]
    for var_names, options, expected_rows in row_count_cases:
        assert len(demo_data(*var_names, **options)["x"]) == expected_rows

    from nose.tools import assert_raises
    # A name with an unsupported first character is rejected ...
    assert_raises(PatsyError, demo_data, "a", "b", "__123")
    # ... and so is an unknown keyword argument.
    assert_raises(TypeError, demo_data, "a", "b", asdfasdf=123)
class LookupFactor(object):
    """A simple factor class that simply looks up a named entry in the given
    data.

    Useful for programmatically constructing formulas, and as a simple example
    of the factor protocol. For details see
    :ref:`expert-model-specification`.

    Example::

      dmatrix(ModelDesc([], [Term([LookupFactor("x")])]), {"x": [1, 2, 3]})

    :arg varname: The name of this variable; used as a lookup key in the
      passed in data dictionary/DataFrame/whatever.
    :arg force_categorical: If True, then treat this factor as
      categorical. (Equivalent to using :func:`C` in a regular formula, but
      of course you can't do that with a :class:`LookupFactor`.)
    :arg contrast: If given, the contrast to use; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg levels: If given, the categorical levels; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg origin: Either ``None``, or the :class:`Origin` of this factor for use
      in error reporting.

    .. versionadded:: 0.2.0
       The ``force_categorical`` and related arguments.
    """
    def __init__(self, varname,
                 force_categorical=False, contrast=None, levels=None,
                 origin=None):
        """Store the lookup key and categorical options.

        :raises ValueError: if `contrast` or `levels` is given while
          `force_categorical` is false.
        """
        self._varname = varname
        self._force_categorical = force_categorical
        self._contrast = contrast
        self._levels = levels
        self.origin = origin
        if not self._force_categorical:
            if contrast is not None:
                raise ValueError("contrast= requires force_categorical=True")
            if levels is not None:
                raise ValueError("levels= requires force_categorical=True")

    def name(self):
        """Return the variable name this factor looks up."""
        return self._varname

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._varname)

    def __eq__(self, other):
        # origin is deliberately not part of the comparison.
        return (isinstance(other, LookupFactor)
                and self._varname == other._varname
                and self._force_categorical == other._force_categorical
                and self._contrast == other._contrast
                and self._levels == other._levels)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        levels = self._levels
        # A list of levels is unhashable, which would make the factor
        # unusable as a dict key or set member. Hash it as a tuple instead;
        # equal lists give equal tuples, so this stays consistent with
        # __eq__, and every previously hashable value hashes as before.
        if isinstance(levels, list):
            levels = tuple(levels)
        return hash((LookupFactor, self._varname,
                     self._force_categorical, self._contrast, levels))

    def memorize_passes_needed(self, state, eval_env):
        """Return 0: a plain lookup needs no memorization passes."""
        return 0

    def memorize_chunk(self, state, which_pass, data):  # pragma: no cover
        # Should never be reached, since zero passes are requested above.
        assert False

    def memorize_finish(self, state, which_pass):  # pragma: no cover
        # Should never be reached, since zero passes are requested above.
        assert False

    def eval(self, memorize_state, data):
        """Return ``data[varname]``, wrapped by :func:`C` with the stored
        contrast and levels when `force_categorical` is set.
        """
        value = data[self._varname]
        if self._force_categorical:
            value = C(value, contrast=self._contrast, levels=self._levels)
        return value

    # Helper imported from patsy.util; presumably makes pickling attempts
    # fail -- see that module for the exact behaviour.
    __getstate__ = no_pickling
def test_LookupFactor():
    """Check LookupFactor naming, equality, hashing, evaluation and
    constructor validation.
    """
    factor_a = LookupFactor("a")
    assert factor_a.name() == "a"

    # Equality and hashing follow the variable name.
    assert factor_a == LookupFactor("a")
    assert factor_a != LookupFactor("b")
    assert hash(factor_a) == hash(LookupFactor("a"))
    assert hash(factor_a) != hash(LookupFactor("b"))

    # A plain factor returns the looked-up value untouched.
    for payload in (1, 2):
        assert factor_a.eval({}, {"a": payload}) == payload

    assert repr(factor_a) == "LookupFactor('a')"
    assert factor_a.origin is None
    factor_with_origin = LookupFactor("b", origin="asdf")
    assert factor_with_origin.origin == "asdf"

    # A categorical factor wraps the looked-up value together with the
    # contrast and levels it was constructed with.
    categorical_factor = LookupFactor("c", force_categorical=True,
                                      contrast="CONTRAST", levels=(1, 2))
    wrapped = categorical_factor.eval({}, {"c": [1, 1, 2]})
    assert wrapped.data == [1, 1, 2]
    assert wrapped.contrast == "CONTRAST"
    assert wrapped.levels == (1, 2)

    from nose.tools import assert_raises
    # contrast= and levels= are only accepted with force_categorical=True.
    assert_raises(ValueError, LookupFactor, "nc", contrast="CONTRAST")
    assert_raises(ValueError, LookupFactor, "nc", levels=(1, 2))

    assert_no_pickling(LookupFactor("a"))