Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2012 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# Miscellaneous utilities that are useful to users (as compared to 

6# patsy.util, which is misc. utilities useful for implementing patsy). 

7 

8# These are made available in the patsy.* namespace 

9__all__ = ["balanced", "demo_data", "LookupFactor"] 

10 

11import itertools 

12import numpy as np 

13from patsy import PatsyError 

14from patsy.categorical import C 

15from patsy.util import no_pickling, assert_no_pickling 

16 

def balanced(**kwargs):
    """balanced(factor_name=num_levels, [factor_name=num_levels, ..., repeat=1])

    Create simple balanced factorial designs for testing.

    Given some factor names and the number of desired levels for each,
    generates a balanced factorial design in the form of a data
    dictionary. For example:

    .. ipython::

       In [1]: balanced(a=2, b=3)
       Out[1]:
       {'a': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b3', 'b1', 'b2', 'b3']}

    By default it produces exactly one instance of each combination of levels,
    but if you want multiple replicates this can be accomplished via the
    `repeat` argument:

    .. ipython::

       In [2]: balanced(a=2, b=2, repeat=2)
       Out[2]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2']}

    Factors are processed in sorted name order, so the alphabetically first
    factor varies slowest. The returned dictionary always contains one entry
    per requested factor; if the design is empty (some factor has zero
    levels, or ``repeat=0``) every entry is an empty list.

    :arg repeat: Number of times the full design is repeated (default 1).
    :returns: A dict mapping each factor name to a list of level-name
      strings of the form ``"<name><i>"``, with ``i`` counting from 1.
    """
    repeat = kwargs.pop("repeat", 1)
    names = sorted(kwargs)
    levels = []
    for name in names:
        level_count = kwargs[name]
        levels.append(["%s%s" % (name, i) for i in range(1, level_count + 1)])
    # Build one column per factor up front.  Unzipping the product with
    # zip(*...) would yield no columns at all when the product is empty,
    # silently dropping every factor from the result.
    columns = [[] for _ in names]
    for combination in itertools.product(*levels):
        for column, level in zip(columns, combination):
            column.append(level)
    data = {}
    for name, column in zip(names, columns):
        data[name] = column * repeat
    return data

56 

def test_balanced():
    """Check level ordering and the effect of ``repeat`` in balanced()."""
    # One replicate: "a" varies slowest, "b" cycles fastest.
    column_a = ["a1"] * 3 + ["a2"] * 3
    column_b = ["b1", "b2", "b3"] * 2

    single = balanced(a=2, b=3)
    assert single["a"] == column_a
    assert single["b"] == column_b

    # Two replicates: the whole single-replicate column is repeated.
    doubled = balanced(a=2, b=3, repeat=2)
    assert doubled["a"] == column_a * 2
    assert doubled["b"] == column_b * 2

66 

def demo_data(*names, **kwargs):
    """demo_data(*names, nlevels=2, min_rows=5)

    Create simple categorical/numerical demo data.

    Pass in a set of variable names, and this function will return a simple
    data set using those variable names.

    Names whose first letter falls in the range "a" through "n" will be made
    categorical (with `nlevels` levels). Those that start with a "p" through
    "z" are numerical. Any other name (including an empty one) is rejected
    with a :class:`PatsyError`.

    We attempt to produce a balanced design on the categorical variables,
    repeating as necessary to generate at least `min_rows` data
    points. Categorical variables are returned as a list of strings.

    Numerical data is generated by sampling from a normal distribution. A
    fixed random seed is used, so that identical calls to demo_data() will
    produce identical results. Numerical data is returned in a numpy array.

    Example:

    .. ipython::

       In [1]: patsy.demo_data("a", "b", "x", "y")
       Out[1]:
       {'a': ['a1', 'a1', 'a2', 'a2', 'a1', 'a1', 'a2', 'a2'],
        'b': ['b1', 'b2', 'b1', 'b2', 'b1', 'b2', 'b1', 'b2'],
        'x': array([ 1.76405235,  0.40015721,  0.97873798,  2.2408932 ,
                     1.86755799, -0.97727788,  0.95008842, -0.15135721]),
        'y': array([-0.10321885,  0.4105985 ,  0.14404357,  1.45427351,
                     0.76103773,  0.12167502,  0.44386323,  0.33367433])}

    :raises TypeError: If any keyword argument other than `nlevels` or
      `min_rows` is given.
    :raises PatsyError: If a name does not start with one of the letters
      listed above.
    """
    nlevels = kwargs.pop("nlevels", 2)
    min_rows = kwargs.pop("min_rows", 5)
    if kwargs:
        raise TypeError("unexpected keyword arguments %r" % (kwargs,))
    numerical = set()
    categorical = {}
    for name in names:
        if not name:
            # Indexing name[0] on an empty name would raise a bare
            # IndexError; report it like any other unusable name instead.
            raise PatsyError("bad name %r" % (name,))
        if name[0] in "abcdefghijklmn":
            categorical[name] = nlevels
        elif name[0] in "pqrstuvwxyz":
            numerical.add(name)
        else:
            raise PatsyError("bad name %r" % (name,))
    # With no categorical variables the product of an empty list is 1, so
    # the design degenerates to exactly `min_rows` rows.
    balanced_design_size = np.prod(list(categorical.values()), dtype=int)
    # Smallest whole number of replicates that reaches at least min_rows.
    repeat = int(np.ceil(min_rows * 1.0 / balanced_design_size))
    num_rows = repeat * balanced_design_size
    data = balanced(repeat=repeat, **categorical)
    # Fixed seed plus sorted iteration order keeps the output reproducible
    # regardless of the order in which the names were passed.
    r = np.random.RandomState(0)
    for name in sorted(numerical):
        data[name] = r.normal(size=num_rows)
    return data

121 

def test_demo_data():
    """Exercise demo_data(): keys, column contents, row counts and errors."""
    # Mixed categorical + numerical request.
    mixed = demo_data("a", "b", "x")
    assert sorted(mixed.keys()) == ["a", "b", "x"]
    assert mixed["a"] == ["a1", "a1", "a2", "a2", "a1", "a1", "a2", "a2"]
    assert mixed["b"] == ["b1", "b2", "b1", "b2", "b1", "b2", "b1", "b2"]
    numeric_column = mixed["x"]
    assert numeric_column.dtype == np.dtype(float)
    assert numeric_column.shape == (8,)

    # Purely numerical request: row count falls back to the min_rows default.
    numeric_only = demo_data("x", "y")
    assert sorted(numeric_only.keys()) == ["x", "y"]
    assert len(numeric_only["x"]) == len(numeric_only["y"])
    assert len(numeric_only["y"]) == 5

    # Row counts are rounded up to a whole number of balanced replicates.
    row_count_cases = [
        (("x",), {"min_rows": 10}, 10),
        (("a", "b", "x"), {"min_rows": 10}, 12),
        (("a", "b", "x"), {"min_rows": 10, "nlevels": 3}, 18),
    ]
    for variable_names, options, expected_rows in row_count_cases:
        assert len(demo_data(*variable_names, **options)["x"]) == expected_rows

    from nose.tools import assert_raises
    # A name outside the recognised letter ranges is rejected...
    assert_raises(PatsyError, demo_data, "a", "b", "__123")
    # ...and so is an unknown keyword argument.
    assert_raises(TypeError, demo_data, "a", "b", asdfasdf=123)

141 

class LookupFactor(object):
    """A simple factor class that simply looks up a named entry in the given
    data.

    Useful for programmatically constructing formulas, and as a simple example
    of the factor protocol. For details see
    :ref:`expert-model-specification`.

    Example::

      dmatrix(ModelDesc([], [Term([LookupFactor("x")])]), {"x": [1, 2, 3]})

    :arg varname: The name of this variable; used as a lookup key in the
      passed in data dictionary/DataFrame/whatever.
    :arg force_categorical: If True, then treat this factor as
      categorical. (Equivalent to using :func:`C` in a regular formula, but
      of course you can't do that with a :class:`LookupFactor`.)
    :arg contrast: If given, the contrast to use; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg levels: If given, the categorical levels; see :func:`C`. (Requires
      ``force_categorical=True``.)
    :arg origin: Either ``None``, or the :class:`Origin` of this factor for use
      in error reporting.

    :raises ValueError: If ``contrast`` or ``levels`` is given without
      ``force_categorical=True``.

    .. versionadded:: 0.2.0
       The ``force_categorical`` and related arguments.
    """
    def __init__(self, varname,
                 force_categorical=False, contrast=None, levels=None,
                 origin=None):
        self._varname = varname
        self._force_categorical = force_categorical
        self._contrast = contrast
        self._levels = levels
        self.origin = origin
        # contrast/levels are only consulted when wrapping the value in C(),
        # so accepting them otherwise would silently ignore them.
        if not self._force_categorical:
            if contrast is not None:
                raise ValueError("contrast= requires force_categorical=True")
            if levels is not None:
                raise ValueError("levels= requires force_categorical=True")

    def name(self):
        """Return the variable name this factor looks up."""
        return self._varname

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._varname)

    def __eq__(self, other):
        # `origin` is deliberately not compared: it is only stored for error
        # reporting and takes no part in equality or hashing.
        return (isinstance(other, LookupFactor)
                and self._varname == other._varname
                and self._force_categorical == other._force_categorical
                and self._contrast == other._contrast
                and self._levels == other._levels)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        levels = self._levels
        if isinstance(levels, list):
            # Lists are unhashable, which would make a factor created with
            # ``levels=[...]`` unusable as a dict key or set member.  Equal
            # lists convert to equal tuples, so this stays consistent with
            # __eq__.
            levels = tuple(levels)
        return hash((LookupFactor, self._varname,
                     self._force_categorical, self._contrast, levels))

    def memorize_passes_needed(self, state, eval_env):
        """Return 0: this factor requests no memorization passes."""
        return 0

    def memorize_chunk(self, state, which_pass, data): # pragma: no cover
        # Never expected to run, since memorize_passes_needed() returns 0.
        assert False

    def memorize_finish(self, state, which_pass): # pragma: no cover
        # Never expected to run, since memorize_passes_needed() returns 0.
        assert False

    def eval(self, memorize_state, data):
        """Return ``data[varname]``, wrapped in :func:`C` with the stored
        contrast and levels when ``force_categorical`` is set.
        """
        value = data[self._varname]
        if self._force_categorical:
            value = C(value, contrast=self._contrast, levels=self._levels)
        return value

    # Instances are not meant to be pickled (see patsy.util.no_pickling).
    __getstate__ = no_pickling

219 

def test_LookupFactor():
    """Cover naming, equality/hashing, evaluation, origin, the categorical
    options and the no-pickling guarantee of LookupFactor.
    """
    factor_a = LookupFactor("a")
    same_as_a = LookupFactor("a")
    factor_b = LookupFactor("b")

    assert factor_a.name() == "a"
    # Equality and hashing are driven by the variable name.
    assert factor_a == same_as_a
    assert factor_a != factor_b
    assert hash(factor_a) == hash(same_as_a)
    assert hash(factor_a) != hash(factor_b)
    # Evaluation is a plain lookup in the supplied data.
    for stored in (1, 2):
        assert factor_a.eval({}, {"a": stored}) == stored
    assert repr(factor_a) == "LookupFactor('a')"
    assert factor_a.origin is None
    with_origin = LookupFactor("b", origin="asdf")
    assert with_origin.origin == "asdf"

    # force_categorical wraps the looked-up value, forwarding the options.
    categorical_factor = LookupFactor("c", force_categorical=True,
                                      contrast="CONTRAST", levels=(1, 2))
    wrapped = categorical_factor.eval({}, {"c": [1, 1, 2]})
    assert wrapped.data == [1, 1, 2]
    assert wrapped.contrast == "CONTRAST"
    assert wrapped.levels == (1, 2)

    from nose.tools import assert_raises
    # The categorical-only options are rejected without force_categorical.
    assert_raises(ValueError, LookupFactor, "nc", contrast="CONTRAST")
    assert_raises(ValueError, LookupFactor, "nc", levels=(1, 2))

    assert_no_pickling(LookupFactor("a"))