Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# This file has the code that figures out how each factor in some given Term 

6# should be coded. This is complicated by dealing with models with categorical 

7# factors like: 

8# 1 + a + a:b 

9# then technically 'a' (which represents the space of vectors that can be 

10# produced as linear combinations of the dummy coding of the levels of the 

11# factor a) is collinear with the intercept, and 'a:b' (which represents the 

12# space of vectors that can be produced as linear combinations of the dummy 

13# coding *of a new factor whose levels are the cartesian product of a and b) 

14# is collinear with both 'a' and the intercept. 

15# 

16# In such a case, the rule is that we find some way to code each term so that 

17# the full space of vectors that it represents *is present in the model* BUT 

18# there is no collinearity between the different terms. In effect, we have to 

19# choose a set of vectors that spans everything that that term wants to span, 

20# *except* that part of the vector space which was already spanned by earlier 

21# terms. 

22 

23# How? We replace each term with the set of "subterms" that it covers, like 

24# so: 

25# 1 -> () 

26# a -> (), a- 

27# a:b -> (), a-, b-, a-:b- 

28# where "-" means "coded so as not to span the intercept". So that example 

29# above expands to 

30# [()] + [() + a-] + [() + a- + b- + a-:b-] 

31# so we go through from left to right, and for each term we: 

32# 1) toss out all the subterms that have already been used (this is a simple 

33# equality test, no magic) 

34# 2) simplify the terms that are left, according to rules like 

35# () + a- = a+ 

36# (here + means, "coded to span the intercept") 

37# 3) use the resulting subterm list as our coding for this term! 

38# So in the above, we go: 

39# (): stays the same, coded as intercept 

40# () + a-: reduced to just a-, which is what we code 

41# () + a- + b- + a-:b-: reduced to b- + a-:b-, which is simplified to a+:b-. 

42 

43from __future__ import print_function 

44 

45from patsy.util import no_pickling 

46 

47# This should really be a named tuple, but those don't exist until Python 

48# 2.6... 

49class _ExpandedFactor(object): 

50 """A factor, with an additional annotation for whether it is coded 

51 full-rank (includes_intercept=True) or not. 

52 

53 These objects are treated as immutable.""" 

54 def __init__(self, includes_intercept, factor): 

55 self.includes_intercept = includes_intercept 

56 self.factor = factor 

57 

58 def __hash__(self): 

59 return hash((_ExpandedFactor, self.includes_intercept, self.factor)) 

60 

61 def __eq__(self, other): 

62 return (isinstance(other, _ExpandedFactor) 

63 and other.includes_intercept == self.includes_intercept 

64 and other.factor == self.factor) 

65 

66 def __ne__(self, other): 

67 return not self == other 

68 

69 def __repr__(self): 

70 if self.includes_intercept: 

71 suffix = "+" 

72 else: 

73 suffix = "-" 

74 return "%r%s" % (self.factor, suffix) 

75 

76 __getstate__ = no_pickling 

77 

78class _Subterm(object): 

79 "Also immutable." 

80 def __init__(self, efactors): 

81 self.efactors = frozenset(efactors) 

82 

83 def can_absorb(self, other): 

84 # returns True if 'self' is like a-:b-, and 'other' is like a- 

85 return (len(self.efactors) - len(other.efactors) == 1 

86 and self.efactors.issuperset(other.efactors)) 

87 

88 def absorb(self, other): 

89 diff = self.efactors.difference(other.efactors) 

90 assert len(diff) == 1 

91 efactor = list(diff)[0] 

92 assert not efactor.includes_intercept 

93 new_factors = set(other.efactors) 

94 new_factors.add(_ExpandedFactor(True, efactor.factor)) 

95 return _Subterm(new_factors) 

96 

97 def __hash__(self): 

98 return hash((_Subterm, self.efactors)) 

99 

100 def __eq__(self, other): 

101 return (isinstance(other, _Subterm) 

102 and self.efactors == self.efactors) 

103 

104 def __ne__(self, other): 

105 return not self == other 

106 

107 def __repr__(self): 

108 return "%s(%r)" % (self.__class__.__name__, list(self.efactors)) 

109 

110 __getstate__ = no_pickling 

111 

112# For testing: takes a shorthand description of a list of subterms like 

113# [(), ("a-",), ("a-", "b+")] 

114# and expands it into a list of _Subterm and _ExpandedFactor objects. 

115def _expand_test_abbrevs(short_subterms): 

116 subterms = [] 

117 for subterm in short_subterms: 

118 factors = [] 

119 for factor_name in subterm: 

120 assert factor_name[-1] in ("+", "-") 

121 factors.append(_ExpandedFactor(factor_name[-1] == "+", 

122 factor_name[:-1])) 

123 subterms.append(_Subterm(factors)) 

124 return subterms 

125 

126def test__Subterm(): 

127 s_ab = _expand_test_abbrevs([["a-", "b-"]])[0] 

128 s_abc = _expand_test_abbrevs([["a-", "b-", "c-"]])[0] 

129 s_null = _expand_test_abbrevs([[]])[0] 

130 s_cd = _expand_test_abbrevs([["c-", "d-"]])[0] 

131 s_a = _expand_test_abbrevs([["a-"]])[0] 

132 s_ap = _expand_test_abbrevs([["a+"]])[0] 

133 s_abp = _expand_test_abbrevs([["a-", "b+"]])[0] 

134 for bad in s_abc, s_null, s_cd, s_ap, s_abp: 

135 assert not s_ab.can_absorb(bad) 

136 assert s_ab.can_absorb(s_a) 

137 assert s_ab.absorb(s_a) == s_abp 

138 

139# Importantly, this preserves the order of the input. Both the items inside 

140# each subset are in the order they were in the original tuple, and the tuples 

141# are emitted so that they're sorted with respect to their elements position 

142# in the original tuple. 

143def _subsets_sorted(tupl): 

144 def helper(seq): 

145 if not seq: 

146 yield () 

147 else: 

148 obj = seq[0] 

149 for subset in _subsets_sorted(seq[1:]): 

150 yield subset 

151 yield (obj,) + subset 

152 # Transform each obj -> (idx, obj) tuple, so that we can later sort them 

153 # by their position in the original list. 

154 expanded = list(enumerate(tupl)) 

155 expanded_subsets = list(helper(expanded)) 

156 # This exploits Python's stable sort: we want short before long, and ties 

157 # broken by natural ordering on the (idx, obj) entries in each subset. So 

158 # we sort by the latter first, then by the former. 

159 expanded_subsets.sort() 

160 expanded_subsets.sort(key=len) 

161 # And finally, we strip off the idx's: 

162 for subset in expanded_subsets: 

163 yield tuple([obj for (idx, obj) in subset]) 

164 

165def test__subsets_sorted(): 

166 assert list(_subsets_sorted((1, 2))) == [(), (1,), (2,), (1, 2)] 

167 assert (list(_subsets_sorted((1, 2, 3))) 

168 == [(), (1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]) 

169 assert len(list(_subsets_sorted(range(5)))) == 2 ** 5 

170 

171def _simplify_one_subterm(subterms): 

172 # We simplify greedily from left to right. 

173 # Returns True if succeeded, False otherwise 

174 for short_i, short_subterm in enumerate(subterms): 

175 for long_i, long_subterm in enumerate(subterms[short_i + 1:]): 

176 if long_subterm.can_absorb(short_subterm): 

177 new_subterm = long_subterm.absorb(short_subterm) 

178 subterms[short_i + 1 + long_i] = new_subterm 

179 subterms.pop(short_i) 

180 return True 

181 return False 

182 

183def _simplify_subterms(subterms): 

184 while _simplify_one_subterm(subterms): 

185 pass 

186 

187def test__simplify_subterms(): 

188 def t(given, expected): 

189 given = _expand_test_abbrevs(given) 

190 expected = _expand_test_abbrevs(expected) 

191 print("testing if:", given, "->", expected) 

192 _simplify_subterms(given) 

193 assert given == expected 

194 t([("a-",)], [("a-",)]) 

195 t([(), ("a-",)], [("a+",)]) 

196 t([(), ("a-",), ("b-",), ("a-", "b-")], [("a+", "b+")]) 

197 t([(), ("a-",), ("a-", "b-")], [("a+",), ("a-", "b-")]) 

198 t([("a-",), ("b-",), ("a-", "b-")], [("b-",), ("a-", "b+")]) 

199 

200# 'term' is a Term 

201# 'numeric_factors' is any set-like object which lists the 

202# numeric/non-categorical factors in this term. Such factors are just 

203# ignored by this routine. 

204# 'used_subterms' is a set which records which subterms have previously been 

205# used. E.g., a:b has subterms (), a, b, a:b, and if we're processing 

206# y ~ a + a:b 

207# then by the time we reach a:b, the () and a subterms will have already 

208# been used. This is an in/out argument, and should be treated as opaque by 

209# callers -- really it is a way for multiple invocations of this routine to 

210# talk to each other. Each time it is called, this routine adds the subterms 

211# of each factor to this set in place. So the first time this routine is 

212# called, pass in an empty set, and then just keep passing the same set to 

213# any future calls. 

214# Returns: a list of dicts. Each dict maps from factors to booleans. The 

215# coding for the given term should use a full-rank contrast for those factors 

216# which map to True, a (n-1)-rank contrast for those factors which map to 

217# False, and any factors which are not mentioned are numeric and should be 

218# added back in. These dicts should add columns to the design matrix from left 

219# to right. 

220def pick_contrasts_for_term(term, numeric_factors, used_subterms): 

221 categorical_factors = [f for f in term.factors if f not in numeric_factors] 

222 # Converts a term into an expanded list of subterms like: 

223 # a:b -> 1 + a- + b- + a-:b- 

224 # and discards the ones that have already been used. 

225 subterms = [] 

226 for subset in _subsets_sorted(categorical_factors): 

227 subterm = _Subterm([_ExpandedFactor(False, f) for f in subset]) 

228 if subterm not in used_subterms: 

229 subterms.append(subterm) 

230 used_subterms.update(subterms) 

231 _simplify_subterms(subterms) 

232 factor_codings = [] 

233 for subterm in subterms: 

234 factor_coding = {} 

235 for expanded in subterm.efactors: 

236 factor_coding[expanded.factor] = expanded.includes_intercept 

237 factor_codings.append(factor_coding) 

238 return factor_codings 

239 

240def test_pick_contrasts_for_term(): 

241 from patsy.desc import Term 

242 used = set() 

243 codings = pick_contrasts_for_term(Term([]), set(), used) 

244 assert codings == [{}] 

245 codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used) 

246 assert codings == [{"a": False}] 

247 codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used) 

248 assert codings == [{"a": True, "b": False}] 

249 used_snapshot = set(used) 

250 codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used) 

251 assert codings == [{"d": False}, {"c": False, "d": True}] 

252 # Do it again backwards, to make sure we're deterministic with respect to 

253 # order: 

254 codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot) 

255 assert codings == [{"c": False}, {"c": True, "d": False}]