Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/redundancy.py : 20%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# This file has the code that figures out how each factor in some given Term
6# should be coded. This is complicated by dealing with models with categorical
7# factors like:
8# 1 + a + a:b
9# then technically 'a' (which represents the space of vectors that can be
10# produced as linear combinations of the dummy coding of the levels of the
11# factor a) is collinear with the intercept, and 'a:b' (which represents the
12# space of vectors that can be produced as linear combinations of the dummy
13# coding *of a new factor whose levels are the cartesian product of a and b)
14# is collinear with both 'a' and the intercept.
15#
16# In such a case, the rule is that we find some way to code each term so that
17# the full space of vectors that it represents *is present in the model* BUT
18# there is no collinearity between the different terms. In effect, we have to
19# choose a set of vectors that spans everything that that term wants to span,
20# *except* that part of the vector space which was already spanned by earlier
21# terms.
23# How? We replace each term with the set of "subterms" that it covers, like
24# so:
25# 1 -> ()
26# a -> (), a-
27# a:b -> (), a-, b-, a-:b-
28# where "-" means "coded so as not to span the intercept". So that example
29# above expands to
30# [()] + [() + a-] + [() + a- + b- + a-:b-]
31# so we go through from left to right, and for each term we:
32# 1) toss out all the subterms that have already been used (this is a simple
33# equality test, no magic)
34# 2) simplify the terms that are left, according to rules like
35# () + a- = a+
36# (here + means, "coded to span the intercept")
37# 3) use the resulting subterm list as our coding for this term!
38# So in the above, we go:
39# (): stays the same, coded as intercept
40# () + a-: reduced to just a-, which is what we code
41# () + a- + b- + a-:b-: reduced to b- + a-:b-, which is simplified to a+:b-.
43from __future__ import print_function
45from patsy.util import no_pickling
47# This should really be a named tuple, but those don't exist until Python
48# 2.6...
49class _ExpandedFactor(object):
50 """A factor, with an additional annotation for whether it is coded
51 full-rank (includes_intercept=True) or not.
53 These objects are treated as immutable."""
54 def __init__(self, includes_intercept, factor):
55 self.includes_intercept = includes_intercept
56 self.factor = factor
58 def __hash__(self):
59 return hash((_ExpandedFactor, self.includes_intercept, self.factor))
61 def __eq__(self, other):
62 return (isinstance(other, _ExpandedFactor)
63 and other.includes_intercept == self.includes_intercept
64 and other.factor == self.factor)
66 def __ne__(self, other):
67 return not self == other
69 def __repr__(self):
70 if self.includes_intercept:
71 suffix = "+"
72 else:
73 suffix = "-"
74 return "%r%s" % (self.factor, suffix)
76 __getstate__ = no_pickling
78class _Subterm(object):
79 "Also immutable."
80 def __init__(self, efactors):
81 self.efactors = frozenset(efactors)
83 def can_absorb(self, other):
84 # returns True if 'self' is like a-:b-, and 'other' is like a-
85 return (len(self.efactors) - len(other.efactors) == 1
86 and self.efactors.issuperset(other.efactors))
88 def absorb(self, other):
89 diff = self.efactors.difference(other.efactors)
90 assert len(diff) == 1
91 efactor = list(diff)[0]
92 assert not efactor.includes_intercept
93 new_factors = set(other.efactors)
94 new_factors.add(_ExpandedFactor(True, efactor.factor))
95 return _Subterm(new_factors)
97 def __hash__(self):
98 return hash((_Subterm, self.efactors))
100 def __eq__(self, other):
101 return (isinstance(other, _Subterm)
102 and self.efactors == self.efactors)
104 def __ne__(self, other):
105 return not self == other
107 def __repr__(self):
108 return "%s(%r)" % (self.__class__.__name__, list(self.efactors))
110 __getstate__ = no_pickling
112# For testing: takes a shorthand description of a list of subterms like
113# [(), ("a-",), ("a-", "b+")]
114# and expands it into a list of _Subterm and _ExpandedFactor objects.
115def _expand_test_abbrevs(short_subterms):
116 subterms = []
117 for subterm in short_subterms:
118 factors = []
119 for factor_name in subterm:
120 assert factor_name[-1] in ("+", "-")
121 factors.append(_ExpandedFactor(factor_name[-1] == "+",
122 factor_name[:-1]))
123 subterms.append(_Subterm(factors))
124 return subterms
126def test__Subterm():
127 s_ab = _expand_test_abbrevs([["a-", "b-"]])[0]
128 s_abc = _expand_test_abbrevs([["a-", "b-", "c-"]])[0]
129 s_null = _expand_test_abbrevs([[]])[0]
130 s_cd = _expand_test_abbrevs([["c-", "d-"]])[0]
131 s_a = _expand_test_abbrevs([["a-"]])[0]
132 s_ap = _expand_test_abbrevs([["a+"]])[0]
133 s_abp = _expand_test_abbrevs([["a-", "b+"]])[0]
134 for bad in s_abc, s_null, s_cd, s_ap, s_abp:
135 assert not s_ab.can_absorb(bad)
136 assert s_ab.can_absorb(s_a)
137 assert s_ab.absorb(s_a) == s_abp
139# Importantly, this preserves the order of the input. Both the items inside
140# each subset are in the order they were in the original tuple, and the tuples
141# are emitted so that they're sorted with respect to their elements position
142# in the original tuple.
143def _subsets_sorted(tupl):
144 def helper(seq):
145 if not seq:
146 yield ()
147 else:
148 obj = seq[0]
149 for subset in _subsets_sorted(seq[1:]):
150 yield subset
151 yield (obj,) + subset
152 # Transform each obj -> (idx, obj) tuple, so that we can later sort them
153 # by their position in the original list.
154 expanded = list(enumerate(tupl))
155 expanded_subsets = list(helper(expanded))
156 # This exploits Python's stable sort: we want short before long, and ties
157 # broken by natural ordering on the (idx, obj) entries in each subset. So
158 # we sort by the latter first, then by the former.
159 expanded_subsets.sort()
160 expanded_subsets.sort(key=len)
161 # And finally, we strip off the idx's:
162 for subset in expanded_subsets:
163 yield tuple([obj for (idx, obj) in subset])
165def test__subsets_sorted():
166 assert list(_subsets_sorted((1, 2))) == [(), (1,), (2,), (1, 2)]
167 assert (list(_subsets_sorted((1, 2, 3)))
168 == [(), (1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)])
169 assert len(list(_subsets_sorted(range(5)))) == 2 ** 5
171def _simplify_one_subterm(subterms):
172 # We simplify greedily from left to right.
173 # Returns True if succeeded, False otherwise
174 for short_i, short_subterm in enumerate(subterms):
175 for long_i, long_subterm in enumerate(subterms[short_i + 1:]):
176 if long_subterm.can_absorb(short_subterm):
177 new_subterm = long_subterm.absorb(short_subterm)
178 subterms[short_i + 1 + long_i] = new_subterm
179 subterms.pop(short_i)
180 return True
181 return False
183def _simplify_subterms(subterms):
184 while _simplify_one_subterm(subterms):
185 pass
187def test__simplify_subterms():
188 def t(given, expected):
189 given = _expand_test_abbrevs(given)
190 expected = _expand_test_abbrevs(expected)
191 print("testing if:", given, "->", expected)
192 _simplify_subterms(given)
193 assert given == expected
194 t([("a-",)], [("a-",)])
195 t([(), ("a-",)], [("a+",)])
196 t([(), ("a-",), ("b-",), ("a-", "b-")], [("a+", "b+")])
197 t([(), ("a-",), ("a-", "b-")], [("a+",), ("a-", "b-")])
198 t([("a-",), ("b-",), ("a-", "b-")], [("b-",), ("a-", "b+")])
200# 'term' is a Term
201# 'numeric_factors' is any set-like object which lists the
202# numeric/non-categorical factors in this term. Such factors are just
203# ignored by this routine.
204# 'used_subterms' is a set which records which subterms have previously been
205# used. E.g., a:b has subterms (), a, b, a:b, and if we're processing
206# y ~ a + a:b
207# then by the time we reach a:b, the () and a subterms will have already
208# been used. This is an in/out argument, and should be treated as opaque by
209# callers -- really it is a way for multiple invocations of this routine to
210# talk to each other. Each time it is called, this routine adds the subterms
211# of each factor to this set in place. So the first time this routine is
212# called, pass in an empty set, and then just keep passing the same set to
213# any future calls.
214# Returns: a list of dicts. Each dict maps from factors to booleans. The
215# coding for the given term should use a full-rank contrast for those factors
216# which map to True, a (n-1)-rank contrast for those factors which map to
217# False, and any factors which are not mentioned are numeric and should be
218# added back in. These dicts should add columns to the design matrix from left
219# to right.
220def pick_contrasts_for_term(term, numeric_factors, used_subterms):
221 categorical_factors = [f for f in term.factors if f not in numeric_factors]
222 # Converts a term into an expanded list of subterms like:
223 # a:b -> 1 + a- + b- + a-:b-
224 # and discards the ones that have already been used.
225 subterms = []
226 for subset in _subsets_sorted(categorical_factors):
227 subterm = _Subterm([_ExpandedFactor(False, f) for f in subset])
228 if subterm not in used_subterms:
229 subterms.append(subterm)
230 used_subterms.update(subterms)
231 _simplify_subterms(subterms)
232 factor_codings = []
233 for subterm in subterms:
234 factor_coding = {}
235 for expanded in subterm.efactors:
236 factor_coding[expanded.factor] = expanded.includes_intercept
237 factor_codings.append(factor_coding)
238 return factor_codings
240def test_pick_contrasts_for_term():
241 from patsy.desc import Term
242 used = set()
243 codings = pick_contrasts_for_term(Term([]), set(), used)
244 assert codings == [{}]
245 codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used)
246 assert codings == [{"a": False}]
247 codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used)
248 assert codings == [{"a": True, "b": False}]
249 used_snapshot = set(used)
250 codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used)
251 assert codings == [{"d": False}, {"c": False, "d": True}]
252 # Do it again backwards, to make sure we're deterministic with respect to
253 # order:
254 codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot)
255 assert codings == [{"c": False}, {"c": True, "d": False}]