Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2013 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# Missing data detection/handling 

6 

7# First, how do we represent missing data? (i.e., which values count as 

8# "missing"?) In the long run, we want to use numpy's NA support... but that 

9# doesn't exist yet. Until then, people use various sorts of ad-hoc 

10# things. Some things that might be considered NA: 

11# NA (eventually) 

12# NaN (in float or object arrays) 

13# None (in object arrays) 

14# np.ma.masked (in numpy.ma masked arrays) 

15# Pandas compatibility considerations: 

16# For numeric arrays, None is unconditionally converted to NaN. 

17# For object arrays (including string arrays!), None and NaN are preserved, 

18# but pandas.isnull() returns True for both. 

19# np.ma compatibility considerations: 

20# Preserving array subtypes is a huge pain, because it means that we can't 

21# just call 'asarray' and be done... we already jump through tons of hoops 

22# to write code that can handle both ndarrays and pandas objects, and

23# just thinking about adding another item to this list makes me tired. So 

24# for now we don't support np.ma missing values. Use pandas! 

25 

26# Next, what should be done once we find missing data? R's options: 

27# -- throw away those rows (from all aligned matrices) 

28# -- with or without preserving information on which rows were discarded 

29# -- error out 

30# -- carry on 

31# The 'carry on' option requires that we have some way to represent NA in our 

32# output array. To avoid further solidifying the use of NaN for this purpose, 

33# we'll leave this option out for now, until real NA support is 

34# available. Also, we always preserve information on which rows were 

35# discarded, using the pandas index functionality (currently this is only 

36# returned to the original caller if they used return_type="dataframe", 

37# though). 

38 

39import numpy as np 

40from patsy import PatsyError 

41from patsy.util import (safe_isnan, safe_scalar_isnan, 

42 no_pickling, assert_no_pickling) 

43 

44# These are made available in the patsy.* namespace 

45__all__ = ["NAAction"] 

46 

47_valid_NA_types = ["None", "NaN"] 

48_valid_NA_responses = ["raise", "drop"] 

49def _desc_options(options): 

50 return ", ".join([repr(opt) for opt in options]) 

51 

class NAAction(object):
    """An :class:`NAAction` object defines a strategy for handling missing
    data.

    "NA" is short for "Not Available", and is used to refer to any value which
    is somehow unmeasured or unavailable. In the long run, it is devoutly
    hoped that numpy will gain first-class missing value support. Until then,
    we work around this lack as best we're able.

    There are two parts to this: First, we have to determine what counts as
    missing data. For numerical data, the default is to treat NaN values
    (e.g., ``numpy.nan``) as missing. For categorical data, the default is to
    treat NaN values, and also the Python object None, as missing. (This is
    consistent with how pandas does things, so if you're already using
    None/NaN to mark missing data in your pandas DataFrames, you're good to
    go.)

    Second, we have to decide what to do with any missing data when we
    encounter it. One option is to simply discard any rows which contain
    missing data from our design matrices (``drop``). Another option is to
    raise an error (``raise``). A third option would be to simply let the
    missing values pass through into the returned design matrices. However,
    this last option is not yet implemented, because of the lack of any
    standard way to represent missing values in arbitrary numpy matrices;
    we're hoping numpy will get this sorted out before we standardize on
    anything ourselves.

    You can control how patsy handles missing data through the ``NA_action=``
    argument to functions like :func:`build_design_matrices` and
    :func:`dmatrix`. If all you want to do is to choose between ``drop`` and
    ``raise`` behaviour, you can pass one of those strings as the
    ``NA_action=`` argument directly. If you want more fine-grained control
    over how missing values are detected and handled, then you can create an
    instance of this class, or your own object that implements the same
    interface, and pass that as the ``NA_action=`` argument instead.
    """
    # The default for NA_types is an immutable tuple rather than a list, so
    # the shared default object can never be modified by accident. The value
    # is converted with tuple() below either way, so callers see no change.
    def __init__(self, on_NA="drop", NA_types=("None", "NaN")):
        """The :class:`NAAction` constructor takes the following arguments:

        :arg on_NA: How to handle missing values. The default is ``"drop"``,
          which removes all rows from all matrices which contain any missing
          values. Also available is ``"raise"``, which raises an exception
          when any missing values are encountered.
        :arg NA_types: Which rules are used to identify missing values, as a
          list (or other non-string iterable) of strings. Allowed values are:

          * ``"None"``: treat the ``None`` object as missing in categorical
            data.
          * ``"NaN"``: treat floating point NaN values as missing in
            categorical and numerical data.

        :raises ValueError: if `on_NA` is not a recognised action, if
          `NA_types` is a bare string, or if `NA_types` contains an
          unrecognised entry.

        .. versionadded:: 0.2.0
        """
        self.on_NA = on_NA
        if self.on_NA not in _valid_NA_responses:
            raise ValueError("invalid on_NA action %r "
                             "(should be one of %s)"
                             % (on_NA, _desc_options(_valid_NA_responses)))
        # A bare string is itself iterable, so tuple("NaN") would quietly
        # become ('N', 'a', 'N'); reject it with a clearer message instead.
        if isinstance(NA_types, str):
            raise ValueError("NA_types should be a list of strings")
        self.NA_types = tuple(NA_types)
        for NA_type in self.NA_types:
            if NA_type not in _valid_NA_types:
                raise ValueError("invalid NA_type %r "
                                 "(should be one of %s)"
                                 % (NA_type, _desc_options(_valid_NA_types)))

    def is_categorical_NA(self, obj):
        """Return True if `obj` is a categorical NA value.

        Note that here `obj` is a single scalar value. NaN counts as NA only
        when ``"NaN"`` is in ``self.NA_types``, and ``None`` only when
        ``"None"`` is."""
        if "NaN" in self.NA_types and safe_scalar_isnan(obj):
            return True
        if "None" in self.NA_types and obj is None:
            return True
        return False

    def is_numerical_NA(self, arr):
        """Returns a 1-d mask array indicating which rows in an array of
        numerical values contain at least one NA value.

        Note that here `arr` is a numpy array or pandas DataFrame. Only the
        ``"NaN"`` NA type is consulted here; if it is not enabled, the
        returned mask is all False."""
        mask = np.zeros(arr.shape, dtype=bool)
        if "NaN" in self.NA_types:
            mask |= np.isnan(arr)
        if mask.ndim > 1:
            # Collapse the per-element mask to one flag per row.
            mask = np.any(mask, axis=1)
        return mask

    def handle_NA(self, values, is_NAs, origins):
        """Takes a set of factor values that may have NAs, and handles them
        appropriately.

        :arg values: A list of `ndarray` objects representing the data.
          These may be 1- or 2-dimensional, and may be of varying dtype. All
          will have the same number of rows (or entries, for 1-d arrays).
        :arg is_NAs: A list with the same number of entries as `values`,
          containing boolean `ndarray` objects that indicate which rows
          contain NAs in the corresponding entry in `values`.
        :arg origins: A list with the same number of entries as
          `values`, containing information on the origin of each
          value. If we encounter a problem with some particular value, we use
          the corresponding entry in `origins` as the origin argument when
          raising a :class:`PatsyError`.
        :returns: A list of new values (which may have a differing number of
          rows.)
        """
        assert len(values) == len(is_NAs) == len(origins)
        if len(values) == 0:
            # Nothing to inspect; hand the (empty) input straight back.
            return values
        if self.on_NA == "raise":
            return self._handle_NA_raise(values, is_NAs, origins)
        elif self.on_NA == "drop":
            return self._handle_NA_drop(values, is_NAs, origins)
        else: # pragma: no cover
            assert False

    def _handle_NA_raise(self, values, is_NAs, origins):
        """Raise :class:`PatsyError` for the first entry whose mask has any
        NA set, passing the matching origin; otherwise return `values`
        unchanged."""
        for is_NA, origin in zip(is_NAs, origins):
            if np.any(is_NA):
                raise PatsyError("factor contains missing values", origin)
        return values

    def _handle_NA_drop(self, values, is_NAs, origins):
        """Return a new list holding each entry of `values` with every row
        removed that is flagged in at least one of the `is_NAs` masks.

        `origins` is accepted for signature parity with
        :meth:`_handle_NA_raise` but is not used."""
        # Union of all the per-value masks: a row is bad if any value says so.
        total_mask = np.zeros(is_NAs[0].shape[0], dtype=bool)
        for is_NA in is_NAs:
            total_mask |= is_NA
        good_mask = ~total_mask
        # "..." to handle 1- versus 2-dim indexing
        return [v[good_mask, ...] for v in values]

    __getstate__ = no_pickling

184 

def test_NAAction_basic():
    """Check that the constructor rejects invalid options and that
    instances cannot be pickled."""
    invalid_kwargs = [
        {"on_NA": "pord"},              # unknown on_NA action
        {"NA_types": ("NaN", "asdf")},  # unknown NA type among valid ones
        {"NA_types": "NaN"},            # bare string instead of a list
    ]
    # Plain try/except/else (the same pattern test_NAAction_raise uses)
    # keeps this test free of any third-party test-helper import.
    for kwargs in invalid_kwargs:
        try:
            NAAction(**kwargs)
        except ValueError:
            pass
        else:
            raise AssertionError("NAAction(**%r) did not raise ValueError"
                                 % (kwargs,))

    assert_no_pickling(NAAction())

192 

def test_NAAction_NA_types_numerical():
    """Check is_numerical_NA against 1-d, single-column and two-column
    float arrays for every combination of enabled NA types."""
    nan_rows = [0, 2]
    for NA_types in [[], ["NaN"], ["None"], ["NaN", "None"]]:
        action = NAAction(NA_types=NA_types)
        nan_enabled = "NaN" in NA_types
        for extra_shape in [(), (1,), (2,)]:
            arr = np.ones((4,) + extra_shape, dtype=float)
            has_several_columns = arr.ndim > 1 and arr.shape[1] > 1
            if has_several_columns:
                # Put the NaNs in different columns: (0, 0) and (2, 1).
                arr[nan_rows, [0, 1]] = np.nan
            else:
                arr[nan_rows] = np.nan
            # Rows are flagged only when NaN detection is switched on.
            exp_NA_mask = np.zeros(4, dtype=bool)
            if nan_enabled:
                exp_NA_mask[nan_rows] = True
            assert np.array_equal(action.is_numerical_NA(arr), exp_NA_mask)

208 

def test_NAAction_NA_types_categorical():
    """Check is_categorical_NA on ordinary values, None and NaN for every
    combination of enabled NA types."""
    for NA_types in [[], ["NaN"], ["None"], ["NaN", "None"]]:
        action = NAAction(NA_types=NA_types)
        # Ordinary values are never NA, whatever is enabled.
        for ordinary in ("a", 1):
            assert not action.is_categorical_NA(ordinary)
        # None and NaN are NA exactly when their type is enabled.
        none_enabled = "None" in NA_types
        nan_enabled = "NaN" in NA_types
        assert action.is_categorical_NA(None) == none_enabled
        assert action.is_categorical_NA(np.nan) == nan_enabled

216 

def test_NAAction_drop():
    """Check that on_NA="drop" removes every row flagged by any mask from
    all of the values."""
    action = NAAction("drop")
    ints = np.asarray([-1, 2, -1, 4, 5])
    floats = np.asarray([10.0, 20.0, 30.0, 40.0, 50.0])
    matrix = np.asarray([[1.0, np.nan],
                         [3.0, 4.0],
                         [10.0, 5.0],
                         [6.0, 7.0],
                         [8.0, np.nan]])
    is_NAs = [
        np.asarray([True, False, True, False, False]),
        np.zeros(5, dtype=bool),
        np.asarray([True, False, False, False, True]),
    ]
    out_values = action.handle_NA([ints, floats, matrix], is_NAs, [None] * 3)
    # Rows 0, 2 and 4 are flagged by at least one mask; rows 1 and 3 remain.
    expected = [
        [2, 4],
        [20.0, 40.0],
        [[3.0, 4.0], [6.0, 7.0]],
    ]
    assert len(out_values) == 3
    for got, want in zip(out_values, expected):
        assert np.array_equal(got, want)

236 

def test_NAAction_raise():
    """Check that on_NA="raise" passes NA-free data through and raises a
    PatsyError carrying the origin of the first offending value."""
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_arrs = [np.asarray([1.1, 1.2]),
               np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]),
              np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
    except PatsyError as e:
        # Only the second value is flagged, so its origin must be reported.
        assert e.origin is o2
    else:
        # An explicit raise is used because a bare `assert False` is
        # stripped under -O and would let the test pass silently.
        raise AssertionError("handle_NA did not raise PatsyError")