# -*- coding: utf-8 -*-
"""
A collection of smooth penalty functions.

Penalties on vectors take a vector argument and return a scalar
penalty. The gradient of the penalty is a vector with the same shape
as the input value.

Penalties on covariance matrices take two arguments: the matrix and
its inverse, both in unpacked (square) form. The returned penalty is
a scalar, and the gradient is returned as a vector that contains the
gradient with respect to the free elements in the lower triangle of
the covariance matrix.

All penalties are subtracted from the log-likelihood, so greater
penalty values correspond to a greater degree of penalization.

The penalties should be smooth so that they can be subtracted from log
likelihood functions and optimized using standard methods (i.e., L1
penalties do not belong here).
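
As an illustrative sketch of the vector-penalty calling convention
(using the `L2` class defined below):

>>> import numpy as np
>>> from statsmodels.base._penalties import L2
>>> pen = L2(weights=1.)
>>> float(pen.func(np.array([1., 2.])))   # scalar penalty value
5.0
>>> pen.deriv(np.array([1., 2.]))         # gradient, same shape as input
array([2., 4.])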
21"""
22import numpy as np


class Penalty(object):
    """
    A class for representing a scalar-valued penalty.

    Parameters
    ----------
    weights : array_like
        A vector of weights that determines the weight of the penalty
        for each parameter.

    Notes
    -----
    The class has a member called `alpha` that scales the weights.
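
    For example (illustrative, using the `L2` subclass defined below),
    setting ``alpha`` rescales the penalty:

    >>> import numpy as np
    >>> pen = L2(weights=1.)
    >>> pen.alpha = 0.5
    >>> float(pen.func(np.array([2.])))   # 0.5 * 1. * 2**2
    2.0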
38 """
40 def __init__(self, weights=1.):
41 self.weights = weights
42 self.alpha = 1.
44 def func(self, params):
45 """
46 A penalty function on a vector of parameters.
48 Parameters
49 ----------
50 params : array_like
51 A vector of parameters.
53 Returns
54 -------
55 A scalar penaty value; greater values imply greater
56 penalization.
57 """
58 raise NotImplementedError
60 def deriv(self, params):
61 """
62 The gradient of a penalty function.
64 Parameters
65 ----------
66 params : array_like
67 A vector of parameters
69 Returns
70 -------
71 The gradient of the penalty with respect to each element in
72 `params`.
73 """
74 raise NotImplementedError
76 def grad(self, params):
77 import warnings
78 warnings.warn('grad method is deprecated. Use `deriv` instead',
79 DeprecationWarning)
80 return self.deriv(params)
82 def _null_weights(self, params):
83 """work around for Null model
85 This will not be needed anymore when we can use `self._null_drop_keys`
86 as in DiscreteModels.
87 TODO: check other models
88 """
89 if np.size(self.weights) > 1:
90 if len(params) == 1:
91 raise # raise to identify models where this would be needed
92 return 0.
94 return self.weights


class NonePenalty(Penalty):
    """
    A penalty that does not penalize.
    """

    def __init__(self, **kwds):
        super().__init__()
        if kwds:
            warnings.warn('keyword arguments are ignored')

    def func(self, params):
        if params.ndim == 2:
            return np.zeros(params.shape[1:])
        else:
            return 0

    def deriv(self, params):
        return np.zeros(params.shape)

    def deriv2(self, params):
        # returns the diagonal of the hessian
        return np.zeros(params.shape[0])


class L2(Penalty):
    """
    The L2 (ridge) penalty.
    """

    def __init__(self, weights=1.):
        super().__init__(weights)

    def func(self, params):
        return np.sum(self.weights * self.alpha * params**2)

    def deriv(self, params):
        return 2 * self.weights * self.alpha * params

    def deriv2(self, params):
        return 2 * self.weights * self.alpha * np.ones(len(params))


class L2Univariate(Penalty):
    """
    The L2 (ridge) penalty applied to each parameter.
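
    Unlike `L2`, `func` returns one penalty value per parameter rather
    than their sum (illustrative sketch):

    >>> import numpy as np
    >>> pen = L2Univariate(weights=2.)
    >>> pen.func(np.array([1., 3.]))   # elementwise: weights * params**2
    array([ 2., 18.])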
143 """
145 def __init__(self, weights=None):
146 if weights is None:
147 self.weights = 1.
148 else:
149 self.weights = weights
151 def func(self, params):
152 return self.weights * params**2
154 def deriv(self, params):
155 return 2 * self.weights * params
157 def deriv2(self, params):
158 return 2 * self.weights * np.ones(len(params))


class PseudoHuber(Penalty):
    """
    The pseudo-Huber penalty.
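
    As implemented in `func` below, the penalty for a single parameter x is

        dlt**2 * (sqrt(1 + (x / dlt)**2) - 1),

    which is approximately quadratic in x for |x| much smaller than `dlt`
    and approximately linear with slope `dlt` for |x| much larger than
    `dlt`.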
164 """
166 def __init__(self, dlt, weights=1.):
167 super().__init__(weights)
168 self.dlt = dlt
170 def func(self, params):
171 v = np.sqrt(1 + (params / self.dlt)**2)
172 v -= 1
173 v *= self.dlt**2
174 return np.sum(self.weights * self.alpha * v, 0)
176 def deriv(self, params):
177 v = np.sqrt(1 + (params / self.dlt)**2)
178 return params * self.weights * self.alpha / v
180 def deriv2(self, params):
181 v = np.power(1 + (params / self.dlt)**2, -3/2)
182 return self.weights * self.alpha * v


class SCAD(Penalty):
    """
    The SCAD penalty of Fan and Li.

    The SCAD penalty is linear around zero, like an L1 penalty, up to the
    threshold tau, and constant for values larger than c * tau. The middle
    segment is quadratic and connects the two outer segments with a
    continuous derivative.
    The penalty is symmetric around zero.

    Parameterization follows Buu, Johnson, Li and Tan (2011).
    Fan and Li use lambda instead of tau, and a instead of c. Fan and Li
    recommend setting c = 3.7.

    f(x) = { tau |x|                                        if 0 <= |x| < tau
           { -(|x|^2 - 2 c tau |x| + tau^2) / (2 (c - 1))   if tau <= |x| < c tau
           { (c + 1) tau^2 / 2                              if c tau <= |x|

    Parameters
    ----------
    tau : float
        slope and threshold for the linear segment
    c : float
        factor for the second threshold, which is c * tau
    weights : None or array
        weights for the penalty of each parameter. If an entry is zero, then
        the corresponding parameter will not be penalized.

    References
    ----------
    Buu, Anne, Norman J. Johnson, Runze Li, and Xianming Tan. "New variable
    selection methods for zero-inflated count data with applications to the
    substance abuse field."
    Statistics in Medicine 30, no. 18 (2011): 2326-2340.

    Fan, Jianqing, and Runze Li. "Variable selection via nonconcave penalized
    likelihood and its oracle properties."
    Journal of the American Statistical Association 96, no. 456 (2001):
    1348-1360.
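
    Examples
    --------
    An illustrative sketch of the outer segments, with tau=1 and the
    default c=3.7:

    >>> import numpy as np
    >>> pen = SCAD(tau=1)
    >>> float(pen.func(np.array([0.5])))   # linear segment: tau * |x|
    0.5
    >>> float(pen.func(np.array([5.0])))   # constant: (c + 1) * tau**2 / 2
    2.35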
224 """
226 def __init__(self, tau, c=3.7, weights=1.):
227 super().__init__(weights)
228 self.tau = tau
229 self.c = c
231 def func(self, params):
233 # 3 segments in absolute value
234 tau = self.tau
235 p_abs = np.atleast_1d(np.abs(params))
236 res = np.empty(p_abs.shape, p_abs.dtype)
237 res.fill(np.nan)
238 mask1 = p_abs < tau
239 mask3 = p_abs >= self.c * tau
240 res[mask1] = tau * p_abs[mask1]
241 mask2 = ~mask1 & ~mask3
242 p_abs2 = p_abs[mask2]
243 tmp = (p_abs2**2 - 2 * self.c * tau * p_abs2 + tau**2)
244 res[mask2] = -tmp / (2 * (self.c - 1))
245 res[mask3] = (self.c + 1) * tau**2 / 2.
247 return (self.weights * res).sum(0)
249 def deriv(self, params):
251 # 3 segments in absolute value
252 tau = self.tau
253 p = np.atleast_1d(params)
254 p_abs = np.abs(p)
255 p_sign = np.sign(p)
256 res = np.empty(p_abs.shape)
257 res.fill(np.nan)
259 mask1 = p_abs < tau
260 mask3 = p_abs >= self.c * tau
261 mask2 = ~mask1 & ~mask3
262 res[mask1] = p_sign[mask1] * tau
263 tmp = p_sign[mask2] * (p_abs[mask2] - self.c * tau)
264 res[mask2] = -tmp / (self.c - 1)
265 res[mask3] = 0
267 return self.weights * res
269 def deriv2(self, params):
270 """Second derivative of function
272 This returns scalar or vector in same shape as params, not a square
273 Hessian. If the return is 1 dimensional, then it is the diagonal of
274 the Hessian.
275 """
277 # 3 segments in absolute value
278 tau = self.tau
279 p = np.atleast_1d(params)
280 p_abs = np.abs(p)
281 res = np.zeros(p_abs.shape)
283 mask1 = p_abs < tau
284 mask3 = p_abs >= self.c * tau
285 mask2 = ~mask1 & ~mask3
286 res[mask2] = -1 / (self.c - 1)
288 return self.weights * res


class SCADSmoothed(SCAD):
    """
    The SCAD penalty of Fan and Li, quadratically smoothed around zero.

    This follows Fan and Li 2001 equation (3.7).

    Parameterization follows Buu, Johnson, Li and Tan (2011);
    see the docstring of SCAD.

    Parameters
    ----------
    tau : float
        slope and threshold for the linear segment
    c : float
        factor for the second threshold
    c0 : float
        threshold for the quadratically smoothed segment
    restriction : None or array
        linear transformation applied to the parameters before the penalty
        is evaluated, as in `ConstraintsPenalty`

    Notes
    -----
    TODO: Use delegation instead of subclassing, so smoothing can be added to
    all penalty classes.
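
    Examples
    --------
    Illustrative sketch: the penalty is quadratic below ``c0``, matches the
    (shifted) SCAD penalty above ``c0``, and is zero at zero:

    >>> import numpy as np
    >>> pen = SCADSmoothed(tau=1, c0=0.1)
    >>> float(pen.func(np.array([0.])))
    0.0
    >>> float(pen.func(np.array([0.1])))   # == aq2 * c0**2, continuous at c0
    0.05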
315 """
317 def __init__(self, tau, c=3.7, c0=None, weights=1., restriction=None):
318 super().__init__(tau, c=c, weights=weights)
319 self.tau = tau
320 self.c = c
321 self.c0 = c0 if c0 is not None else tau * 0.1
322 if self.c0 > tau:
323 raise ValueError('c0 cannot be larger than tau')
325 # get coefficients for quadratic approximation
326 c0 = self.c0
327 # need to temporarily override weights for call to super
328 weights = self.weights
329 self.weights = 1.
330 deriv_c0 = super(SCADSmoothed, self).deriv(c0)
331 value_c0 = super(SCADSmoothed, self).func(c0)
332 self.weights = weights
334 self.aq1 = value_c0 - 0.5 * deriv_c0 * c0
335 self.aq2 = 0.5 * deriv_c0 / c0
336 self.restriction = restriction
338 def func(self, params):
339 # workaround for Null model
340 weights = self._null_weights(params)
341 # TODO: `and np.size(params) > 1` is hack for llnull, need better solution
342 if self.restriction is not None and np.size(params) > 1:
343 params = self.restriction.dot(params)
344 # need to temporarily override weights for call to super
345 # Note: we have the same problem with `restriction`
346 self_weights = self.weights
347 self.weights = 1.
348 value = super(SCADSmoothed, self).func(params[None, ...])
349 self.weights = self_weights
351 # shift down so func(0) == 0
352 value -= self.aq1
353 # change the segment corrsponding to quadratic approximation
354 p_abs = np.atleast_1d(np.abs(params))
355 mask = p_abs < self.c0
356 p_abs_masked = p_abs[mask]
357 value[mask] = self.aq2 * p_abs_masked**2
359 return (weights * value).sum(0)
361 def deriv(self, params):
362 # workaround for Null model
363 weights = self._null_weights(params)
364 if self.restriction is not None and np.size(params) > 1:
365 params = self.restriction.dot(params)
366 # need to temporarily override weights for call to super
367 self_weights = self.weights
368 self.weights = 1.
369 value = super(SCADSmoothed, self).deriv(params)
370 self.weights = self_weights
372 #change the segment corrsponding to quadratic approximation
373 p = np.atleast_1d(params)
374 mask = np.abs(p) < self.c0
375 value[mask] = 2 * self.aq2 * p[mask]
377 if self.restriction is not None and np.size(params) > 1:
378 return weights * value.dot(self.restriction)
379 else:
380 return weights * value
382 def deriv2(self, params):
383 # workaround for Null model
384 weights = self._null_weights(params)
385 if self.restriction is not None and np.size(params) > 1:
386 params = self.restriction.dot(params)
387 # need to temporarily override weights for call to super
388 self_weights = self.weights
389 self.weights = 1.
390 value = super(SCADSmoothed, self).deriv2(params)
391 self.weights = self_weights
393 # change the segment corrsponding to quadratic approximation
394 p = np.atleast_1d(params)
395 mask = np.abs(p) < self.c0
396 value[mask] = 2 * self.aq2
398 if self.restriction is not None and np.size(params) > 1:
399 # note: super returns 1d array for diag, i.e. hessian_diag
400 # TODO: weights are missing
401 return (self.restriction.T * (weights * value)
402 ).dot(self.restriction)
403 else:
404 return weights * value


class ConstraintsPenalty(object):
    """
    Penalty applied to a linear transformation of the parameters

    Parameters
    ----------
    penalty : instance of penalty function
        currently this requires an instance of a univariate, vectorized
        penalty class
    weights : None or ndarray
        weights for adding the penalties of the transformed params
    restriction : None or ndarray
        If it is not None, then restriction defines a linear transformation
        of the parameters. The penalty function is applied to each transformed
        parameter independently.

    Notes
    -----
    `restriction` allows us to impose penalization on contrasts or stochastic
    constraints of the original parameters.
    Examples of such contrasts are difference penalties or all-pairs
    penalties.
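
    Examples
    --------
    An illustrative sketch that penalizes the first differences of three
    parameters with a univariate ridge penalty:

    >>> import numpy as np
    >>> restr = np.array([[-1., 1., 0.],
    ...                   [0., -1., 1.]])
    >>> pen = ConstraintsPenalty(L2Univariate(), restriction=restr)
    >>> float(pen.func(np.array([1., 2., 4.])))   # (2-1)**2 + (4-2)**2
    5.0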
429 """
431 def __init__(self, penalty, weights=None, restriction=None):
433 self.penalty = penalty
434 if weights is None:
435 self.weights = 1.
436 else:
437 self.weights = weights
439 if restriction is not None:
440 restriction = np.asarray(restriction)
442 self.restriction = restriction
444 def func(self, params):
445 """evaluate penalty function at params
447 Parameter
448 ---------
449 params : ndarray
450 array of parameters at which derivative is evaluated
452 Returns
453 -------
454 deriv2 : ndarray
455 value(s) of penalty function
456 """
457 # TODO: `and np.size(params) > 1` is hack for llnull, need better solution
458 # Is this still needed? it seems to work without
459 if self.restriction is not None:
460 params = self.restriction.dot(params)
462 value = self.penalty.func(params)
464 return (self.weights * value.T).T.sum(0)
466 def deriv(self, params):
467 """first derivative of penalty function w.r.t. params
469 Parameter
470 ---------
471 params : ndarray
472 array of parameters at which derivative is evaluated
474 Returns
475 -------
476 deriv2 : ndarray
477 array of first partial derivatives
478 """
479 if self.restriction is not None:
480 params = self.restriction.dot(params)
482 value = self.penalty.deriv(params)
484 if self.restriction is not None:
485 return self.weights * value.T.dot(self.restriction)
486 else:
487 return (self.weights * value.T)
489 grad = deriv
491 def deriv2(self, params):
492 """second derivative of penalty function w.r.t. params
494 Parameter
495 ---------
496 params : ndarray
497 array of parameters at which derivative is evaluated
499 Returns
500 -------
501 deriv2 : ndarray, 2-D
502 second derivative matrix
503 """
505 if self.restriction is not None:
506 params = self.restriction.dot(params)
508 value = self.penalty.deriv2(params)
510 if self.restriction is not None:
511 # note: univariate penalty returns 1d array for diag,
512 # i.e. hessian_diag
513 v = (self.restriction.T * value * self.weights)
514 value = v.dot(self.restriction)
515 else:
516 value = np.diag(self.weights * value)
518 return value


class L2ContraintsPenalty(ConstraintsPenalty):
    """convenience subclass of ConstraintsPenalty with L2 penalization
    """

    def __init__(self, weights=None, restriction=None, sigma_prior=None):

        if sigma_prior is not None:
            raise NotImplementedError('sigma_prior is not implemented yet')

        penalty = L2Univariate()

        super(L2ContraintsPenalty, self).__init__(penalty, weights=weights,
                                                  restriction=restriction)


class CovariancePenalty(object):

    def __init__(self, weight):
        # weight should be scalar
        self.weight = weight

    def func(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A scalar penalty value
        """
        raise NotImplementedError

    def deriv(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A vector containing the gradient of the penalty
        with respect to each element in the lower triangle
        of `mat`.
        """
        raise NotImplementedError

    def grad(self, mat, mat_inv):
        warnings.warn('grad method is deprecated. Use `deriv` instead',
                      DeprecationWarning)
        return self.deriv(mat, mat_inv)


class PSD(CovariancePenalty):
    """
    A penalty that converges to +infinity as the argument matrix
    approaches the boundary of the domain of symmetric, positive
    definite matrices.
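
    As implemented in `func` below, the penalty equals
    -weight * log(det(mat)), computed via the Cholesky factor of `mat`,
    and is +inf when `mat` is not positive definite (illustrative sketch):

    >>> import numpy as np
    >>> pen = PSD(weight=1.)
    >>> pen.func(-np.eye(2), -np.eye(2))   # not positive definite
    inf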
586 """
588 def func(self, mat, mat_inv):
589 try:
590 cy = np.linalg.cholesky(mat)
591 except np.linalg.LinAlgError:
592 return np.inf
593 return -2 * self.weight * np.sum(np.log(np.diag(cy)))
595 def deriv(self, mat, mat_inv):
596 cy = mat_inv.copy()
597 cy = 2*cy - np.diag(np.diag(cy))
598 i,j = np.tril_indices(mat.shape[0])
599 return -self.weight * cy[i,j]