Coverage for /Users/sebastiana/Documents/Sugarpills/confidence/spotify_confidence/analysis/frequentist/sequential_bound_solver.py: 16%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

166 statements  

1# Copyright 2017-2020 Spotify AB 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14 

15import numpy as np 

16import pandas 

17from scipy.stats import norm 

18 

19 

20def _alphas(alpha: np.array, phi: float, t: np.array): 

21 """Alpha spending function.""" 

22 pe = np.zeros(len(t)) 

23 pd = np.zeros(len(t)) 

24 for j, tt in enumerate(t): 

25 pe[j] = alpha * np.power(tt, phi) 

26 pd[j] = pe[j] if j == 0 else pe[j] - pe[j - 1] 

27 return pe, pd 

28 

29 

30def _qp(xq: float, last: float, nints: int, yam1: float, ybm1: float, stdv: float): 

31 hlast = (ybm1 - yam1) / nints 

32 grid = np.linspace(yam1, ybm1, nints + 1) 

33 fun = last * norm.cdf(grid, xq, stdv) 

34 qp = 0.5 * hlast * (2 * np.sum(fun) - fun[0] - fun[len(fun) - 1]) 

35 return qp 

36 

37 

def _bsearch(
    last: np.ndarray,
    nints: int,
    pd: float,
    stdv: float,
    ya: float,
    yb: float,
) -> float:
    """
    Search for the value ``uppr`` for which ``_qp(uppr, last, nints, ya, yb, stdv)`` equals ``pd`` within a
    tolerance of 1e-7.

    The search starts at ``yb`` and moves in steps of 1. Whenever the target has been passed, or 50 steps were
    taken without passing it, the step size is divided by 10 and the direction is chosen anew.

    Note: function signature slightly modified in comparison to R implementation (which takes complete nints
    array instead of scalar), but should be semantically equivalent

    :return: the value of ``uppr`` at which the search stopped
    :raises RuntimeError: if the step size has become too small to change ``uppr`` while the tolerance is still
        not met. Without this check the search would never terminate in that situation.
    """
    max_iter = 50
    tol = 1e-7
    de = 10
    uppr = yb
    q = _qp(uppr, last, nints, ya, yb, stdv)
    while abs(q - pd) > tol:
        de = de / 10
        # Move up while q is still above the target, down otherwise.
        incr = 1 if q > (pd + tol) else -1
        if uppr + incr * de == uppr:
            # A step this small no longer moves uppr, so q can never change again.
            raise RuntimeError(f"Error in search: not converging (q={q}, pd={pd}, uppr={uppr})")
        for _ in range(max_iter):
            uppr = uppr + incr * de
            q = _qp(uppr, last, nints, ya, yb, stdv)
            if (incr == 1 and q <= (pd + tol)) or (incr == -1 and q >= (pd - tol)):
                # Target reached or passed: continue with a finer step.
                break
    return uppr

71 

72 

73_NORM_CONSTANT = 1 / np.sqrt(2 * np.pi) 

74 

75 

76def _fast_norm_pdf_prescaled(x: np.array, scale): 

77 norm_constant2 = _NORM_CONSTANT / scale 

78 pdf_val = norm_constant2 * np.exp(-0.5 * np.power(x, 2)) 

79 return pdf_val 

80 

81 

def _fcab(last: np.array, nints: int, yam1: float, h: float, x: np.array, stdv: float):
    """Trapezoidal integration over a grid of ``nints + 1`` points, done for every entry of ``x`` at once.

    The grid starts at ``yam1`` and has spacing ``h``. For each entry of ``x`` the integrand is ``last`` times
    ``_fast_norm_pdf_prescaled((grid - x) / stdv, stdv)``.

    :return: array with one integral per entry of ``x``
    """
    grid = h * np.linspace(0, nints, nints + 1) + yam1
    X, Y = np.meshgrid(x / stdv, grid / stdv)
    density = _fast_norm_pdf_prescaled(Y - X, stdv)

    # Repeat `last` so that every column (one per entry of x) carries the same weights.
    weights = np.tile(last, len(x)).reshape(len(x), nints + 1).T

    f = weights * density
    # Trapezoidal rule along the grid axis: first and last grid rows only count half.
    return 0.5 * h * (2 * f.sum(0) - f[0, :] - f[nints, :])

91 

92 

93# TODO use dataclass as soon as stets was migrated to Python 3.7 

class ComputationState:
    """
    Internal state that can be fed into bounds(...). Whenever the internal state changes, a new ComputationState
    object will be created.

    It is not intended that code of other packages operates on the attributes of this class because the internal
    structure may be changed anytime.
    """

    def __init__(self, df: pandas.DataFrame, last_fcab: np.array):
        """
        :param df: dataframe of the computation so far; its "zb" column must not contain nulls
        :param last_fcab: fcab calculation referring to the last row of df; may only be None if df is empty
        :raises ValueError: if df is None, if the "zb" column contains nulls, or if df has rows but last_fcab
            is None
        """
        if df is None:
            raise ValueError("df must not be None")
        if df["zb"].isnull().any():
            raise ValueError('df must not contain null values in column "zb"')
        if len(df) > 0 and last_fcab is None:
            raise ValueError("last_fcab must not be None if df is not empty")

        self._df = df
        self._last_fcab = last_fcab

    @property
    def df(self):
        # copy to avoid side effects
        return self._df.copy()

    @property
    def last_fcab(self):
        """fcab calculation referring to the last row of df"""

        # copy to avoid side effects
        return None if self._last_fcab is None else np.copy(self._last_fcab)

    def __eq__(self, other):
        if not isinstance(other, ComputationState):
            # Let Python try the reflected comparison; `==` still evaluates to False for foreign types.
            return NotImplemented
        return self._df.equals(other._df) and np.array_equal(self._last_fcab, other._last_fcab)

    # Defining __eq__ already makes instances unhashable; stated explicitly for readers.
    __hash__ = None

126 

127 

def landem(
    t: np.array,
    alpha: float,
    phi: float,
    ztrun: float,
    state: ComputationState,
    max_nints: int = None,
):
    """
    This function is a Python implementation of landem.R of ldbounds package.
    https://cran.r-project.org/web/packages/ldbounds/index.html
    Source code of that landem.R: https://github.com/cran/ldbounds/blob/master/R/landem.R

    After making any changes, please run test_compare_with_ldbounds.py to gain confidence that functionality is
    not broken.

    :param t: Monotonically increasing information ratios
    :param alpha: corrected alpha (other than R implementation, we do not modify alpha based on number of sides)
    :param phi: exponent used by alpha-spending function
    :param ztrun: max value for truncating bounds
    :param state: state to build the computation upon
    :param max_nints: max value that internal nints parameter can take. Limiting this value reduces accuracy of the
        calculation but can lead to crucial performance improvement
    :return: A tuple (dataframe, state). In the dataframe, the "zb" column contains the bounds and the i-th row
        reflects the results for the i-th information ratio from t. If t is not longer than the dataframe of the
        given state, the first len(t) rows of that dataframe and the given state itself are returned.
    """

    df = state.df  # reading the property will copy the df to avoid side effects
    last_fcab = state.last_fcab

    if len(t) <= len(df):
        # Simply return complete state and the existing result
        return df.iloc[: len(t)], state

    # We reindex because appending rows *individually* to a DataFrame is expensive
    df = df.reindex(range(len(t)))

    h = 0.05
    zninf = -ztrun
    tol = 1e-7

    # Note: ldbounds:::bounds() rescales t by t.max() by default. We omit this because impact on bounds unclear

    if df.isnull().all().all():
        # start at index 0 if df was not yet initialized
        start = 0
    else:
        # start at the first index where "zb" column is null (or at the very end if all "zb" values are not null)
        zb_null_arr = np.where(df["zb"].isnull())
        start = zb_null_arr[0][0] - 1 if len(zb_null_arr[0]) > 0 else len(df) - 1

    rangestart = start + 1
    if start == 0:
        df.loc[0, "stdv"] = np.sqrt(t[0])

    df.loc[start + 1 : len(t), "stdv"] = np.sqrt(t[start + 1 : len(t)] - t[start : len(t) - 1])

    df["pe"], df["pd"] = _alphas(alpha, phi, t)
    df.loc[start:, "sdproc"] = np.sqrt(t[start:])
    df.loc[start:, "information_ratio"] = t[start:]

    # True if the first column of df holds no value at all. `.iloc[0]` selects that column by position; a plain
    # `[0]` on this string-labelled Series relies on a deprecated positional fallback.
    if df.isnull().all(axis=0).iloc[0]:
        # this needs to be done only to compute the very first row
        if df.at[start, "pd"] < 1:
            df.at[start, "zb"] = norm.ppf(1 - df.at[start, "pd"])
            if df.at[start, "zb"] > ztrun:
                df.at[start, "zb"] = ztrun
                df.at[start, "pd"] = 1 - norm.cdf(df.at[start, "zb"])
                df.at[start, "pe"] = df.at[start, "pd"]
                if len(t) > 1:
                    df.at[1, "pd"] = df.at[start + 1, "pe"] - df.at[start, "pe"]
            df.at[start, "yb"] = df.at[start, "zb"] * df.at[start, "stdv"]

        df.at[start, "za"] = zninf
        df.at[start, "ya"] = df.at[start, "za"] * df.at[start, "stdv"]
        df.at[start, "nints"] = np.ceil((df.at[start, "yb"] - df.at[start, "ya"]) / (h * df.at[start, "stdv"]))

        grid = np.linspace(
            df.at[start, "ya"],
            df.at[start, "yb"],
            int(df.at[start, "nints"] + 1),
        )
        scaled_x = grid / df.at[start, "stdv"]
        last_fcab = _fast_norm_pdf_prescaled(scaled_x, df.at[start, "stdv"])

    # rangestart is at least 1, so this loop does not run when t has a single entry
    for i in range(rangestart, len(t)):
        if df.at[i, "information_ratio"] - df.at[i - 1, "information_ratio"] <= 1e-5:
            # If information ratio difference between time steps is 0, re-use result calculated for the previous
            # time step. Normally, it means that no data was added. We have to catch this case because nints
            # becomes float("inf") and makes the procedure crash. We check against 1e-5 instead of against 0
            # because an almost-zero information gain can cause pretty big numerical inaccuracy in practice.
            df.iloc[i] = df.iloc[i - 1]
            continue

        # Possible error in spending function. May be due to truncation.
        if df.at[i, "pd"] != 1.0:
            df.at[i, "pd"] = df.at[i, "pe"] - df.at[i - 1, "pe"]
        df.at[i, "pd"] = np.clip(df.at[i, "pd"], 0, 1)

        if df.at[i, "pd"] < tol:
            df.at[i, "zb"] = -zninf
            # NOTE(review): zninf is -ztrun, so zb equals ztrun here and this branch looks unreachable; kept
            # for parity with landem.R.
            if df.at[i, "zb"] > ztrun:
                df.at[i, "zb"] = ztrun
                df.at[i, "pd"] = _qp(
                    df.at[i, "zb"] * df.at[i, "sdproc"],
                    last_fcab,
                    int(df.at[i - 1, "nints"]),  # stored as float in df, but used as a number of grid points
                    df.at[i - 1, "ya"],
                    df.at[i - 1, "yb"],
                    df.at[i, "stdv"],
                )
                df.at[i, "pe"] = df.at[i, "pd"] + df.at[i - 1, "pe"]

            df.at[i, "yb"] = df.at[i, "zb"] * df.at[i, "sdproc"]
        elif df.at[i, "pd"] == 1.0:
            df.at[i, "zb"] = 0.0
            df.at[i, "yb"] = 0.0
        elif tol <= df.at[i, "pd"] < 1:
            df.at[i, "yb"] = _bsearch(
                last_fcab,
                int(df.at[i - 1, "nints"]),  # differs from R because we modified signature of bsearch
                df.at[i, "pd"],
                df.at[i, "stdv"],
                df.at[i - 1, "ya"],
                df.at[i - 1, "yb"],
            )

            df.at[i, "zb"] = df.at[i, "yb"] / df.at[i, "sdproc"]

            if df.at[i, "zb"] > ztrun:
                df.at[i, "zb"] = ztrun
                df.at[i, "pd"] = _qp(
                    df.at[i, "zb"] * df.at[i, "sdproc"],
                    last_fcab,
                    int(df.at[i - 1, "nints"]),
                    df.at[i - 1, "ya"],
                    df.at[i - 1, "yb"],
                    df.at[i, "stdv"],
                )
                df.at[i, "pe"] = df.at[i, "pd"] + df.at[i - 1, "pe"]

            df.at[i, "yb"] = df.at[i, "zb"] * df.at[i, "sdproc"]

        # in landem.R, the following two statements are in side==1 if clause
        df.at[i, "ya"] = zninf * df.at[i, "sdproc"]
        df.at[i, "za"] = zninf

        nints_calc = np.ceil((df.at[i, "yb"] - df.at[i, "ya"]) / (h * df.at[i, "stdv"]))
        df.at[i, "nints"] = nints_calc if max_nints is None or nints_calc < max_nints else max_nints

        # The R implementation skips the following for the last row. We compute it for every row because
        # that calculation will be required if landem() is called again with df used as a starting point.
        hlast = (df.at[i - 1, "yb"] - df.at[i - 1, "ya"]) / df.at[i - 1, "nints"]
        x = np.linspace(
            df.at[i, "ya"],
            df.at[i, "yb"],
            int(df.at[i, "nints"] + 1),
        )
        last_fcab = _fcab(last_fcab, int(df.at[i - 1, "nints"]), df.at[i - 1, "ya"], hlast, x, df.at[i, "stdv"])

    return df, ComputationState(df, last_fcab)

293 

294 

295# Simple type to return results in a structured way 

class CalculationResult:
    """Pairs the dataframe of a bounds calculation with the state belonging to it."""

    def __init__(self, df: pandas.DataFrame, state: ComputationState):
        self._df, self._state = df, state

    @property
    def df(self):
        """The dataframe handed to the constructor (returned as is, not copied)."""
        return self._df

    @property
    def bounds(self):
        """Values of the "zb" column of the dataframe."""
        zb_column = self._df["zb"]
        return zb_column.values

    @property
    def state(self):
        """The state handed to the constructor."""
        return self._state

312 

313 

# Columns of the dataframe inside EMPTY_STATE
columns = [
    "za",
    "zb",
    "ya",
    "yb",
    "pe",
    "pd",
    "stdv",
    "sdproc",
    "nints",
    "information_ratio",
]

# Initial state to be fed into bounds() to calculate sequential bounds from scratch
EMPTY_STATE = ComputationState(
    df=pandas.DataFrame(index=None, columns=columns, dtype=float),
    last_fcab=None,
)

318 

319 

def bounds(
    t: np.array,
    alpha: float,
    rho: float,
    ztrun: float,
    sides: int,
    state: ComputationState = EMPTY_STATE,
    max_nints=None,
) -> CalculationResult:
    """
    Validate the input, run landem() and check the last bound against the fixed horizon bound.

    See landem() for parameter explanation. ``rho`` is handed to landem() as ``phi``, and ``alpha`` is divided
    by ``sides`` before it is handed over.

    :param sides: must be 1 or 2
    :param state: state to build the computation upon; None is treated like EMPTY_STATE
    :return: CalculationResult holding the dataframe and the new state returned by landem()
    :raises ValueError: if t contains a zero, if t is not monotonically increasing, or if sides is neither
        1 nor 2
    :raises Exception: if the last bound is smaller than the fixed horizon bound ``norm.ppf(1 - alpha / sides)``
    """

    # Resolve a missing state first: get_input_str() reads state.df and state.last_fcab, so validation errors
    # raised for state=None would otherwise surface as AttributeError.
    if state is None:
        state = EMPTY_STATE

    def get_input_str():
        return (
            f"input params: t={t}, alpha={alpha}, sides={sides}, rho={rho}, ztrun={ztrun},"
            f"state_df={state.df.to_json()}, state_fcab={state.last_fcab}, max_nints={max_nints}"
        )

    if any(t == 0.0):
        raise ValueError(f"Information ratio must not be zero, {get_input_str()}")
    if any(earlier > later for earlier, later in zip(t, t[1:])):
        raise ValueError(f"Information ratio must be monotonically increasing, {get_input_str()}")
    if sides not in (1, 2):
        raise ValueError(f"sides must be either one or two, {get_input_str()}")

    alph = alpha / sides

    df_result, new_state = landem(t, alph, rho, ztrun, state, max_nints)

    # guardrail check
    fixed_horizon_bound = norm.ppf(1 - alph)
    last_sequential_bound = df_result["zb"].values[-1]
    if fixed_horizon_bound > last_sequential_bound:
        raise Exception(
            f"Last bound ({last_sequential_bound}) is less conservative than fixed horizon bound "
            f"({fixed_horizon_bound}), {get_input_str()} "
        )

    return CalculationResult(df_result, new_state)