1# Copyright 2017-2020 Spotify AB
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import numpy as np
16import pandas
17from scipy.stats import norm
18
19
def _alphas(alpha: float, phi: float, t: np.array):
    """
    Power-family alpha spending function.

    :param alpha: total alpha that is spent once the information ratio reaches 1
    :param phi: exponent of the spending function
    :param t: information ratios at which the spending function is evaluated
    :return: tuple ``(pe, pd)`` of arrays with one entry per element of ``t``; ``pe[j] = alpha * t[j] ** phi`` and
             ``pd[j] = pe[j] - pe[j - 1]`` with ``pd[0] = pe[0]``
    """
    # Evaluate the whole spending curve in one vectorized expression instead of a Python-level loop
    pe = alpha * np.power(np.asarray(t, dtype=float), phi)

    # The increments are the first differences of pe; the first increment is pe[0] itself
    pd = pe.copy()
    pd[1:] -= pe[:-1]
    return pe, pd
28
29
def _qp(xq: float, last: np.array, nints: int, yam1: float, ybm1: float, stdv: float):
    """
    Trapezoidal-rule integral of ``last * norm.cdf(grid, xq, stdv)`` over ``[yam1, ybm1]``.

    :param xq: location parameter passed to the normal CDF
    :param last: weights of the integrand, one value per grid point
    :param nints: number of integration intervals; the grid has ``nints + 1`` equally spaced points. Integral-valued
                  floats are accepted as well
    :param yam1: lower end of the integration grid
    :param ybm1: upper end of the integration grid
    :param stdv: scale parameter passed to the normal CDF
    :return: value of the trapezoidal sum
    """
    # nints is kept in a float dataframe column by landem(); np.linspace requires an integer number of points
    nints = int(nints)
    hlast = (ybm1 - yam1) / nints
    grid = np.linspace(yam1, ybm1, nints + 1)
    fun = last * norm.cdf(grid, xq, stdv)
    # Trapezoidal rule: interior points count twice, the two end points once
    return 0.5 * hlast * (2 * np.sum(fun) - fun[0] - fun[-1])
36
37
def _bsearch(
    last: np.array,
    nints: int,
    pd: float,
    stdv: float,
    ya: float,
    yb: float,
) -> np.array:
    """
    Search for the bound at which _qp(bound, last, nints, ya, yb, stdv) equals ``pd`` up to a tolerance of 1e-7.

    The search starts at ``yb`` and walks towards the target with a fixed step. Each pass takes at most 50 steps
    and stops early once the target has been reached or passed; the next pass then continues from the current
    position with a step that is ten times smaller.

    Note: function signature slightly modified in comparison to R implementation (which takes complete nints
    array instead of scalar), but should be semantically equivalent

    :return: the bound at which the search stopped
    """
    tol = 1e-7
    steps_per_pass = 50
    step = 10
    bound = yb
    q = _qp(bound, last, nints, ya, yb, stdv)

    while abs(q - pd) > tol:
        step = step / 10
        # Walk upwards while q is above the target, downwards otherwise
        direction = 1 if q > (pd + tol) else -1
        for _ in range(steps_per_pass):
            bound = bound + direction * step
            q = _qp(bound, last, nints, ya, yb, stdv)
            if direction == 1:
                passed_target = q <= (pd + tol)
            else:
                passed_target = q >= (pd - tol)
            if passed_target:
                break

    return bound
71
72
# Normalising factor 1 / sqrt(2 * pi) of the standard normal density, evaluated once at import time
_NORM_CONSTANT = 1.0 / np.sqrt(2.0 * np.pi)
74
75
def _fast_norm_pdf_prescaled(x: np.array, scale):
    """
    Normal density for arguments that were already divided by ``scale``.

    :param x: values at which the density is evaluated, already divided by ``scale``
    :param scale: divisor applied to the normalisation constant
    :return: ``_NORM_CONSTANT / scale * exp(-x ** 2 / 2)`` evaluated element-wise
    """
    kernel = np.exp(-0.5 * np.power(x, 2))
    return (_NORM_CONSTANT / scale) * kernel
80
81
def _fcab(last: np.array, nints: int, yam1: float, h: float, x: np.array, stdv: float):
    """
    Trapezoidal integration of ``last`` against a normal density, evaluated for every entry of ``x``.

    :param last: weights at the ``nints + 1`` integration nodes
    :param nints: number of integration intervals
    :param yam1: position of the first integration node
    :param h: distance between two neighbouring integration nodes
    :param x: points for which the integral is computed
    :param stdv: scale of the normal density
    :return: array with one integral value per entry of ``x``
    """
    nodes = (h * np.linspace(0, nints, nints + 1) + yam1) / stdv
    targets = x / stdv

    # Rows follow the integration nodes, columns follow the entries of x
    offsets = nodes[:, np.newaxis] - targets[np.newaxis, :]
    density = _fast_norm_pdf_prescaled(offsets, stdv)

    # Repeat the node weights so that every column carries a full copy of `last`
    weights = np.tile(last, len(x)).reshape(len(x), nints + 1).T
    f = weights * density

    # Trapezoidal rule along the node axis: interior rows count twice, first and last row once
    return 0.5 * h * (2 * f.sum(0) - f[0, :] - f[nints, :])
91
92
93# TODO use dataclass as soon as stets was migrated to Python 3.7
class ComputationState:
    """
    Internal state that can be fed into bounds(...). Whenever the internal state changes, a new ComputationState object
    will be created.

    It is not intended that other packages code operates on the attributes of this class because the internal
    structure may be changed anytime.
    """

    def __init__(self, df: pandas.DataFrame, last_fcab: np.array):
        """
        :param df: dataframe holding the rows computed so far; its "zb" column must not contain null values
        :param last_fcab: fcab calculation referring to the last row of df; may only be None if df is empty
        :raises ValueError: if df is None, if the "zb" column contains null values, or if df is not empty while
                            last_fcab is None
        """
        if df is None:
            raise ValueError("df must not be None")
        if df["zb"].isnull().any():
            raise ValueError('df must not contain null values in the "zb" column')
        if len(df) > 0 and last_fcab is None:
            raise ValueError("last_fcab must not be None when df is not empty")

        # Copy on construction as well so that later changes made by the caller do not leak into the state
        self._df = df.copy()
        self._last_fcab = None if last_fcab is None else np.copy(last_fcab)

    @property
    def df(self):
        """Copy of the dataframe held by this state."""
        # copy to avoid side effects
        return self._df.copy()

    @property
    def last_fcab(self):
        """fcab calculation referring to the last row of df"""

        # copy to avoid side effects
        return None if self._last_fcab is None else np.copy(self._last_fcab)

    def __eq__(self, other):
        """Two states are equal if both their dataframes and their last_fcab values are equal."""
        if isinstance(other, ComputationState):
            return self._df.equals(other._df) and np.array_equal(self._last_fcab, other._last_fcab)
        return False
126
127
def landem(
    t: np.array,
    alpha: float,
    phi: float,
    ztrun: float,
    state: ComputationState,
    max_nints: int = None,
):
    """
    This function is a Python implementation of landem.R of ldbounds package.
    https://cran.r-project.org/web/packages/ldbounds/index.html
    Source code of that landem.R: https://github.com/cran/ldbounds/blob/master/R/landem.R

    After making any changes, please run test_compare_with_ldbounds.py to gain confidence that functionality is
    not broken.

    :param t: Monotonically increasing information ratios
    :param alpha: corrected alpha (other than R implementation, we do not modify alpha based on number of sides)
    :param phi: exponent used by alpha-spending function
    :param ztrun: max value for truncating bounds
    :param state: state to build the computation upon
    :param max_nints: max value that internal nints parameter can take. Limiting this value reduces accuracy of the
                      calculation but can lead to crucial performance improvement
    :return: A tuple ``(df, state)``. ``df`` is a dataframe where the "zb" column contains the bounds and the i-th
             row reflects the results for the i-th information ratio from t. ``state`` is the ComputationState
             belonging to that result; if ``t`` is not longer than the dataframe of the given state, the given
             state is returned unchanged
    """

    df = state.df  # reading the property will copy the df to avoid side effects
    last_fcab = state.last_fcab

    if len(t) <= len(df):
        # Simply return complete state and the existing result
        return df.iloc[: len(t)], state

    # We reindex because appending rows *individually* to a DataFrame is expensive
    df = df.reindex(range(len(t)))

    h = 0.05
    zninf = -ztrun
    tol = 1e-7

    # t2 = t  # ldbounds:::bounds() rescales t2=t/t.max() by default. We omit this because impact on bounds unclear

    if df.isnull().all().all():
        # start at index 0 if df was not yet initialized
        start = 0
    else:
        # start at the first index where "zb" column is null (or at the very end if all "zb" values are not null)
        zb_null_arr = np.where(df["zb"].isnull())
        start = zb_null_arr[0][0] - 1 if len(zb_null_arr[0]) > 0 else len(df) - 1

    rangestart = start + 1
    if start == 0:
        df.loc[0, "stdv"] = np.sqrt(t[0])

    # For all later rows, stdv is the square root of the information gained since the previous row
    df.loc[start + 1 : len(t), "stdv"] = np.sqrt(t[start + 1 : len(t)] - t[start : len(t) - 1])

    df["pe"], df["pd"] = _alphas(alpha, phi, t)
    df.loc[start:, "sdproc"] = np.sqrt(t[start:])
    df.loc[start:, "information_ratio"] = t[start:]

    # The first column is still completely empty only if no row has been computed yet. It is selected by position
    # with .iloc because plain [0] on a Series labelled with column names relies on a deprecated fallback.
    if df.isnull().all(axis=0).iloc[0]:
        # this needs to be done only to compute the very first row
        if df.at[start, "pd"] < 1:
            df.at[start, "zb"] = norm.ppf(1 - df.at[start, "pd"])
            if df.at[start, "zb"] > ztrun:
                # Truncate the bound and recompute the alpha that is actually spent at the truncated bound
                df.at[start, "zb"] = ztrun
                df.at[start, "pd"] = 1 - norm.cdf(df.at[start, "zb"])
                df.at[start, "pe"] = df.at[start, "pd"]
                if len(t) > 1:
                    df.at[1, "pd"] = df.at[start + 1, "pe"] - df.at[start, "pe"]
            df.at[start, "yb"] = df.at[start, "zb"] * df.at[start, "stdv"]

        df.at[start, "za"] = zninf
        df.at[start, "ya"] = df.at[start, "za"] * df.at[start, "stdv"]
        df.at[start, "nints"] = np.ceil((df.at[start, "yb"] - df.at[start, "ya"]) / (h * df.at[start, "stdv"]))

        grid = np.linspace(
            df.at[start, "ya"],
            df.at[start, "yb"],
            int(df.at[start, "nints"] + 1),
        )
        scaled_x = grid / df.at[start, "stdv"]
        last_fcab = _fast_norm_pdf_prescaled(scaled_x, df.at[start, "stdv"])

    if len(t) >= 2:
        for i in range(rangestart, len(t)):
            if df.at[i, "information_ratio"] - df.at[i - 1, "information_ratio"] <= 1e-5:
                # If information ratio difference between time steps is 0, re-use result calculated for the previous
                # time step. Normally, it means that no data was added. We have to catch this case because nints
                # becomes float("inf") and makes the procedure crash. We check against 1e-5 instead of against 0
                # because an almost-zero information gain can cause pretty big numerical inaccuracy in practice.
                df.iloc[i] = df.iloc[i - 1]
                continue

            # Possible error in spending function.  May be due to truncation.
            if df.at[i, "pd"] != 1.0:
                df.at[i, "pd"] = df.at[i, "pe"] - df.at[i - 1, "pe"]
            df.at[i, "pd"] = df.at[i, "pd"].clip(0, 1)

            if df.at[i, "pd"] < tol:
                df.at[i, "zb"] = -zninf
                if df.at[i, "zb"] > ztrun:
                    df.at[i, "zb"] = ztrun
                    df.at[i, "pd"] = _qp(
                        df.at[i, "zb"] * df.at[i, "sdproc"],
                        last_fcab,
                        int(df.at[i - 1, "nints"]),  # nints lives in a float column, _qp needs an integer
                        df.at[i - 1, "ya"],
                        df.at[i - 1, "yb"],
                        df.at[i, "stdv"],
                    )
                    df.at[i, "pe"] = df.at[i, "pd"] + df.at[i - 1, "pe"]

                df.at[i, "yb"] = df.at[i, "zb"] * df.at[i, "sdproc"]
            elif df.at[i, "pd"] == 1.0:
                df.at[i, "zb"] = 0.0
                df.at[i, "yb"] = 0.0
            elif tol <= df.at[i, "pd"] < 1:

                df.at[i, "yb"] = _bsearch(
                    last_fcab,
                    int(df.at[i - 1, "nints"]),  # differs from R because we modified signature of bsearch
                    df.at[i, "pd"],
                    df.at[i, "stdv"],
                    df.at[i - 1, "ya"],
                    df.at[i - 1, "yb"],
                )

                df.at[i, "zb"] = df.at[i, "yb"] / df.at[i, "sdproc"]

                if df.at[i, "zb"] > ztrun:
                    df.at[i, "zb"] = ztrun
                    df.at[i, "pd"] = _qp(
                        df.at[i, "zb"] * df.at[i, "sdproc"],
                        last_fcab,
                        int(df.at[i - 1, "nints"]),
                        df.at[i - 1, "ya"],
                        df.at[i - 1, "yb"],
                        df.at[i, "stdv"],
                    )
                    df.at[i, "pe"] = df.at[i, "pd"] + df.at[i - 1, "pe"]

                df.at[i, "yb"] = df.at[i, "zb"] * df.at[i, "sdproc"]

            # in landem.R, the following two statements are in side==1 if clause
            df.at[i, "ya"] = zninf * df.at[i, "sdproc"]
            df.at[i, "za"] = zninf

            nints_calc = np.ceil((df.at[i, "yb"] - df.at[i, "ya"]) / (h * df.at[i, "stdv"]))
            df.at[i, "nints"] = nints_calc if max_nints is None or nints_calc < max_nints else max_nints

            if i < len(t):
                # in R implementation, i < len(t)-1. However we run until len(t) because that calculation will be
                # required if landem() is called again with df used as a starting point
                hlast = (df.at[i - 1, "yb"] - df.at[i - 1, "ya"]) / df.at[i - 1, "nints"]
                x = np.linspace(
                    df.at[i, "ya"],
                    df.at[i, "yb"],
                    int(df.at[i, "nints"] + 1),
                )
                last_fcab = _fcab(
                    last_fcab, int(df.at[i - 1, "nints"]), df.at[i - 1, "ya"], hlast, x, df.at[i, "stdv"]
                )
    return df, ComputationState(df, last_fcab)
293
294
295# Simple type to return results in a structured way
class CalculationResult:
    """Read-only bundle of a result dataframe and the computation state that belongs to it."""

    def __init__(self, df: pandas.DataFrame, state: ComputationState):
        self._state = state
        self._df = df

    @property
    def df(self):
        """The result dataframe as passed to the constructor."""
        return self._df

    @property
    def bounds(self):
        """Values of the "zb" column of the result dataframe."""
        zb_column = self._df["zb"]
        return zb_column.values

    @property
    def state(self):
        """The computation state as passed to the constructor."""
        return self._state
312
313
# Columns of the dataframe held by EMPTY_STATE
columns = [
    "za",
    "zb",
    "ya",
    "yb",
    "pe",
    "pd",
    "stdv",
    "sdproc",
    "nints",
    "information_ratio",
]

# Initial state to be fed into bounds() to calculate sequential bounds from scratch
EMPTY_STATE = ComputationState(
    df=pandas.DataFrame(index=None, columns=columns, dtype=float),
    last_fcab=None,
)
318
319
def bounds(
    t: np.array,
    alpha: float,
    rho: float,
    ztrun: float,
    sides: int,
    state: ComputationState = EMPTY_STATE,
    max_nints=None,
) -> CalculationResult:
    """
    Validate the input, run landem() and wrap its outcome in a CalculationResult.

    See landem() for parameter explanation. ``rho`` is passed to landem() as ``phi`` and ``alpha`` is divided by
    ``sides`` before it is passed on.

    :param sides: number of sides, must be 1 or 2
    :param state: state to build the computation upon; None is treated like EMPTY_STATE
    :return: CalculationResult holding the result dataframe and the new state
    :raises ValueError: if an information ratio is zero, if the information ratios are not monotonically
                        increasing, or if sides is neither 1 nor 2
    :raises Exception: if the last sequential bound is smaller than the fixed horizon bound
    """

    # Resolve a missing state first because get_input_str() reads it while building error messages
    if state is None:
        state = EMPTY_STATE

    def get_input_str():
        return (
            f"input params: t={t}, alpha={alpha}, sides={sides}, rho={rho}, ztrun={ztrun},"
            f"state_df={state.df.to_json()}, state_fcab={state.last_fcab}, max_nints={max_nints}"
        )

    if any(t == 0.0):
        raise ValueError(f"Information ratio must not be zero, {get_input_str()}")
    if any(t[i] > t[i + 1] for i in range(len(t) - 1)):
        raise ValueError(f"Information ratio must be monotonically increasing, {get_input_str()}")
    if not (sides == 1 or sides == 2):
        raise ValueError(f"sides must be either 1 or 2, {get_input_str()}")

    alph = alpha / sides

    df_result, new_state = landem(t, alph, rho, ztrun, state, max_nints)

    # guardrail check
    fixed_horizon_bound = norm.ppf(1 - alph)
    last_sequential_bound = df_result["zb"].values[-1]
    if fixed_horizon_bound > last_sequential_bound:
        raise Exception(
            f"Last bound ({last_sequential_bound}) is less conservative than fixed horizon bound "
            f"({fixed_horizon_bound}), {get_input_str()} "
        )

    return CalculationResult(df_result, new_state)