Coverage for src/duelboard/calculators/bootstrap.py: 100%
60 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 19:18 +0900
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-14 19:18 +0900
1"""Bootstrap Elo calculator for confidence intervals."""
3import numpy as np
4import pandas as pd
5from tqdm import tqdm
7from duelboard.models import Battle, EloRating
8from duelboard.types import RatingsDict
10from .base import EloCalculator
class BootstrapEloCalculator(EloCalculator):
    """Bootstrap Elo calculator that provides confidence intervals.

    Ratings are recomputed on ``n_bootstrap`` resampled battle sets. For each
    player the median of the resampled ratings is reported, together with the
    percentile interval implied by ``confidence_level``.
    """

    def __init__(
        self,
        k_factor: float = 4,
        scale: float = 400,
        base: float = 10,
        initial_rating: float = 1000,
        n_bootstrap: int = 1000,
        confidence_level: float = 0.95,
        random_seed: int | None = None,
    ) -> None:
        """Initialize the Bootstrap Elo calculator.

        Args:
            k_factor: K-factor for Elo updates
            scale: Scale parameter for Elo calculation
            base: Base for exponential calculation
            initial_rating: Initial rating for new players
            n_bootstrap: Number of bootstrap samples
            confidence_level: Confidence level for intervals (e.g., 0.95 for 95%)
            random_seed: Random seed for reproducibility. When given, the
                calculator draws from its own random state instead of
                re-seeding NumPy's global generator.
        """
        super().__init__(k_factor, scale, base, initial_rating)
        self.n_bootstrap = n_bootstrap
        self.confidence_level = confidence_level
        self.random_seed = random_seed

        # A private RandomState yields the same stream as a seeded global
        # generator, but without touching process-wide state, so unrelated
        # np.random users cannot disturb (or be disturbed by) this instance.
        # With no seed we keep drawing from the global generator as before.
        self._rng = (
            np.random.RandomState(random_seed) if random_seed is not None else None
        )

    def calculate(self, battles: list[Battle] | pd.DataFrame) -> RatingsDict:
        """Calculate Elo ratings with bootstrap confidence intervals.

        Args:
            battles: List of Battle objects or DataFrame with battles

        Returns:
            Dictionary mapping player names to EloRating objects with confidence intervals
        """
        if isinstance(battles, pd.DataFrame):
            battles = self._dataframe_to_battles(battles)

        bootstrap_results = []
        for _ in tqdm(range(self.n_bootstrap), desc="Bootstrap sampling"):
            sampled_battles = self._bootstrap_sample(battles)
            ratings = super().calculate(sampled_battles)
            bootstrap_results.append(
                {player: rating.rating for player, rating in ratings.items()}
            )

        # Tally battles per player in a single pass. The set collapses a
        # battle whose two sides are the same player into one count.
        battle_counts: dict = {}
        for battle in battles:
            for player in {battle.player_a, battle.player_b}:
                battle_counts[player] = battle_counts.get(player, 0) + 1

        return self._summarize_bootstrap(bootstrap_results, battle_counts)

    def _summarize_bootstrap(
        self,
        bootstrap_results: list[dict],
        battle_counts: dict,
    ) -> RatingsDict:
        """Collapse per-resample ratings into median ratings with intervals.

        Args:
            bootstrap_results: One ``{player: rating}`` mapping per resample
            battle_counts: Battle count to report for each player; players
                missing from the mapping are reported with 0 battles

        Returns:
            Dictionary mapping player names to EloRating objects
        """
        bootstrap_df = pd.DataFrame(bootstrap_results)

        # Two-sided percentile interval around the median.
        alpha = 1 - self.confidence_level
        lower_quantile = alpha / 2
        upper_quantile = 1 - alpha / 2

        results = {}
        for player in bootstrap_df.columns:
            samples = bootstrap_df[player]
            results[player] = EloRating(
                player=player,
                rating=samples.median(),
                confidence_interval=(
                    samples.quantile(lower_quantile),
                    samples.quantile(upper_quantile),
                ),
                battles=battle_counts.get(player, 0),
            )
        return results

    def _bootstrap_sample(self, battles: list[Battle]) -> list[Battle]:
        """Create a bootstrap sample of battles.

        Args:
            battles: Original list of battles

        Returns:
            Bootstrap sample of battles (with replacement), the same length
            as the input
        """
        n_battles = len(battles)
        if n_battles == 0:
            return []

        # Fall back to the global generator when no seed was supplied (or
        # when the instance was created without running __init__).
        rng = getattr(self, "_rng", None)
        source = np.random if rng is None else rng
        indices = source.choice(n_battles, size=n_battles, replace=True)
        return [battles[i] for i in indices]

    def calculate_even_sample(
        self,
        battles: list[Battle] | pd.DataFrame,
        n_per_pair: int = 50,
    ) -> RatingsDict:
        """Calculate ratings using even sampling across model pairs.

        Args:
            battles: List of Battle objects or DataFrame with battles. A
                DataFrame must carry ``player_a`` and ``player_b`` columns.
            n_per_pair: Number of samples per model pair

        Returns:
            Dictionary of EloRating objects with confidence intervals. The
            ``battles`` field is the number of battles the player takes part
            in within each resample.
        """
        if isinstance(battles, pd.DataFrame):
            df = battles.copy()
        else:
            df = pd.DataFrame([
                {
                    "player_a": battle.player_a,
                    "player_b": battle.player_b,
                    "winner": battle.outcome.value,
                }
                for battle in battles
            ])

        # Nothing to resample; an empty list would otherwise produce a frame
        # without the columns that the grouping step needs.
        if df.empty:
            return {}

        bootstrap_results = []
        for _ in tqdm(range(self.n_bootstrap), desc="Bootstrap even sampling"):
            sampled_df = self._sample_battles_evenly(df, n_per_pair)
            battles_sample = self._dataframe_to_battles(sampled_df)
            ratings = super().calculate(battles_sample)
            bootstrap_results.append(
                {player: rating.rating for player, rating in ratings.items()}
            )

        # Every (player_a, player_b) group contributes exactly n_per_pair rows
        # to a resample, so a player's per-resample battle count is n_per_pair
        # times the number of groups that player appears in. Rows with a
        # missing player are dropped because groupby skips them as well.
        pairs = df[["player_a", "player_b"]].dropna().drop_duplicates()
        battle_counts: dict = {}
        for player_a, player_b in pairs.itertuples(index=False, name=None):
            for player in {player_a, player_b}:
                battle_counts[player] = battle_counts.get(player, 0) + n_per_pair

        return self._summarize_bootstrap(bootstrap_results, battle_counts)

    def _sample_battles_evenly(self, df: pd.DataFrame, n_per_pair: int) -> pd.DataFrame:
        """Sample battles evenly across model pairs.

        Args:
            df: DataFrame with battles, including ``player_a`` and
                ``player_b`` columns
            n_per_pair: Number of samples per pair

        Returns:
            DataFrame with ``n_per_pair`` rows drawn with replacement from
            each (player_a, player_b) group; all input columns are kept
        """
        # None makes pandas draw from NumPy's global generator.
        random_state = getattr(self, "_rng", None)

        # Sample each group explicitly and concatenate, so the grouping
        # columns stay in the result and no pandas-version-specific
        # ``apply`` keyword is required.
        samples = (
            []
            if df.empty
            else [
                group.sample(n_per_pair, replace=True, random_state=random_state)
                for _, group in df.groupby(["player_a", "player_b"])
            ]
        )
        if not samples:
            return df.iloc[:0].reset_index(drop=True)
        return pd.concat(samples, ignore_index=True)