Coverage for src/duelboard/calculators/bootstrap.py: 100%

60 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-14 19:18 +0900

1"""Bootstrap Elo calculator for confidence intervals.""" 

2 

3import numpy as np 

4import pandas as pd 

5from tqdm import tqdm 

6 

7from duelboard.models import Battle, EloRating 

8from duelboard.types import RatingsDict 

9 

10from .base import EloCalculator 

11 

12 

class BootstrapEloCalculator(EloCalculator):
    """Bootstrap Elo calculator that provides confidence intervals.

    Ratings are recomputed ``n_bootstrap`` times on resampled battles. For
    each player the median of the resampled ratings is reported as the
    rating, and the empirical quantiles of those ratings form the
    confidence interval.
    """

    def __init__(
        self,
        k_factor: float = 4,
        scale: float = 400,
        base: float = 10,
        initial_rating: float = 1000,
        n_bootstrap: int = 1000,
        confidence_level: float = 0.95,
        random_seed: int | None = None,
    ) -> None:
        """Initialize the Bootstrap Elo calculator.

        Args:
            k_factor: K-factor for Elo updates
            scale: Scale parameter for Elo calculation
            base: Base for exponential calculation
            initial_rating: Initial rating for new players
            n_bootstrap: Number of bootstrap samples
            confidence_level: Confidence level for intervals (e.g., 0.95 for 95%)
            random_seed: Random seed for reproducibility. When ``None`` the
                process-wide NumPy generator is used for sampling.
        """
        super().__init__(k_factor, scale, base, initial_rating)
        self.n_bootstrap = n_bootstrap
        self.confidence_level = confidence_level
        self.random_seed = random_seed

        # A private legacy RandomState produces the same draws that
        # ``np.random.seed(random_seed)`` followed by ``np.random.*`` calls
        # would, but without resetting the process-wide generator, so other
        # code (or another calculator instance) cannot disturb this stream.
        self._rng: np.random.RandomState | None = (
            np.random.RandomState(random_seed) if random_seed is not None else None
        )

    def calculate(self, battles: list[Battle] | pd.DataFrame) -> RatingsDict:
        """Calculate Elo ratings with bootstrap confidence intervals.

        Args:
            battles: List of Battle objects or DataFrame with battles

        Returns:
            Dictionary mapping player names to EloRating objects with
            confidence intervals. ``battles`` on each rating is the number of
            input battles the player took part in.
        """
        if isinstance(battles, pd.DataFrame):
            battles = self._dataframe_to_battles(battles)

        bootstrap_results = []
        for _ in tqdm(range(self.n_bootstrap), desc="Bootstrap sampling"):
            ratings = super().calculate(self._bootstrap_sample(battles))
            bootstrap_results.append(
                {player: rating.rating for player, rating in ratings.items()}
            )

        # Count battles per player in a single pass. The set ensures a battle
        # is counted once even if both sides name the same player.
        battle_counts: dict = {}
        for battle in battles:
            for player in {battle.player_a, battle.player_b}:
                battle_counts[player] = battle_counts.get(player, 0) + 1

        return self._summarize_bootstrap(bootstrap_results, battle_counts)

    def _bootstrap_sample(self, battles: list[Battle]) -> list[Battle]:
        """Create a bootstrap sample of battles.

        Args:
            battles: Original list of battles

        Returns:
            Bootstrap sample of battles (drawn with replacement, same length
            as the input). An empty input yields an empty list.
        """
        n_battles = len(battles)
        if n_battles == 0:
            return []

        # Fall back to the global NumPy generator when no seed was given.
        rng = self._rng if self._rng is not None else np.random
        indices = rng.choice(n_battles, size=n_battles, replace=True)
        return [battles[i] for i in indices]

    def calculate_even_sample(
        self,
        battles: list[Battle] | pd.DataFrame,
        n_per_pair: int = 50,
    ) -> RatingsDict:
        """Calculate ratings using even sampling across model pairs.

        Args:
            battles: List of Battle objects or DataFrame with battles
            n_per_pair: Number of samples per model pair

        Returns:
            Dictionary of EloRating objects with confidence intervals.
            ``battles`` on each rating is the number of rows drawn per
            bootstrap round that involve the player, i.e. ``n_per_pair``
            times the number of distinct (player_a, player_b) pairings
            containing that player.
        """
        df = (
            battles.copy()
            if isinstance(battles, pd.DataFrame)
            else pd.DataFrame([
                {
                    "player_a": battle.player_a,
                    "player_b": battle.player_b,
                    "winner": battle.outcome.value,
                }
                for battle in battles
            ])
        )

        bootstrap_results = []
        for _ in tqdm(range(self.n_bootstrap), desc="Bootstrap even sampling"):
            sampled_df = self._sample_battles_evenly(df, n_per_pair)
            battles_sample = self._dataframe_to_battles(sampled_df)
            ratings = super().calculate(battles_sample)
            bootstrap_results.append(
                {player: rating.rating for player, rating in ratings.items()}
            )

        # Every distinct pairing contributes exactly n_per_pair sampled rows
        # per round, so the per-player count is fixed across rounds.
        # NOTE(review): assumes every sampled row is turned into a battle by
        # _dataframe_to_battles -- confirm against that helper.
        battle_counts: dict = {}
        if not df.empty:
            pairings = df[["player_a", "player_b"]].dropna().drop_duplicates()
            for player_a, player_b in pairings.itertuples(index=False, name=None):
                for player in {player_a, player_b}:
                    battle_counts[player] = battle_counts.get(player, 0) + n_per_pair

        return self._summarize_bootstrap(bootstrap_results, battle_counts)

    def _summarize_bootstrap(
        self,
        bootstrap_results: list[dict],
        battle_counts: dict,
    ) -> RatingsDict:
        """Collapse per-round ratings into medians and confidence intervals.

        Args:
            bootstrap_results: One ``{player: rating}`` mapping per round
            battle_counts: Battle count to report for each player; players
                missing from the mapping are reported with 0 battles

        Returns:
            Dictionary mapping each player to an EloRating whose rating is
            the median across rounds and whose interval spans the
            ``alpha / 2`` and ``1 - alpha / 2`` quantiles.
        """
        # One row per round, one column per player; a player absent from a
        # round becomes NaN, which pandas skips in median/quantile.
        bootstrap_df = pd.DataFrame(bootstrap_results)

        alpha = 1 - self.confidence_level
        lower_quantile = alpha / 2
        upper_quantile = 1 - alpha / 2

        results = {}
        for player in bootstrap_df.columns:
            samples = bootstrap_df[player]
            results[player] = EloRating(
                player=player,
                rating=samples.median(),
                confidence_interval=(
                    samples.quantile(lower_quantile),
                    samples.quantile(upper_quantile),
                ),
                battles=battle_counts.get(player, 0),
            )

        return results

    def _sample_battles_evenly(self, df: pd.DataFrame, n_per_pair: int) -> pd.DataFrame:
        """Sample battles evenly across model pairs.

        Args:
            df: DataFrame with battles (needs ``player_a`` and ``player_b``
                columns)
            n_per_pair: Number of samples per pair

        Returns:
            DataFrame with ``n_per_pair`` rows drawn with replacement from
            each (player_a, player_b) group, all original columns kept and a
            fresh 0..n-1 index.
        """
        # Iterate the groups directly instead of groupby.apply: each
        # sub-frame then keeps its player_a/player_b columns, so the sampled
        # frame has the same schema as the input.
        sampled_groups = [
            grp.sample(n_per_pair, replace=True, random_state=self._rng)
            for _, grp in df.groupby(["player_a", "player_b"])
        ]
        if not sampled_groups:
            # pd.concat refuses an empty list; return an empty frame with the
            # input's columns instead.
            return df.iloc[0:0].reset_index(drop=True)
        return pd.concat(sampled_groups, ignore_index=True)