readabs.read_rba_table
Read a table from the RBA website and store it in a pandas DataFrame.
1"""Read a table from the RBA website and store it in a pandas DataFrame.""" 2 3import re 4from io import BytesIO 5from typing import Any, cast 6 7from pandas import ( 8 DataFrame, 9 DatetimeIndex, 10 Index, 11 Period, 12 PeriodIndex, 13 Series, 14 Timestamp, 15 period_range, 16 read_excel, 17) 18 19from readabs.download_cache import CacheError, HttpError, get_file 20 21# local imports 22from readabs.rba_catalogue import rba_catalogue 23from readabs.rba_meta_data import rba_metacol as rm 24 25# Constants for frequency detection 26MONTHLY_MIN_DAYS = 28 27MONTHLY_MAX_DAYS = 31 28QUARTERLY_MIN_DAYS = 90 29QUARTERLY_MAX_DAYS = 92 30YEARLY_MIN_DAYS = 365 31YEARLY_MAX_DAYS = 366 32 33 34# --- PRIVATE --- 35def _get_excel_file( 36 table: str, 37 *, 38 ignore_errors: bool, 39 **kwargs: Any, # cache args 40) -> bytes | None: 41 """Get the Excel file from the RBA website for the given table. 42 43 Return bytes if successful, otherwise return None. 44 Raises an exception if ignore_errors is False. 45 """ 46 # get the relevant URL for a table moniker 47 cat_map = rba_catalogue() 48 if table not in cat_map.index: 49 message = f"Table '{table}' not found in RBA catalogue." 50 if ignore_errors: 51 print(f"Ignoring error: {message}") 52 return None 53 raise ValueError(message) 54 url = str(cat_map.loc[table, "URL"]) 55 56 # get Excel file - try different file name extensions 57 # becasue the RBA website sometimes changes the file 58 # extension in error 59 urls = [ 60 url, 61 ] 62 rex = re.compile(r"\.[^/]*$") 63 match = rex.search(url) 64 if match is not None: 65 tail = match.group() 66 replace_with = {".xls": ".xlsx", ".xlsx": ".xls"} 67 new_url = re.sub(rex, replace_with.get(tail, tail), url) 68 if new_url != url: 69 urls += [new_url] 70 71 # try to get the Excel file - including with different exensions 72 excel = None 73 for this_url in urls: 74 try: 75 excel = get_file(this_url, **kwargs) 76 break # Success, exit loop 77 except (HttpError, CacheError) as e: 78 if this_url == urls[-1]: 79 if ignore_errors: 80 print(f"Ignoring error: {e}") 81 return None 82 raise 83 84 return excel 85 86 87# --- PUBLIC --- 88def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]: # ignore_errors 89 """Read a table from the RBA website and return the actual data and meta data. 90 91 Returns the actual data and the meta data in a tuple of two DataFrames. 92 93 Parameters 94 ---------- 95 table : str 96 The table to read from the RBA website. 97 **kwargs : Any 98 Additional keyword arguments. 99 The only keyword argument that is used is ignore_errors. 100 ignore_errors : bool = False 101 If True, then any major errors encountered will be printed and the function 102 will return empty DataFrames. If False, then any major errors encountered 103 will raise an exception. 104 105 Returns 106 ------- 107 tuple[DataFrame, DataFrame] 108 The primary data and the meta data in a tuple of two DataFrames. 109 110 Examples 111 -------- 112 ```python 113 data, meta = read_rba_table("C1") 114 ``` 115 116 """ 117 # set-up 118 ignore_errors = kwargs.get("ignore_errors", False) 119 data, meta = DataFrame(), DataFrame() 120 121 # get the Excel file 122 excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs) 123 if excel is None: 124 return data, meta 125 126 # read Excel file into DataFrame 127 try: 128 raw = read_excel(BytesIO(excel), header=None, index_col=None) 129 except Exception as e: 130 if ignore_errors: 131 print(f"Ignoring error: {e}") 132 return data, meta 133 raise 134 135 # extract the meta data 136 meta = raw.iloc[1:11, :].T.copy() 137 meta.columns = Index(meta.iloc[0]) 138 renamer = { 139 "Mnemonic": rm.id, 140 } # historical data is inconsistent 141 meta = meta.rename(columns=renamer) 142 meta = meta.iloc[1:, :] 143 meta.index = Index(meta[rm.id]) 144 meta[rm.table] = table 145 meta[rm.tdesc] = raw.iloc[0, 0] 146 meta = meta.dropna(how="all", axis=1) # drop columns with all NaNs 147 148 # extract the data 149 data = raw.iloc[10:, :].copy() 150 data.columns = Index(data.iloc[0]) 151 data = data.iloc[1:, :] 152 data.index = DatetimeIndex(data.iloc[:, 0]) 153 data = data.iloc[:, 1:] 154 data = data.dropna(how="all", axis=1) # drop columns with all NaNs 155 156 # can we make the index into a PeriodIndex? 157 days = data.index.to_series().diff(1).dropna().dt.days 158 if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS: 159 data.index = PeriodIndex(data.index, freq="M") 160 elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS: 161 data.index = PeriodIndex(data.index, freq="Q") 162 elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS: 163 data.index = PeriodIndex(data.index, freq="Y") 164 else: 165 data.index = PeriodIndex(data.index, freq="D") 166 167 return data, meta 168 169 170def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series: # ignore_errors 171 """Read the Official Cash Rate (OCR) from the RBA website. 172 173 Return it in a pandas Series, with either a daily or monthly PeriodIndex, 174 depending on the value of the monthly parameter. The default is monthly. 175 176 Parameters 177 ---------- 178 monthly : bool = True 179 If True, then the data will be returned with a monthly PeriodIndex. 180 If False, then the data will be returned with a daily PeriodIndex. 181 **kwargs : Any 182 Additional keyword arguments. The only keyword argument that is used is ignore_errors. 183 ignore_errors : bool = False 184 If True, then any major errors encountered will be printed and the function 185 will return an empty Series. If False, then any major errors encountered 186 will raise an exception. 187 188 Returns 189 ------- 190 Series 191 The OCR data in a pandas Series, with an index of either daily or monthly Periods. 192 193 Examples 194 -------- 195 ```python 196 ocr = read_rba_ocr(monthly=True) 197 ``` 198 199 """ 200 # read the OCR table from the RBA website, make float and sort, name the series 201 rba, _rba_meta = read_rba_table("A2", **kwargs) # should have a daily PeriodIndex 202 ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"] 203 ocr = ocr_series.astype(float).sort_index() # pyright: ignore[reportAttributeAccessIssue] 204 ocr.name = "RBA Official Cash Rate" 205 206 # bring up to date 207 today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr) 208 last_period = cast("Period", ocr.index[-1]) 209 if last_period < today: 210 ocr[today] = ocr.iloc[-1] 211 212 if not monthly: 213 # fill in missing days and return daily data 214 daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D") 215 return ocr.reindex(daily_index).ffill() 216 217 # convert to monthly data, keeping last value if duplicates in month 218 # fill in missing months 219 ocr.index = PeriodIndex(ocr.index, freq="M") 220 ocr = ocr[~ocr.index.duplicated(keep="last")] 221 monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M") 222 return ocr.reindex(monthly_index, method="ffill") 223 224 225# --- TESTING --- 226if __name__ == "__main__": 227 228 def test_read_rba_table() -> None: 229 """Test the read_rba_table function.""" 230 # test with a known table 231 d, m = read_rba_table("C1") 232 print(m) 233 print(d.head()) 234 print(d.tail()) 235 print("=" * 20) 236 237 # test with an unknown table 238 try: 239 d, m = read_rba_table("XYZ") 240 except ValueError as e: 241 print(e) 242 print("=" * 20) 243 244 test_read_rba_table() 245 246 def test_read_rba_ocr() -> None: 247 """Test the read_rba_ocr function.""" 248 # test with monthly data 249 ocr = read_rba_ocr(monthly=True) 250 print(ocr.head()) 251 print("...") 252 print(ocr.tail()) 253 print("=" * 20) 254 255 # test with daily data 256 ocr = read_rba_ocr(monthly=False) 257 print(ocr.head()) 258 print("...") 259 print(ocr.tail()) 260 print("=" * 20) 261 262 test_read_rba_ocr()
89def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]: # ignore_errors 90 """Read a table from the RBA website and return the actual data and meta data. 91 92 Returns the actual data and the meta data in a tuple of two DataFrames. 93 94 Parameters 95 ---------- 96 table : str 97 The table to read from the RBA website. 98 **kwargs : Any 99 Additional keyword arguments. 100 The only keyword argument that is used is ignore_errors. 101 ignore_errors : bool = False 102 If True, then any major errors encountered will be printed and the function 103 will return empty DataFrames. If False, then any major errors encountered 104 will raise an exception. 105 106 Returns 107 ------- 108 tuple[DataFrame, DataFrame] 109 The primary data and the meta data in a tuple of two DataFrames. 110 111 Examples 112 -------- 113 ```python 114 data, meta = read_rba_table("C1") 115 ``` 116 117 """ 118 # set-up 119 ignore_errors = kwargs.get("ignore_errors", False) 120 data, meta = DataFrame(), DataFrame() 121 122 # get the Excel file 123 excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs) 124 if excel is None: 125 return data, meta 126 127 # read Excel file into DataFrame 128 try: 129 raw = read_excel(BytesIO(excel), header=None, index_col=None) 130 except Exception as e: 131 if ignore_errors: 132 print(f"Ignoring error: {e}") 133 return data, meta 134 raise 135 136 # extract the meta data 137 meta = raw.iloc[1:11, :].T.copy() 138 meta.columns = Index(meta.iloc[0]) 139 renamer = { 140 "Mnemonic": rm.id, 141 } # historical data is inconsistent 142 meta = meta.rename(columns=renamer) 143 meta = meta.iloc[1:, :] 144 meta.index = Index(meta[rm.id]) 145 meta[rm.table] = table 146 meta[rm.tdesc] = raw.iloc[0, 0] 147 meta = meta.dropna(how="all", axis=1) # drop columns with all NaNs 148 149 # extract the data 150 data = raw.iloc[10:, :].copy() 151 data.columns = Index(data.iloc[0]) 152 data = data.iloc[1:, :] 153 data.index = DatetimeIndex(data.iloc[:, 0]) 154 data = data.iloc[:, 1:] 155 data = data.dropna(how="all", axis=1) # drop columns with all NaNs 156 157 # can we make the index into a PeriodIndex? 158 days = data.index.to_series().diff(1).dropna().dt.days 159 if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS: 160 data.index = PeriodIndex(data.index, freq="M") 161 elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS: 162 data.index = PeriodIndex(data.index, freq="Q") 163 elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS: 164 data.index = PeriodIndex(data.index, freq="Y") 165 else: 166 data.index = PeriodIndex(data.index, freq="D") 167 168 return data, meta
Read a table from the RBA website and return the actual data and meta data.
Returns the actual data and the meta data in a tuple of two DataFrames.
Parameters
table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.
Returns
tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.
Examples
data, meta = read_rba_table("C1")
171def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series: # ignore_errors 172 """Read the Official Cash Rate (OCR) from the RBA website. 173 174 Return it in a pandas Series, with either a daily or monthly PeriodIndex, 175 depending on the value of the monthly parameter. The default is monthly. 176 177 Parameters 178 ---------- 179 monthly : bool = True 180 If True, then the data will be returned with a monthly PeriodIndex. 181 If False, then the data will be returned with a daily PeriodIndex. 182 **kwargs : Any 183 Additional keyword arguments. The only keyword argument that is used is ignore_errors. 184 ignore_errors : bool = False 185 If True, then any major errors encountered will be printed and the function 186 will return an empty Series. If False, then any major errors encountered 187 will raise an exception. 188 189 Returns 190 ------- 191 Series 192 The OCR data in a pandas Series, with an index of either daily or monthly Periods. 193 194 Examples 195 -------- 196 ```python 197 ocr = read_rba_ocr(monthly=True) 198 ``` 199 200 """ 201 # read the OCR table from the RBA website, make float and sort, name the series 202 rba, _rba_meta = read_rba_table("A2", **kwargs) # should have a daily PeriodIndex 203 ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"] 204 ocr = ocr_series.astype(float).sort_index() # pyright: ignore[reportAttributeAccessIssue] 205 ocr.name = "RBA Official Cash Rate" 206 207 # bring up to date 208 today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr) 209 last_period = cast("Period", ocr.index[-1]) 210 if last_period < today: 211 ocr[today] = ocr.iloc[-1] 212 213 if not monthly: 214 # fill in missing days and return daily data 215 daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D") 216 return ocr.reindex(daily_index).ffill() 217 218 # convert to monthly data, keeping last value if duplicates in month 219 # fill in missing months 220 ocr.index = PeriodIndex(ocr.index, freq="M") 221 ocr = ocr[~ocr.index.duplicated(keep="last")] 222 monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M") 223 return ocr.reindex(monthly_index, method="ffill")
Read the Official Cash Rate (OCR) from the RBA website.
Return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.
Parameters
monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.
Returns
Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.
Examples
ocr = read_rba_ocr(monthly=True)