readabs.read_rba_table

Read a table from the RBA website and store it in a pandas DataFrame.

  1"""Read a table from the RBA website and store it in a pandas DataFrame."""
  2
  3import re
  4from io import BytesIO
  5from typing import Any, cast
  6
  7from pandas import (
  8    DataFrame,
  9    DatetimeIndex,
 10    Index,
 11    Period,
 12    PeriodIndex,
 13    Series,
 14    Timestamp,
 15    period_range,
 16    read_excel,
 17)
 18
 19from readabs.download_cache import CacheError, HttpError, get_file
 20
 21# local imports
 22from readabs.rba_catalogue import rba_catalogue
 23from readabs.rba_meta_data import rba_metacol as rm
 24
 25# Constants for frequency detection
 26MONTHLY_MIN_DAYS = 28
 27MONTHLY_MAX_DAYS = 31
 28QUARTERLY_MIN_DAYS = 90
 29QUARTERLY_MAX_DAYS = 92
 30YEARLY_MIN_DAYS = 365
 31YEARLY_MAX_DAYS = 366
 32
 33
 34# --- PRIVATE ---
 35def _get_excel_file(
 36    table: str,
 37    *,
 38    ignore_errors: bool,
 39    **kwargs: Any,  # cache args
 40) -> bytes | None:
 41    """Get the Excel file from the RBA website for the given table.
 42
 43    Return bytes if successful, otherwise return None.
 44    Raises an exception if ignore_errors is False.
 45    """
 46    # get the relevant URL for a table moniker
 47    cat_map = rba_catalogue()
 48    if table not in cat_map.index:
 49        message = f"Table '{table}' not found in RBA catalogue."
 50        if ignore_errors:
 51            print(f"Ignoring error: {message}")
 52            return None
 53        raise ValueError(message)
 54    url = str(cat_map.loc[table, "URL"])
 55
 56    # get Excel file - try different file name extensions
 57    # becasue the RBA website sometimes changes the file
 58    # extension in error
 59    urls = [
 60        url,
 61    ]
 62    rex = re.compile(r"\.[^/]*$")
 63    match = rex.search(url)
 64    if match is not None:
 65        tail = match.group()
 66        replace_with = {".xls": ".xlsx", ".xlsx": ".xls"}
 67        new_url = re.sub(rex, replace_with.get(tail, tail), url)
 68        if new_url != url:
 69            urls += [new_url]
 70
 71    # try to get the Excel file - including with different exensions
 72    excel = None
 73    for this_url in urls:
 74        try:
 75            excel = get_file(this_url, **kwargs)
 76            break  # Success, exit loop
 77        except (HttpError, CacheError) as e:
 78            if this_url == urls[-1]:
 79                if ignore_errors:
 80                    print(f"Ignoring error: {e}")
 81                    return None
 82                raise
 83
 84    return excel
 85
 86
 87# --- PUBLIC ---
 88def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:  # ignore_errors
 89    """Read a table from the RBA website and return the actual data and meta data.
 90
 91    Returns the actual data and the meta data in a tuple of two DataFrames.
 92
 93    Parameters
 94    ----------
 95    table : str
 96        The table to read from the RBA website.
 97    **kwargs : Any
 98        Additional keyword arguments.
 99        The only keyword argument that is used is ignore_errors.
100    ignore_errors : bool = False
101        If True, then any major errors encountered will be printed and the function
102        will return empty DataFrames. If False, then any major errors encountered
103        will raise an exception.
104
105    Returns
106    -------
107    tuple[DataFrame, DataFrame]
108        The primary data and the meta data in a tuple of two DataFrames.
109
110    Examples
111    --------
112    ```python
113    data, meta = read_rba_table("C1")
114    ```
115
116    """
117    # set-up
118    ignore_errors = kwargs.get("ignore_errors", False)
119    data, meta = DataFrame(), DataFrame()
120
121    # get the Excel file
122    excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs)
123    if excel is None:
124        return data, meta
125
126    # read Excel file into DataFrame
127    try:
128        raw = read_excel(BytesIO(excel), header=None, index_col=None)
129    except Exception as e:
130        if ignore_errors:
131            print(f"Ignoring error: {e}")
132            return data, meta
133        raise
134
135    # extract the meta data
136    meta = raw.iloc[1:11, :].T.copy()
137    meta.columns = Index(meta.iloc[0])
138    renamer = {
139        "Mnemonic": rm.id,
140    }  # historical data is inconsistent
141    meta = meta.rename(columns=renamer)
142    meta = meta.iloc[1:, :]
143    meta.index = Index(meta[rm.id])
144    meta[rm.table] = table
145    meta[rm.tdesc] = raw.iloc[0, 0]
146    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs
147
148    # extract the data
149    data = raw.iloc[10:, :].copy()
150    data.columns = Index(data.iloc[0])
151    data = data.iloc[1:, :]
152    data.index = DatetimeIndex(data.iloc[:, 0])
153    data = data.iloc[:, 1:]
154    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs
155
156    # can we make the index into a PeriodIndex?
157    days = data.index.to_series().diff(1).dropna().dt.days
158    if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS:
159        data.index = PeriodIndex(data.index, freq="M")
160    elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS:
161        data.index = PeriodIndex(data.index, freq="Q")
162    elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS:
163        data.index = PeriodIndex(data.index, freq="Y")
164    else:
165        data.index = PeriodIndex(data.index, freq="D")
166
167    return data, meta
168
169
170def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series:  # ignore_errors
171    """Read the Official Cash Rate (OCR) from the RBA website.
172
173    Return it in a pandas Series, with either a daily or monthly PeriodIndex,
174    depending on the value of the monthly parameter. The default is monthly.
175
176    Parameters
177    ----------
178    monthly : bool = True
179        If True, then the data will be returned with a monthly PeriodIndex.
180        If False, then the data will be returned with a daily PeriodIndex.
181    **kwargs : Any
182        Additional keyword arguments. The only keyword argument that is used is ignore_errors.
183    ignore_errors : bool = False
184        If True, then any major errors encountered will be printed and the function
185        will return an empty Series. If False, then any major errors encountered
186        will raise an exception.
187
188    Returns
189    -------
190    Series
191        The OCR data in a pandas Series, with an index of either daily or monthly Periods.
192
193    Examples
194    --------
195    ```python
196    ocr = read_rba_ocr(monthly=True)
197    ```
198
199    """
200    # read the OCR table from the RBA website, make float and sort, name the series
201    rba, _rba_meta = read_rba_table("A2", **kwargs)  # should have a daily PeriodIndex
202    ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"]
203    ocr = ocr_series.astype(float).sort_index()  # pyright: ignore[reportAttributeAccessIssue]
204    ocr.name = "RBA Official Cash Rate"
205
206    # bring up to date
207    today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr)
208    last_period = cast("Period", ocr.index[-1])
209    if last_period < today:
210        ocr[today] = ocr.iloc[-1]
211
212    if not monthly:
213        # fill in missing days and return daily data
214        daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
215        return ocr.reindex(daily_index).ffill()
216
217    # convert to monthly data, keeping last value if duplicates in month
218    # fill in missing months
219    ocr.index = PeriodIndex(ocr.index, freq="M")
220    ocr = ocr[~ocr.index.duplicated(keep="last")]
221    monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
222    return ocr.reindex(monthly_index, method="ffill")
223
224
225# --- TESTING ---
226if __name__ == "__main__":
227
228    def test_read_rba_table() -> None:
229        """Test the read_rba_table function."""
230        # test with a known table
231        d, m = read_rba_table("C1")
232        print(m)
233        print(d.head())
234        print(d.tail())
235        print("=" * 20)
236
237        # test with an unknown table
238        try:
239            d, m = read_rba_table("XYZ")
240        except ValueError as e:
241            print(e)
242        print("=" * 20)
243
244    test_read_rba_table()
245
246    def test_read_rba_ocr() -> None:
247        """Test the read_rba_ocr function."""
248        # test with monthly data
249        ocr = read_rba_ocr(monthly=True)
250        print(ocr.head())
251        print("...")
252        print(ocr.tail())
253        print("=" * 20)
254
255        # test with daily data
256        ocr = read_rba_ocr(monthly=False)
257        print(ocr.head())
258        print("...")
259        print(ocr.tail())
260        print("=" * 20)
261
262    test_read_rba_ocr()
MONTHLY_MIN_DAYS = 28
MONTHLY_MAX_DAYS = 31
QUARTERLY_MIN_DAYS = 90
QUARTERLY_MAX_DAYS = 92
YEARLY_MIN_DAYS = 365
YEARLY_MAX_DAYS = 366
def read_rba_table(table: str, **kwargs: Any) -> tuple[pandas.DataFrame, pandas.DataFrame]:
 89def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:  # ignore_errors
 90    """Read a table from the RBA website and return the actual data and meta data.
 91
 92    Returns the actual data and the meta data in a tuple of two DataFrames.
 93
 94    Parameters
 95    ----------
 96    table : str
 97        The table to read from the RBA website.
 98    **kwargs : Any
 99        Additional keyword arguments.
100        The only keyword argument that is used is ignore_errors.
101    ignore_errors : bool = False
102        If True, then any major errors encountered will be printed and the function
103        will return empty DataFrames. If False, then any major errors encountered
104        will raise an exception.
105
106    Returns
107    -------
108    tuple[DataFrame, DataFrame]
109        The primary data and the meta data in a tuple of two DataFrames.
110
111    Examples
112    --------
113    ```python
114    data, meta = read_rba_table("C1")
115    ```
116
117    """
118    # set-up
119    ignore_errors = kwargs.get("ignore_errors", False)
120    data, meta = DataFrame(), DataFrame()
121
122    # get the Excel file
123    excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs)
124    if excel is None:
125        return data, meta
126
127    # read Excel file into DataFrame
128    try:
129        raw = read_excel(BytesIO(excel), header=None, index_col=None)
130    except Exception as e:
131        if ignore_errors:
132            print(f"Ignoring error: {e}")
133            return data, meta
134        raise
135
136    # extract the meta data
137    meta = raw.iloc[1:11, :].T.copy()
138    meta.columns = Index(meta.iloc[0])
139    renamer = {
140        "Mnemonic": rm.id,
141    }  # historical data is inconsistent
142    meta = meta.rename(columns=renamer)
143    meta = meta.iloc[1:, :]
144    meta.index = Index(meta[rm.id])
145    meta[rm.table] = table
146    meta[rm.tdesc] = raw.iloc[0, 0]
147    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs
148
149    # extract the data
150    data = raw.iloc[10:, :].copy()
151    data.columns = Index(data.iloc[0])
152    data = data.iloc[1:, :]
153    data.index = DatetimeIndex(data.iloc[:, 0])
154    data = data.iloc[:, 1:]
155    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs
156
157    # can we make the index into a PeriodIndex?
158    days = data.index.to_series().diff(1).dropna().dt.days
159    if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS:
160        data.index = PeriodIndex(data.index, freq="M")
161    elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS:
162        data.index = PeriodIndex(data.index, freq="Q")
163    elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS:
164        data.index = PeriodIndex(data.index, freq="Y")
165    else:
166        data.index = PeriodIndex(data.index, freq="D")
167
168    return data, meta

Read a table from the RBA website and return the actual data and meta data.

Returns the actual data and the meta data in a tuple of two DataFrames.

Parameters

table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.

Returns

tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.

Examples

data, meta = read_rba_table("C1")
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> pandas.Series:
171def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series:  # ignore_errors
172    """Read the Official Cash Rate (OCR) from the RBA website.
173
174    Return it in a pandas Series, with either a daily or monthly PeriodIndex,
175    depending on the value of the monthly parameter. The default is monthly.
176
177    Parameters
178    ----------
179    monthly : bool = True
180        If True, then the data will be returned with a monthly PeriodIndex.
181        If False, then the data will be returned with a daily PeriodIndex.
182    **kwargs : Any
183        Additional keyword arguments. The only keyword argument that is used is ignore_errors.
184    ignore_errors : bool = False
185        If True, then any major errors encountered will be printed and the function
186        will return an empty Series. If False, then any major errors encountered
187        will raise an exception.
188
189    Returns
190    -------
191    Series
192        The OCR data in a pandas Series, with an index of either daily or monthly Periods.
193
194    Examples
195    --------
196    ```python
197    ocr = read_rba_ocr(monthly=True)
198    ```
199
200    """
201    # read the OCR table from the RBA website, make float and sort, name the series
202    rba, _rba_meta = read_rba_table("A2", **kwargs)  # should have a daily PeriodIndex
203    ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"]
204    ocr = ocr_series.astype(float).sort_index()  # pyright: ignore[reportAttributeAccessIssue]
205    ocr.name = "RBA Official Cash Rate"
206
207    # bring up to date
208    today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr)
209    last_period = cast("Period", ocr.index[-1])
210    if last_period < today:
211        ocr[today] = ocr.iloc[-1]
212
213    if not monthly:
214        # fill in missing days and return daily data
215        daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
216        return ocr.reindex(daily_index).ffill()
217
218    # convert to monthly data, keeping last value if duplicates in month
219    # fill in missing months
220    ocr.index = PeriodIndex(ocr.index, freq="M")
221    ocr = ocr[~ocr.index.duplicated(keep="last")]
222    monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
223    return ocr.reindex(monthly_index, method="ffill")

Read the Official Cash Rate (OCR) from the RBA website.

Return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.

Parameters

monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.

Returns

Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.

Examples

ocr = read_rba_ocr(monthly=True)