readabs.rba_catalogue

Extract links to RBA data files from the RBA website.

  1"""Extract links to RBA data files from the RBA website."""
  2
  3import re
  4from functools import cache
  5from typing import Any
  6
  7from bs4 import BeautifulSoup, Tag
  8from pandas import DataFrame
  9
 10from readabs.download_cache import CacheError, HttpError, get_file
 11
 12# Constants: number of parts expected when a link's text is split once on its last " - " (foretext, moniker)
 13EXPECTED_PAIR_LENGTH = 2
 14
 15
 16@cache
 17def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
 18    """Return a DataFrame of RBA Catalogue numbers.
 19
 20    In the first instance, this is downloaded from the RBA website, and
 21    cached for future use.
 22
 23    Parameters
 24    ----------
 25    cache_only : bool = False
 26        If True, only use the cache.
 27    verbose : bool = False
 28        If True, print progress messages.
 29
 30    Returns
 31    -------
 32    DataFrame
 33        A DataFrame of RBA Catalogue numbers.
 34
 35    Example
 36    -------
 37    ```python
 38    import readabs as ra
 39    catalogue = ra.rba_catalogue()
 40    ```
 41
 42    """
 43    return _get_rba_links(cache_only=cache_only, verbose=verbose)
 44
 45
 46def print_rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
 47    """Print to standard output a table of the RBA Catalogue Numbers.
 48
 49    This function prints a formatted table of RBA catalogue numbers
 50    to standard output.
 51
 52    Parameters
 53    ----------
 54    cache_only : bool = False
 55        If True, only use the cache.
 56    verbose : bool = False
 57        If True, print progress messages.
 58
 59    Returns
 60    -------
 61    None
 62        This function does not return anything.
 63
 64    Example
 65    -------
 66
 67    ```python
 68    import readabs as ra
 69    ra.print_rba_catalogue()
 70    ```
 71
 72    """
 73    rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose)
 74    print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown())
 75
 76
 77def _get_soup(url: str, **kwargs: Any) -> BeautifulSoup | None:  # cache args
 78    """Return a BeautifulSoup object from a URL.
 79
 80    Returns None on error.
 81    """
 82    try:
 83        page = get_file(url, **kwargs)
 84    except (HttpError, CacheError) as e:
 85        print(f"Error: {e}")
 86        return None
 87
 88    # remove those pesky span tags - possibly not necessary
 89    page = re.sub(b"<span[^>]*>", b" ", page)
 90    page = re.sub(b"</span>", b" ", page)
 91    page = re.sub(b"\\s+", b" ", page)  # tidy up white space
 92
 93    return BeautifulSoup(page, "html.parser")
 94
 95
 96def _historical_name_fix(
 97    moniker: str,
 98    foretext: str,
 99    prefix: str,
100) -> tuple[str, str]:
101    """Fix the historical data names. Returns a tuple of moniker and foretext."""
102    if "Exchange Rates" in foretext:
103        foretext = f"{foretext} - {moniker}"
104        moniker = "F11.1"
105
106    for word in ["Daily", "Monthly", "Detailed", "Summary", "Allotted"]:
107        if word in foretext:
108            moniker = f"{moniker}-{word}"
109            break
110
111    last = foretext.rsplit(" ", 1)[-1]
112    if re.match(r"\d{4}", last):
113        moniker = f"{moniker}-{last}"
114
115    moniker = f"{prefix}{moniker}"
116
117    return moniker, foretext
118
119
def _excel_link_capture(
    soup: BeautifulSoup,
    prefix: str,
) -> dict[str, dict[str, str]]:
    """Capture all links (of Microsoft Excel types) from the BeautifulSoup object.

    Only anchors whose href ends in ``.xls`` or ``.xlsx`` (case
    insensitive) are considered. The link text is split on its last
    " - " into a description and a table moniker; links whose text does
    not split into exactly two parts are skipped.

    A non-empty ``prefix`` marks the page as historical data: a fixed
    set of awkward tables and any "Occasional Paper" entries are
    dropped, and the names are tidied by ``_historical_name_fix``.

    When a moniker is seen more than once a warning is printed, and the
    existing entry is kept unless the new link is an ``.xlsx`` file.

    Returns a dictionary with the following structure:
    {moniker: {"Description": text, "URL": url}}.
    """
    # The RBA has a number of historic tables that are not well
    # formatted. We will exclude these from the dictionary.
    historic_exclusions = ("E4", "E5", "E6", "E7", "J1", "J2")

    link_dict: dict[str, dict[str, str]] = {}
    for link in soup.find_all("a"):
        # Ensure we have a Tag object with href attribute
        if not isinstance(link, Tag):
            continue
        href = link.get("href")
        if not href or not isinstance(href, str):
            continue
        url = href.strip()
        if not url:
            continue

        # Only the final path component decides whether this is Excel.
        tail = url.rsplit("/", 1)[-1].lower()
        if not tail.endswith((".xls", ".xlsx")):
            continue
        url = _make_absolute_url(url)
        text = link.text.replace("\u2013", "-").strip()  # Replace EN DASH with HYPHEN

        pair = text.rsplit(" - ", 1)
        if len(pair) != EXPECTED_PAIR_LENGTH:
            continue
        foretext, moniker = pair

        if prefix:
            # Remove historical data that does not easily
            # parse under the same rules as for the current data.
            if moniker in historic_exclusions:
                continue
            if "Occasional Paper" in moniker:
                continue

            # The historical data is a bit ugly. Let's clean it up.
            moniker, foretext = _historical_name_fix(moniker, foretext, prefix)

        if moniker in link_dict:
            print(f"Warning: {moniker} already exists in the dictionary {tail}")
            # `tail` is the whole file name, so test its suffix rather
            # than comparing it with the bare extension (which never
            # matched, so a duplicate could never replace an entry).
            if not tail.endswith(".xlsx"):
                # do not replace a .xlsx link with an .xls link
                continue
        link_dict[moniker] = {"Description": foretext.strip(), "URL": url}

    return link_dict
177
178
@cache
def _get_rba_links(**kwargs: Any) -> DataFrame:  # cache args
    """Build the catalogue of Excel data files linked from the RBA website.

    Two pages are scanned, current tables first and then historical
    data; monikers from the historical page carry a "Z:" prefix. A page
    for which ``_get_soup`` returns None is skipped.

    Returns a DataFrame with the following columns: 'Description' and 'URL'.
    The index is the 'Table' number. Returns an empty DataFrame on error.
    """
    pages = (
        ("https://www.rba.gov.au/statistics/tables/", ""),  # current
        ("https://www.rba.gov.au/statistics/historical-data.html", "Z:"),  # history
    )

    captured: dict[str, dict[str, str]] = {}
    for page_url, prefix in pages:
        soup = _get_soup(page_url, **kwargs)
        if soup is None:
            continue
        captured.update(_excel_link_capture(soup, prefix))

    # One row per moniker: transpose the {moniker: {...}} mapping.
    catalogue = DataFrame(captured).T.sort_index()
    catalogue.index.name = "Table"
    return catalogue
200
201
202# private
203def _make_absolute_url(url: str, prefix: str = "https://www.rba.gov.au") -> str:
204    """Convert a relative URL address found on the RBA site to an absolute URL.
205
206    Takes a relative URL and converts it to an absolute URL address.
207    """
208    # remove a prefix if it already exists (just to be sure)
209    url = url.replace(prefix, "")
210    url = url.replace(prefix.replace("https://", "http://"), "")
211    # then add the prefix (back) ...
212    return f"{prefix}{url}"
213
214
215# --- testing: when run as a script, print the catalogue table (cache_only and verbose both off) ---
216if __name__ == "__main__":
217    print_rba_catalogue(cache_only=False, verbose=False)
EXPECTED_PAIR_LENGTH = 2
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return the catalogue of RBA data tables as a DataFrame.

    The catalogue is assembled by ``_get_rba_links`` and the result is
    memoised, so repeated calls with the same arguments return the
    same DataFrame object without rebuilding it.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    # Both options are forwarded unchanged to the link extractor.
    options = {"cache_only": cache_only, "verbose": verbose}
    return _get_rba_links(**options)

Return a DataFrame of RBA Catalogue numbers.

In the first instance, this is downloaded from the RBA website, and cached for future use.

Parameters

- cache_only : bool = False. If True, only use the cache.
- verbose : bool = False. If True, print progress messages.

Returns

DataFrame: A DataFrame of RBA Catalogue numbers, indexed by 'Table', with 'Description' and 'URL' columns.

Example

import readabs as ra
catalogue = ra.rba_catalogue()