readabs.rba_catalogue
Extract links to RBA data files from the RBA website.
"""Extract links to RBA data files from the RBA website."""

import re
from functools import cache
from typing import Any

from bs4 import BeautifulSoup, Tag
from pandas import DataFrame

from readabs.download_cache import CacheError, HttpError, get_file

# Constants
# A usable link text splits into exactly two parts: "<description> - <table id>".
EXPECTED_PAIR_LENGTH = 2


@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of RBA Catalogue numbers.

    In the first instance, this is downloaded from the RBA website, and
    cached for future use. The function itself is memoised with
    ``functools.cache``, so repeated calls with the same arguments return
    the same DataFrame object.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    return _get_rba_links(cache_only=cache_only, verbose=verbose)


def print_rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
    """Print to standard output a table of the RBA Catalogue Numbers.

    This function prints a formatted (markdown) table of RBA catalogue
    numbers to standard output. The "URL" column is left out of the table.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    None
        This function does not return anything.

    Example
    -------

    ```python
    import readabs as ra
    ra.print_rba_catalogue()
    ```

    """
    rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose)
    print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown())


def _get_soup(url: str, **kwargs: Any) -> BeautifulSoup | None:  # cache args
    """Return a BeautifulSoup object from a URL.

    Parameters
    ----------
    url : str
        The address of the page to fetch.
    **kwargs : Any
        Passed through unchanged to `get_file()`.

    Returns
    -------
    BeautifulSoup | None
        The parsed page, or None if `get_file()` raised an HttpError
        or a CacheError (the error is printed before returning).

    """
    try:
        page = get_file(url, **kwargs)
    except (HttpError, CacheError) as e:
        print(f"Error: {e}")
        return None

    # remove those pesky span tags - possibly not necessary
    page = re.sub(b"<span[^>]*>", b" ", page)
    page = re.sub(b"</span>", b" ", page)
    page = re.sub(b"\\s+", b" ", page)  # tidy up white space

    return BeautifulSoup(page, "html.parser")


def _historical_name_fix(
    moniker: str,
    foretext: str,
    prefix: str,
) -> tuple[str, str]:
    """Fix the historical data names.

    Parameters
    ----------
    moniker : str
        The table identifier taken from the link text.
    foretext : str
        The descriptive text taken from the link text.
    prefix : str
        The text placed in front of the final moniker.

    Returns
    -------
    tuple[str, str]
        The adjusted (moniker, foretext) pair.

    """
    if "Exchange Rates" in foretext:
        # Fold the original moniker into the description and file all
        # of these tables under the one identifier.
        foretext = f"{foretext} - {moniker}"
        moniker = "F11.1"

    # Only the first matching qualifier is appended to the moniker.
    for word in ["Daily", "Monthly", "Detailed", "Summary", "Allotted"]:
        if word in foretext:
            moniker = f"{moniker}-{word}"
            break

    # If the last word of the description starts with four digits,
    # append it to the moniker as well.
    last = foretext.rsplit(" ", 1)[-1]
    if re.match(r"\d{4}", last):
        moniker = f"{moniker}-{last}"

    moniker = f"{prefix}{moniker}"

    return moniker, foretext


def _excel_link_capture(
    soup: BeautifulSoup,
    prefix: str,
) -> dict[str, dict[str, str]]:
    """Capture all links (of Microsoft Excel types) from the BeautifulSoup object.

    Parameters
    ----------
    soup : BeautifulSoup
        The parsed page to search for anchor tags.
    prefix : str
        An empty string for the current tables. A non-empty string marks
        the page as historical data: some tables are then excluded and the
        remaining names are passed through `_historical_name_fix()`.

    Returns
    -------
    dict[str, dict[str, str]]
        A dictionary with the following structure:
        {moniker: {"Description": text, "URL": url}}.

    """
    # The RBA has a number of historic tables that are not well
    # formatted. We will exclude these from the dictionary.
    historic_exclusions = ("E4", "E5", "E6", "E7", "J1", "J2")
    excel_suffixes = (".xls", ".xlsx")

    link_dict: dict[str, dict[str, str]] = {}
    for link in soup.find_all("a"):
        # Ensure we have a Tag object with href attribute
        if not isinstance(link, Tag):
            continue
        href = link.get("href")
        if not href or not isinstance(href, str):
            continue
        url = href.strip()
        if not url:
            continue

        # Only keep links whose final path component is an Excel file.
        tail = url.rsplit("/", 1)[-1].lower()
        if not tail.endswith(excel_suffixes):
            continue

        url = _make_absolute_url(url)
        text = link.text.replace("\u2013", "-").strip()  # Replace EN DASH with HYPHEN

        pair = text.rsplit(" - ", 1)
        if len(pair) != EXPECTED_PAIR_LENGTH:
            continue
        foretext, moniker = pair

        if prefix:
            # Remove historical data that does not easily
            # parse under the same rules as for the current data.
            if moniker in historic_exclusions:
                continue
            if "Occasional Paper" in moniker:
                continue

            # The historical data is a bit ugly. Let's clean it up.
            moniker, foretext = _historical_name_fix(moniker, foretext, prefix)

        if moniker in link_dict:
            print(f"Warning: {moniker} already exists in the dictionary {tail}")
            # `tail` is the whole file name, so the extension must be
            # tested with endswith(); comparing it to ".xlsx" directly
            # would (almost) never be equal.
            if not tail.endswith(".xlsx"):
                # do not replace a .xlsx link with an .xls link
                continue
        link_dict[moniker] = {"Description": foretext.strip(), "URL": url}

    return link_dict


@cache
def _get_rba_links(**kwargs: Any) -> DataFrame:  # cache args
    """Extract links to RBA data files in Excel format from the RBA website.

    Parameters
    ----------
    **kwargs : Any
        Passed through to `_get_soup()`. Because this function is
        memoised with `functools.cache`, the values must be hashable.

    Returns
    -------
    DataFrame
        A DataFrame with the following columns: 'Description' and 'URL'.
        The index is the 'Table' number. Returns an empty DataFrame on error.

    """
    urls = [
        ("https://www.rba.gov.au/statistics/tables/", ""),  # current
        ("https://www.rba.gov.au/statistics/historical-data.html", "Z:"),  # history
    ]

    link_dict: dict[str, dict[str, str]] = {}
    for url, prefix in urls:
        soup = _get_soup(url, **kwargs)
        if soup is not None:
            link_dict.update(_excel_link_capture(soup, prefix))

    # One row per table, ordered by table identifier.
    rba_catalog = DataFrame(link_dict).T.sort_index()
    rba_catalog.index.name = "Table"
    return rba_catalog


# private
def _make_absolute_url(url: str, prefix: str = "https://www.rba.gov.au") -> str:
    """Convert a relative URL address found on the RBA site to an absolute URL.

    Parameters
    ----------
    url : str
        The URL address to convert.
    prefix : str = "https://www.rba.gov.au"
        The scheme and host placed in front of the address.

    Returns
    -------
    str
        The address with `prefix` at the front.

    """
    # remove a prefix if it already exists (just to be sure)
    url = url.replace(prefix, "")
    url = url.replace(prefix.replace("https://", "http://"), "")
    # then add the prefix (back) ...
    return f"{prefix}{url}"


# --- testing ---
if __name__ == "__main__":
    print_rba_catalogue(cache_only=False, verbose=False)
EXPECTED_PAIR_LENGTH =
2
@cache
def
rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Fetch the table of RBA Catalogue numbers as a DataFrame.

    The catalogue is obtained from the RBA website the first time it
    is needed, and kept in the cache for later use.

    Parameters
    ----------
    cache_only : bool = False
        When True, rely on the cache alone.
    verbose : bool = False
        When True, print progress messages.

    Returns
    -------
    DataFrame
        The RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    # Hand both options, in the same order, to the link extractor.
    options = {"cache_only": cache_only, "verbose": verbose}
    return _get_rba_links(**options)
Return a DataFrame of RBA Catalogue numbers.
In the first instance, this is downloaded from the RBA website, and cached for future use.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
DataFrame A DataFrame of RBA Catalogue numbers.
Example
import readabs as ra
catalogue = ra.rba_catalogue()
def
print_rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
def print_rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
    """Write a table of the RBA Catalogue Numbers to standard output.

    The catalogue is rendered as a markdown table, with the "URL"
    column left out, and printed.

    Parameters
    ----------
    cache_only : bool = False
        When True, rely on the cache alone.
    verbose : bool = False
        When True, print progress messages.

    Returns
    -------
    None
        Nothing is returned.

    Example
    -------

    ```python
    import readabs as ra
    ra.print_rba_catalogue()
    ```

    """
    catalogue = rba_catalogue(cache_only=cache_only, verbose=verbose)
    # Boolean mask selecting every column except the link itself.
    keep = catalogue.columns != "URL"
    table = catalogue.loc[:, keep].to_markdown()
    print(table)
Print to standard output a table of the RBA Catalogue Numbers.
This function prints a formatted table of RBA catalogue numbers to standard output.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
None This function does not return anything.
Example
import readabs as ra
ra.print_rba_catalogue()