readabs.grab_abs_url
Find and extract DataFrames from an ABS webpage.
"""Find and extract DataFrames from an ABS webpage."""

# --- imports ---
# standard library imports
import zipfile
from functools import cache
from io import BytesIO
from pathlib import Path
from typing import Any, Unpack

# analytic imports
import pandas as pd
from pandas import DataFrame

from readabs.abs_catalogue import abs_catalogue
from readabs.download_cache import get_file

# local imports
from readabs.get_abs_links import get_abs_links, get_table_name
from readabs.read_support import HYPHEN, ReadArgs, check_kwargs, get_args

# --- constants ---
# File extensions for ABS data files
ZIP_EXTENSION = ".zip"
EXCEL_EXTENSION = ".xlsx"

# Processing order: ZIP files must be processed before Excel files
# This prevents duplicate data when ZIP files contain Excel files
FILE_EXTENSIONS_PROCESSING_ORDER = (ZIP_EXTENSION, EXCEL_EXTENSION)

# Default values and limits
EMPTY_BYTES_LENGTH = 0


# --- public - primary entry point for this module
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """For a given URL, extract the data from the Excel and ZIP file links found on that page.

    The data is returned as a dictionary of DataFrames. The Excel files are converted
    into DataFrames, with each sheet in each Excel file becoming a separate DataFrame.
    ZIP files are examined for Excel files, which are similarly converted into
    DataFrames.

    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
    or `read_abs_series()` functions. This function is provided for those
    cases where the data is not available in the ABS catalogue, where the
    data is not a timeseries, or where the user wants to extract data from
    a specific ABS landing page.

    Because the function is wrapped in `functools.cache`, a repeated call with
    the same arguments returns the very same dictionary object as the first call.

    Parameters
    ----------
    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or
        a catalogue number must be provided. If both are provided, the
        URL will be used.

    cat : str = ""
        An ABS Catalogue number. If provided, and the URL is not
        provided, then the Catalogue number will be used to get the URL.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames. Empty if no data file links were found.

    Raises
    ------
    ValueError
        If neither a URL nor a resolvable catalogue number is provided.

    """
    # check/get the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    if verbose := args["verbose"]:
        print(f"grab_abs_url(): {url=}, {args=}")

    # get the URL links to the relevant ABS data files on that webpage
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # return an empty Dictionary

    # read the data files into a dictionary of DataFrames
    abs_dict: dict[str, DataFrame] = {}

    # Process single file requests first
    abs_dict = _process_single_files(abs_dict, links, args, verbose=verbose)
    if abs_dict:  # If single file was found and processed, return it
        return abs_dict

    # Process all files based on configuration
    return _process_all_files(abs_dict, links, args)


def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Grab and process a single ABS ZIP file from a file system location.

    This is a convenience function that opens an ABS ZIP file from a local
    filesystem path. It is expected to be used rarely.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # get the valid kwargs

    zp: Path = zip_path if isinstance(zip_path, Path) else Path(zip_path)
    zip_bytes = zp.read_bytes()
    abs_dict: dict[str, DataFrame] = {}
    return _process_zip(abs_dict, zip_bytes, **args)


# --- private
def _process_single_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
    *,
    verbose: bool,
) -> dict[str, DataFrame]:
    """Process single file requests (single_excel_only or single_zip_only).

    A single Excel request is tried before a single ZIP request. If the
    requested file cannot be found among the links, abs_dict is returned
    unchanged so the caller can fall back to processing all files.
    """
    if args["single_excel_only"]:
        link = _find_url(links, EXCEL_EXTENSION, args["single_excel_only"], verbose=verbose)
        if link:
            return _add_excel(abs_dict, link, **args)

    if args["single_zip_only"]:
        link = _find_url(links, ZIP_EXTENSION, args["single_zip_only"], verbose=verbose)
        if link:
            return _add_zip(abs_dict, link, **args)

    return abs_dict


def _process_all_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Process all files based on configuration (get_zip, get_excel, etc.).

    ZIP links are handled before Excel links (see
    FILE_EXTENSIONS_PROCESSING_ORDER) so that _add_excel() can skip tables
    that a ZIP file has already supplied.
    """
    # The decision does not depend on the individual link, so make it once.
    process_excel = _should_process_excel_file(args, links)

    for link_type in FILE_EXTENSIONS_PROCESSING_ORDER:
        for link in links.get(link_type, []):
            if link_type == ZIP_EXTENSION and args["get_zip"]:
                abs_dict = _add_zip(abs_dict, link, **args)
            elif link_type == EXCEL_EXTENSION and process_excel:
                abs_dict = _add_excel(abs_dict, link, **args)
    return abs_dict


def _should_process_excel_file(args: dict[str, Any], links: dict[str, list[str]]) -> bool:
    """Determine if Excel files should be processed based on configuration.

    Excel files are processed if:
    1. get_excel is explicitly True, or
    2. get_excel_if_no_zip is True AND (get_zip is False OR no ZIP files are available)

    Args:
        args: Configuration arguments from user
        links: Dictionary of available file links by type

    Returns:
        bool: True if Excel files should be processed

    """
    # Always process if explicitly requested
    if args["get_excel"]:
        return True

    # Process Excel if requested when no ZIP files, and either:
    # - ZIP processing is disabled, or
    # - No ZIP files are available
    if args["get_excel_if_no_zip"]:
        zip_processing_disabled = not args["get_zip"]
        no_zip_files_available = not links.get(ZIP_EXTENSION, [])
        return zip_processing_disabled or no_zip_files_available

    return False


def _find_url(links: dict[str, list[str]], targ_type: str, target: str, *, verbose: bool = False) -> str:
    """Find the URL for a target file type.

    Args:
        links: Dictionary mapping file types to lists of URLs
        targ_type: Target file extension (e.g., '.xlsx', '.zip')
        target: Target filename without extension
        verbose: Whether to print debug information

    Returns:
        str: The first URL ending in target + targ_type, otherwise an empty string

    """
    targ_list = links.get(targ_type, [])
    if not targ_list:
        return ""
    goal = f"{target}{targ_type}"
    if verbose:
        print(f"_find_url(): looking for {goal} in {targ_list}.")
    for link in targ_list:
        if link.endswith(goal):
            return link
    return ""


def _get_url(url: str, cat: str) -> str:
    """Get URL from provided URL or catalogue number.

    If an ABS catalogue number is provided and URL is not provided,
    get the URL for the ABS data files on the ABS webpage.
    Otherwise, return the URL provided. Either the 'url' or
    'cat' argument must be provided.

    Args:
        url: The URL to use if provided
        cat: The catalogue number to use if URL is not provided

    Returns:
        str: The URL to use for data retrieval

    Raises:
        ValueError: If neither URL nor valid catalogue number is provided

    """
    if not url and cat:
        try:
            cat_map = abs_catalogue()
            if cat in cat_map.index:
                url = str(cat_map.loc[cat, "URL"])
        except (KeyError, IndexError) as e:
            raise ValueError(f"Catalogue number '{cat}' not found in ABS catalogue: {e}") from e
        except (ConnectionError, TimeoutError) as e:
            raise ValueError(f"Network error retrieving catalogue for '{cat}': {e}") from e
        except (ValueError, TypeError) as e:
            raise ValueError(f"Invalid catalogue data for '{cat}': {e}") from e

        # The catalogue was read, but it has no entry for this number.
        if not url and cat:
            raise ValueError(
                f"Catalogue number '{cat}' not found in the ABS Time Series Directory. "
                f"This may be a discontinued series. If you know the ABS landing page URL, "
                f"you can use: read_abs_cat(cat='{cat}', url='https://www.abs.gov.au/...')"
            )

    if not url:
        raise ValueError("_get_url(): no URL or valid catalogue number provided.")

    return url


def _process_zip(
    abs_dict: dict[str, DataFrame],
    zip_contents: bytes,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read and process a ZIP file's contents from bytes.

    Every member of the archive is handed to _add_excel_bytes(), which
    ignores members that are empty or cannot be read as an Excel file.
    """
    if len(zip_contents) == EMPTY_BYTES_LENGTH:
        return abs_dict

    with zipfile.ZipFile(BytesIO(zip_contents)) as zipped:
        for element in zipped.infolist():
            # get the zipfile into pandas
            table_name = get_table_name(url=element.filename)
            raw_bytes = zipped.read(element.filename)
            abs_dict = _add_excel_bytes(abs_dict, raw_bytes, table_name, args)

    return abs_dict


def _add_zip(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read and process a ZIP file from a URL.

    Downloads the ZIP file and iterates over its contents, calling
    _add_excel_bytes() to extract Excel files and add their contents
    to the DataFrames dictionary.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the ZIP file
        **args: Additional arguments passed to file retrieval functions

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from ZIP contents

    """
    zip_contents = get_file(link, **args)
    return _process_zip(abs_dict, zip_contents, **args)


def _add_excel_bytes(
    abs_dict: dict[str, DataFrame],
    raw_bytes: bytes,
    name: str,
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Convert Excel file bytes to DataFrames and add to dictionary.

    Processes the bytes as an Excel file, converting each sheet to a DataFrame
    and adding them to the dictionary using f"{name}{HYPHEN}{sheet_name}" as keys.
    Bytes that cannot be opened as an Excel file are reported with a printed
    message and otherwise ignored.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        raw_bytes: Bytes content of the Excel file
        name: Base name for the Excel file
        args: Dictionary of processing arguments

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from Excel sheets

    """
    verbose = args.get("verbose", False)

    if len(raw_bytes) == EMPTY_BYTES_LENGTH:
        if verbose:
            print("_add_excel_bytes(): the raw bytes are empty.")
        return abs_dict

    # convert the raw bytes into a pandas ExcelFile
    # (.xlsx is a ZIP container, so a truncated file can surface as BadZipFile)
    try:
        excel = pd.ExcelFile(BytesIO(raw_bytes))
    except (ValueError, TypeError, zipfile.BadZipFile) as e:
        message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}"
        print(message)
        return abs_dict

    # iterate over the sheets in the Excel file; the context manager
    # closes the workbook once every sheet has been read
    with excel:
        for sheet_name in excel.sheet_names:
            # grab and go - no treatment of the data
            sheet_data = excel.parse(sheet_name)
            if len(sheet_data) == 0:  # no data rows on this sheet
                if verbose:
                    print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.")
                continue
            abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data

    # return the dictionary of DataFrames
    return abs_dict


def _add_excel(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read in an Excel file at the URL in the 'link' argument.

    Pass those bytes to _add_excel_bytes() to put the contents
    into the dictionary of DataFrames given by 'abs_dict'. When done,
    return the dictionary of DataFrames.

    The download is skipped when the dictionary already holds sheets for
    this table (for example, because a ZIP file processed earlier
    contained the same workbook).
    """
    name = get_table_name(link)

    # _add_excel_bytes() stores sheets under f"{name}{HYPHEN}{sheet_name}", so the
    # bare table name is never itself a key; look for the key prefix instead.
    # NOTE(review): assumes HYPHEN is a non-empty separator -- confirm in read_support.
    sheet_prefix = f"{name}{HYPHEN}"
    if name in abs_dict or any(key.startswith(sheet_prefix) for key in abs_dict):
        # table already in the dictionary
        return abs_dict

    raw_bytes = get_file(link, **args)

    return _add_excel_bytes(abs_dict, raw_bytes, name, args)


# --- main ---
if __name__ == "__main__":

    def simple_test() -> None:
        """Test the grab_abs_url and grab_abs_zip functions."""

        def test(name: str, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- {name}")
            try:
                data_dict = grab_abs_url(**kwargs)
                print("---")
                if not data_dict:
                    print("PROBLEM -- No data found.")
                else:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        def test_zip(zip_path: Path, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- grab_abs_zip() with {zip_path}")
            try:
                data_dict = grab_abs_zip(zip_path, **kwargs)
                print("---")
                if not data_dict:
                    print("PROBLEM -- No data found.")
                else:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        # 4 -- grab from a ZIP file on the local filesystem
        _name = ".test-data/Qrtly-CPI-Time-series-spreadsheets-all.zip"
        _zip_path = Path(_name)
        test_zip(_zip_path, verbose=True)

        # --- various grab_abs_url() tests

        name = "1 -- grab a single zip file"
        test(
            name,
            cat="6291.0.55.001",
            single_zip_only="p6291_all_quarterly_spreadsheets",
            get_zip=True,
            verbose=True,
        )

        name = "2 -- grab a single Excel file"
        test(
            name,
            cat="6202.0",
            get_excel=True,
            single_excel_only="6202001",
            verbose=False,
        )

        # 3 -- grab the whole shebang
        urls = [
            "https://www.abs.gov.au/statistics/labour/jobs/weekly-payroll-jobs/latest-release",
            "https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/dec-2023",
        ]
        for i, url_ in enumerate(urls):
            name = f"3.{i} -- grab the whole shebang {url_}"
            test(name, url=url_, verbose=True)

    simple_test()
@cache  # memoise so identical repeat requests are answered immediately
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Collect the data tables linked from an ABS web page.

    Every Excel and ZIP link found on the page is a candidate. Each sheet of
    each Excel workbook becomes its own DataFrame, and ZIP archives are
    searched for Excel workbooks that are treated the same way.

    `read_abs_cat()` and `read_abs_series()` remain the preferred way to
    read ABS data. Use this function when the data is not in the ABS
    catalogue, is not a timeseries, or when a specific ABS landing page
    is the target.

    Parameters
    ----------
    url : str = ""
        A URL for an ABS Catalogue landing page. One of `url` or `cat`
        is required; `url` wins when both are given.

    cat : str = ""
        An ABS Catalogue number, used to look up the URL when `url`
        is not given.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames (empty when the page has no data links).

    """
    # resolve the landing page first, then validate the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")
    args = get_args(kwargs, "grab_abs_url")
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # discover the data-file links published on the page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}

    # a request for one named file takes priority over bulk processing
    found = _process_single_files({}, links, args, verbose=verbose)
    if found:
        return found

    # nothing matched a single-file request: process according to the flags
    return _process_all_files(found, links, args)
For a given URL, extract the data from the Excel and ZIP file links found on that page.
The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.
The preferred mechanism for reading ABS data is to use the read_abs_cat()
or read_abs_series() functions. This function is provided for those
cases where the data is not available in the ABS catalogue, where the
data is not a timeseries, or where the user wants to extract data from
a specific ABS landing page.
Parameters
url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.
cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.
**kwargs : Unpack[ReadArgs]
Accepts the same keyword arguments as read_abs_cat().
Returns
dict[str, DataFrame] A dictionary of DataFrames.
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Load the DataFrames held in an ABS ZIP file stored on disk.

    A convenience wrapper for the uncommon case where the ZIP file is
    already on the local filesystem rather than behind a URL.

    Parameters
    ----------
    zip_path : Path | str
        Where the ABS ZIP file lives on the local filesystem.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    # validate first (warns about unknown keywords), then normalise
    check_kwargs(kwargs, "grab_abs_zip")
    args = get_args(kwargs, "grab_abs_zip")

    # accept either a ready-made Path or a plain string
    source = zip_path
    if not isinstance(source, Path):
        source = Path(source)

    frames: dict[str, DataFrame] = {}
    return _process_zip(frames, source.read_bytes(), **args)
Grab and process a single ABS ZIP file from a file system location.
This is a convenience function that opens an ABS ZIP file from a local filesystem path. It is expected to be used rarely.
Parameters
zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.
**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.
Returns
dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.