readabs.grab_abs_url

Find and extract DataFrames from an ABS webpage.

  1"""Find and extract DataFrames from an ABS webpage."""
  2
  3# --- imports ---
  4# standard library imports
  5import zipfile
  6from functools import cache
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Unpack
 10
 11# analytic imports
 12import pandas as pd
 13from pandas import DataFrame
 14
 15from readabs.abs_catalogue import abs_catalogue
 16from readabs.download_cache import get_file
 17
 18# local imports
 19from readabs.get_abs_links import get_abs_links, get_table_name
 20from readabs.read_support import HYPHEN, ReadArgs, check_kwargs, get_args
 21
# --- constants ---
# File extensions used as keys into the links dictionary and as URL suffixes
ZIP_EXTENSION = ".zip"
EXCEL_EXTENSION = ".xlsx"

# Processing order: ZIP files must be processed before Excel files.
# The intent is to avoid duplicate data when a ZIP file contains the same
# Excel files that are also linked individually on the page.
FILE_EXTENSIONS_PROCESSING_ORDER = (ZIP_EXTENSION, EXCEL_EXTENSION)

# Length that marks "nothing here"; compared against len() of downloaded
# bytes and also against len() of a parsed sheet (its row count).
EMPTY_BYTES_LENGTH = 0
 33
 34
 35# --- public - primary entry point for this module
 36@cache  # minimise slowness with repeat business
 37def grab_abs_url(
 38    cat: str = "",
 39    url: str = "",
 40    **kwargs: Unpack[ReadArgs],
 41) -> dict[str, DataFrame]:
 42    """For a given URL, extract the data from the Excel and ZIP file links found on that page.
 43
 44    The data is returned as a dictionary of DataFrames. The Excel files are converted
 45    into DataFrames, with each sheet in each Excel file becoming a separate DataFrame.
 46    ZIP files are examined for Excel files, which are similarly converted into
 47    DataFrames. The dictionary of DataFrames is returned.
 48
 49    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
 50    or `read_abs_series()` functions. This function is provided for those
 51    cases where the data is not available in the ABS catalogue, where the
 52    data is not a timeseries, or where the user wants to extract data from
 53    a specific ABS landingpage.
 54
 55
 56    Parameters
 57    ----------
 58    url : str = ""
 59        A URL for an ABS Catalogue landing page. Either a url or
 60        a catalogue number must be provided. If both are provided, the
 61        URL will be used.
 62
 63    cat : str = ""
 64        An ABS Catalogue number. If provided, and the URL is not
 65        provided, then the Catalogue number will be used to get the URL.
 66
 67    **kwargs : Unpack[ReadArgs]
 68        Accepts the same keyword arguments as `read_abs_cat()`.
 69
 70    Returns
 71    -------
 72    dict[str, DataFrame]
 73        A dictionary of DataFrames.
 74
 75    """
 76    # check/get the keyword arguments
 77    url = _get_url(url, cat)
 78    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
 79    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
 80    if verbose := args["verbose"]:
 81        print(f"grab_abs_url(): {url=}, {args=}")
 82
 83    # get the URL links to the relevant ABS data files on that webpage
 84    links = get_abs_links(url, **args)
 85    if not links:
 86        print(f"No data files found at URL: {url}")
 87        return {}  # return an empty Dictionary
 88
 89    # read the data files into a dictionary of DataFrames
 90    abs_dict: dict[str, DataFrame] = {}
 91
 92    # Process single file requests first
 93    abs_dict = _process_single_files(abs_dict, links, args, verbose=verbose)
 94    if abs_dict:  # If single file was found and processed, return it
 95        return abs_dict
 96
 97    # Process all files based on configuration
 98    return _process_all_files(abs_dict, links, args)
 99
100
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Load the contents of one ABS ZIP file stored on the local filesystem.

    A convenience wrapper for the uncommon case where the ZIP file has
    already been saved locally rather than being fetched from a URL.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    Raises
    ------
    OSError
        If the file at `zip_path` cannot be read (raised by
        `Path.read_bytes()`).

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_zip")  # kwargs merged with defaults

    # Path() accepts both str and Path, so no isinstance() branch is needed
    archive_bytes = Path(zip_path).read_bytes()
    return _process_zip({}, archive_bytes, **args)
131
132
133# --- private
def _process_single_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
    *,
    verbose: bool,
) -> dict[str, DataFrame]:
    """Load one specifically requested file, if any.

    The `single_excel_only` option is tried first, then `single_zip_only`.
    The first option that is set AND resolves to a link on the page is
    loaded and its result returned immediately.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        links: Dictionary mapping file types to lists of URLs
        args: Configuration arguments from user
        verbose: Whether to print debug information while searching

    Returns:
        dict[str, DataFrame]: The loader's result for the matched file, or
            `abs_dict` unchanged when no option is set or no link matches

    """
    # (option name, file extension, loader), in priority order
    single_file_options = (
        ("single_excel_only", EXCEL_EXTENSION, _add_excel),
        ("single_zip_only", ZIP_EXTENSION, _add_zip),
    )
    for option, extension, loader in single_file_options:
        wanted = args[option]
        if not wanted:
            continue
        link = _find_url(links, extension, wanted, verbose=verbose)
        if link:
            return loader(abs_dict, link, **args)

    return abs_dict
153
154
def _process_all_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Load every linked file that the configuration allows.

    File types are visited in FILE_EXTENSIONS_PROCESSING_ORDER (ZIP before
    Excel). ZIP links are loaded when `get_zip` is set; Excel links are
    loaded when `_should_process_excel_file()` says so.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        links: Dictionary mapping file types to lists of URLs
        args: Configuration arguments from user

    Returns:
        dict[str, DataFrame]: The dictionary after all permitted files
            have been added

    """
    for extension in FILE_EXTENSIONS_PROCESSING_ORDER:
        candidates = links.get(extension, [])
        if not candidates:
            continue

        # The decision depends only on args and links, not on the individual
        # link, so it is made once per extension rather than once per link.
        if extension == ZIP_EXTENSION:
            wanted, loader = args["get_zip"], _add_zip
        elif extension == EXCEL_EXTENSION:
            wanted, loader = _should_process_excel_file(args, links), _add_excel
        else:
            continue  # no loader exists for this extension
        if not wanted:
            continue

        for link in candidates:
            abs_dict = loader(abs_dict, link, **args)

    return abs_dict
168
169
def _should_process_excel_file(args: dict[str, Any], links: dict[str, list[str]]) -> bool:
    """Decide whether the page's Excel links should be loaded.

    The answer is True when either:
    1. `get_excel` is set, or
    2. `get_excel_if_no_zip` is set and ZIP data will not be loaded, i.e.
       `get_zip` is off or the page has no ZIP links.

    Args:
        args: Configuration arguments from user
        links: Dictionary of available file links by type

    Returns:
        bool: True if Excel files should be processed

    """
    # An explicit request always wins
    if args["get_excel"]:
        return True

    # Without the fallback option there is nothing left to consider
    if not args["get_excel_if_no_zip"]:
        return False

    # Fallback: use Excel unless ZIP loading is both enabled and possible
    zip_will_be_loaded = args["get_zip"] and links.get(ZIP_EXTENSION, [])
    return not zip_will_be_loaded
198
199
def _find_url(links: dict[str, list[str]], targ_type: str, target: str, *, verbose: bool = False) -> str:
    """Look up the link for one named file of a given type.

    A link matches when it ends with `target` immediately followed by
    `targ_type`. The first matching link, in list order, is returned.

    Args:
        links: Dictionary mapping file types to lists of URLs
        targ_type: Target file extension (e.g., '.xlsx', '.zip')
        target: Target filename without extension
        verbose: Whether to print debug information

    Returns:
        str: The matching URL if found, otherwise an empty string

    """
    candidates = links.get(targ_type, [])
    if not candidates:
        return ""

    suffix = f"{target}{targ_type}"
    if verbose:
        print(f"_find_url(): looking for {suffix} in {candidates}.")

    # first link with the wanted suffix, or "" when none has it
    return next((link for link in candidates if link.endswith(suffix)), "")
223
224
def _get_url(url: str, cat: str) -> str:
    """Resolve the landing page URL from an explicit URL or a catalogue number.

    A non-empty `url` is returned as is and the catalogue is not consulted.
    Otherwise `cat` is looked up in the table returned by `abs_catalogue()`
    and the value in its "URL" column is used.

    Args:
        url: The URL to use if provided
        cat: The catalogue number to use if URL is not provided

    Returns:
        str: The URL to use for data retrieval

    Raises:
        ValueError: If neither argument is provided, if the catalogue
            number cannot be resolved to a URL, or if retrieving or
            reading the catalogue fails

    """
    # An explicit URL always wins
    if url:
        return url

    if not cat:
        raise ValueError("_get_url(): no URL or valid catalogue number provided.")

    try:
        catalogue = abs_catalogue()
        if cat in catalogue.index:
            url = str(catalogue.loc[cat, "URL"])
    except (KeyError, IndexError) as e:
        raise ValueError(f"Catalogue number '{cat}' not found in ABS catalogue: {e}") from e
    except (ConnectionError, TimeoutError) as e:
        raise ValueError(f"Network error retrieving catalogue for '{cat}': {e}") from e
    except (ValueError, TypeError) as e:
        raise ValueError(f"Invalid catalogue data for '{cat}': {e}") from e

    # The lookup ran without error but produced no URL
    if not url:
        raise ValueError(
            f"Catalogue number '{cat}' not found in the ABS Time Series Directory. "
            f"This may be a discontinued series. If you know the ABS landing page URL, "
            f"you can use: read_abs_cat(cat='{cat}', url='https://www.abs.gov.au/...')"
        )

    return url
267
268
def _process_zip(
    abs_dict: dict[str, DataFrame],
    zip_contents: bytes,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read a ZIP archive held in memory and load the files inside it.

    Every file in the archive is handed to _add_excel_bytes(), which adds
    one DataFrame per non-empty sheet. Directory entries are skipped.

    Bytes that are not a ZIP archive (for example an HTML error page
    returned in place of the download) are reported with a printed message
    and otherwise ignored, in the same way _add_excel_bytes() treats bytes
    that are not a workbook, so that one bad download does not discard the
    DataFrames already collected.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        zip_contents: Raw bytes of the ZIP archive
        **args: Processing arguments, passed on to _add_excel_bytes()

    Returns:
        dict[str, DataFrame]: Updated dictionary; unchanged when the bytes
            are empty or cannot be opened as a ZIP archive

    """
    if not zip_contents:
        return abs_dict

    # Keep the try body to the one call that detects a non-ZIP payload
    try:
        archive = zipfile.ZipFile(BytesIO(zip_contents))
    except zipfile.BadZipFile as e:
        print(f"_process_zip(): could not open the ZIP file contents.\n{e}")
        return abs_dict

    with archive as zipped:
        for element in zipped.infolist():
            if element.is_dir():
                continue  # a folder entry carries no data
            table_name = get_table_name(url=element.filename)
            # read via the ZipInfo so the exact archive entry is used
            raw_bytes = zipped.read(element)
            abs_dict = _add_excel_bytes(abs_dict, raw_bytes, table_name, args)

    return abs_dict
286
287
def _add_zip(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Fetch the ZIP file at a URL and load the files inside it.

    The bytes are obtained with get_file() and then handed to
    _process_zip(), which adds the archive's contents to the dictionary.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the ZIP file
        **args: Additional arguments passed to both get_file() and
            _process_zip()

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from ZIP contents

    """
    return _process_zip(abs_dict, get_file(link, **args), **args)
310
311
def _add_excel_bytes(
    abs_dict: dict[str, DataFrame],
    raw_bytes: bytes,
    name: str,
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Convert Excel file bytes to DataFrames and add them to the dictionary.

    Each sheet with at least one row is parsed without any further
    treatment and stored under the key f"{name}{HYPHEN}{sheet_name}".
    Bytes that cannot be opened as a workbook are reported with a printed
    message and skipped; `abs_dict` is modified in place.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        raw_bytes: Bytes content of the Excel file
        name: Base name for the Excel file
        args: Dictionary of processing arguments (only "verbose" is read)

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from Excel sheets

    """
    verbose = args.get("verbose", False)

    if not raw_bytes:
        if verbose:
            print("_add_excel_bytes(): the raw bytes are empty.")
        return abs_dict

    # convert the raw bytes into a pandas ExcelFile
    try:
        excel = pd.ExcelFile(BytesIO(raw_bytes))
    except (ValueError, TypeError, zipfile.BadZipFile) as e:
        # BadZipFile: the bytes start like an .xlsx (a ZIP container) but the
        # container is truncated or corrupt, e.g. an interrupted download.
        message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}"
        print(message)
        return abs_dict

    # the context manager closes the workbook once every sheet is parsed
    with excel:
        for sheet_name in excel.sheet_names:
            # grab and go - no treatment of the data
            sheet_data = excel.parse(sheet_name)
            # explicit len(): the truth value of a DataFrame is ambiguous
            if len(sheet_data) == 0:
                if verbose:
                    print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.")
                continue
            abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data

    return abs_dict
362
363
def _add_excel(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Fetch the Excel file at a URL and add its sheets to the dictionary.

    The table name is derived from the link with get_table_name(). If
    sheets for that table are already present (typically because the same
    workbook was loaded from a ZIP file earlier), nothing is downloaded and
    the dictionary is returned as is.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the Excel file
        **args: Additional arguments passed to get_file() and
            _add_excel_bytes()

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from
            the Excel file's sheets

    """
    name = get_table_name(link)

    # _add_excel_bytes() stores sheets under f"{name}{HYPHEN}{sheet_name}",
    # never under the bare table name, so the table counts as "already
    # loaded" when any key carries that prefix. The bare-name test is kept
    # for dictionaries that were populated some other way.
    key_prefix = f"{name}{HYPHEN}"
    if name in abs_dict or any(key.startswith(key_prefix) for key in abs_dict):
        return abs_dict

    raw_bytes = get_file(link, **args)

    return _add_excel_bytes(abs_dict, raw_bytes, name, args)
384
385
386# --- main ---
if __name__ == "__main__":

    def simple_test() -> None:
        """Exercise grab_abs_zip() and grab_abs_url() and print the outcomes.

        Each case prints a banner, then either the number and names of the
        datasets found, a "no data" notice, or the exception that was raised.
        """

        def report(banner: str, fetch: Any) -> None:
            """Run `fetch()` (a zero-argument callable) and print its outcome."""
            print(banner)
            try:
                data_dict = fetch()
                print("---")
                if data_dict:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
                else:
                    print("PROBLEM -- No data found.")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        def test(name: str, **kwargs: Any) -> None:  # ReadArgs compatible
            report(f"TEST -- {name}", lambda: grab_abs_url(**kwargs))

        def test_zip(zip_path: Path, **kwargs: Any) -> None:  # ReadArgs compatible
            report(
                f"TEST -- grab_abs_zip() with {zip_path}",
                lambda: grab_abs_zip(zip_path, **kwargs),
            )

        # --- grab_abs_zip() test: a ZIP file on the local filesystem
        test_zip(
            Path(".test-data/Qrtly-CPI-Time-series-spreadsheets-all.zip"),
            verbose=True,
        )

        # --- various grab_abs_url() tests

        test(
            "1 -- grab a single zip file",
            cat="6291.0.55.001",
            single_zip_only="p6291_all_quarterly_spreadsheets",
            get_zip=True,
            verbose=True,
        )

        test(
            "2 -- grab a single Excel file",
            cat="6202.0",
            get_excel=True,
            single_excel_only="6202001",
            verbose=False,
        )

        # 3 -- grab the whole shebang
        landing_pages = [
            "https://www.abs.gov.au/statistics/labour/jobs/weekly-payroll-jobs/latest-release",
            "https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/dec-2023",
        ]
        for i, url_ in enumerate(landing_pages):
            test(f"3.{i} -- grab the whole shebang {url_}", url=url_, verbose=True)

    simple_test()
ZIP_EXTENSION = '.zip'
EXCEL_EXTENSION = '.xlsx'
FILE_EXTENSIONS_PROCESSING_ORDER = ('.zip', '.xlsx')
EMPTY_BYTES_LENGTH = 0
@cache
def grab_abs_url( cat: str = '', url: str = '', **kwargs: Unpack[readabs.ReadArgs]) -> dict[str, pandas.DataFrame]:
@cache  # memoise: repeated identical requests skip the download and parsing
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Load every Excel sheet linked from an ABS landing page into DataFrames.

    The page is scanned for links to Excel and ZIP files. Each sheet of each
    Excel file becomes one DataFrame; ZIP files are opened and the files
    inside them are treated the same way. Everything is collected into a
    single dictionary.

    `read_abs_cat()` and `read_abs_series()` remain the preferred way to read
    ABS data. Use this function when the data is not in the ABS catalogue,
    is not a timeseries, or lives on a specific ABS landing page.

    Parameters
    ----------
    cat : str = ""
        An ABS Catalogue number. Only consulted when `url` is empty, in
        which case it is used to look up the landing page URL.

    url : str = ""
        A URL for an ABS Catalogue landing page. Takes precedence over
        `cat` when both are supplied.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames. Empty when the page yields no links.

    Raises
    ------
    ValueError
        If neither a URL nor a resolvable catalogue number is provided.

    Notes
    -----
    The function is wrapped in `functools.cache`, so all argument values
    must be hashable and a repeated call with the same arguments returns
    the very same dictionary object as the first call.

    If a single file is requested (`single_excel_only` / `single_zip_only`)
    but nothing is loaded from it, processing falls through to the general
    `get_zip` / `get_excel` handling.

    """
    # resolve the landing page and normalise the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_url")  # kwargs merged with defaults
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # discover the data file links on the page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}

    # a successful single-file request short-circuits everything else
    found = _process_single_files({}, links, args, verbose=verbose)
    if found:
        return found

    # otherwise load whatever the get_zip / get_excel settings allow
    return _process_all_files(found, links, args)

For a given URL, extract the data from the Excel and ZIP file links found on that page.

The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.

The preferred mechanism for reading ABS data is to use the read_abs_cat() or read_abs_series() functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.

Parameters

url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.

cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.

**kwargs : Unpack[ReadArgs] Accepts the same keyword arguments as read_abs_cat().

Returns

dict[str, DataFrame] A dictionary of DataFrames.

def grab_abs_zip( zip_path: pathlib.Path | str, **kwargs: Unpack[readabs.ReadArgs]) -> dict[str, pandas.DataFrame]:
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Load the contents of one ABS ZIP file stored on the local filesystem.

    A convenience wrapper for the uncommon case where the ZIP file has
    already been saved locally rather than being fetched from a URL.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    Raises
    ------
    OSError
        If the file at `zip_path` cannot be read (raised by
        `Path.read_bytes()`).

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_zip")  # kwargs merged with defaults

    # Path() accepts both str and Path, so no isinstance() branch is needed
    archive_bytes = Path(zip_path).read_bytes()
    return _process_zip({}, archive_bytes, **args)

Grab and process a single ABS ZIP file from a file system location.

This is a convenience function that opens an ABS ZIP file from a local filesystem path. It is expected to be used rarely.

Parameters

zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.

**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.

Returns

dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.