readabs.abs_catalogue

Catalogue map for ABS data.

  1"""Catalogue map for ABS data."""
  2
  3from functools import cache
  4from io import StringIO
  5
  6from pandas import DataFrame, Index, Series, read_html
  7
  8from readabs.download_cache import CacheError, HttpError, get_file
  9
 10# Constants
 11ABS_CATALOGUE_URL = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
 12ABS_STATISTICS_ROOT = "https://www.abs.gov.au/statistics/"
 13EXPECTED_COLUMNS = ["Theme", "Parent Topic", "Topic"]
 14CATALOGUE_INDEX_NAME = "Catalogue ID"
 15CEASED_MARKER = "Ceased"
 16DEFAULT_ENCODING = "utf-8"
 17
 18
 19class CatalogueError(Exception):
 20    """Error processing ABS catalogue data."""
 21
 22
 23@cache
 24def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
 25    """Return a DataFrame of ABS Catalogue numbers.
 26
 27    Downloads catalogue data from the ABS website on first call and caches
 28    for future use. The returned DataFrame contains catalogue numbers with
 29    their topics, themes, URLs, and status.
 30
 31    Parameters
 32    ----------
 33    cache_only : bool, default False
 34        If True, only use cached data and don't attempt to download.
 35    verbose : bool, default False
 36        If True, print progress messages.
 37
 38    Returns
 39    -------
 40    DataFrame
 41        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
 42        and index of catalogue IDs.
 43
 44    Raises
 45    ------
 46    CatalogueError
 47        If the catalogue data cannot be retrieved or parsed.
 48    HttpError
 49        If there's a network error downloading the catalogue.
 50    CacheError
 51        If cache_only=True but no cached data is available.
 52
 53    Example
 54    -------
 55    >>> import readabs as ra
 56    >>> catalogue = ra.abs_catalogue()
 57    >>> print(catalogue.head())
 58
 59    """
 60    try:
 61        # Download ABS catalogue page
 62        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)
 63
 64        if not abs_bytes:
 65            raise CatalogueError("No data retrieved from ABS catalogue URL")
 66
 67        # Parse HTML content
 68        try:
 69            html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")
 70        except UnicodeDecodeError as e:
 71            raise CatalogueError(f"Failed to decode HTML content: {e}") from e
 72
 73        # Extract tables from HTML
 74        try:
 75            tables = read_html(StringIO(html_content), extract_links="body")
 76            if not tables:
 77                raise CatalogueError("No tables found in HTML content")
 78            links = tables[-1]  # Get the last table
 79        except (ValueError, IndexError) as e:
 80            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e
 81
 82        # Validate required columns exist
 83        required_cols = ["Catalogue number", "Topic"]
 84        missing_cols = [col for col in required_cols if col not in links.columns]
 85        if missing_cols:
 86            raise CatalogueError(f"Missing required columns: {missing_cols}")
 87
 88        # Extract catalogue numbers and URLs
 89        try:
 90            cats = links["Catalogue number"].apply(Series)[0]
 91            urls = links["Topic"].apply(Series)[1]
 92        except (KeyError, IndexError) as e:
 93            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e
 94
 95        # Process topic URLs to create hierarchical structure
 96        url_snippets = _process_topic_urls(urls)
 97
 98        # Create main DataFrame with hierarchical topic structure
 99        frame = _create_topic_frame(url_snippets)
100        frame["URL"] = urls
101
102        # Align catalogue numbers with processed frame
103        cats = cats[frame.index]
104
105        # Process catalogue status (active vs ceased)
106        cat_index, status = _process_catalogue_status(cats)
107
108        frame["Status"] = status
109        frame.index = Index(cat_index)
110        frame.index.name = CATALOGUE_INDEX_NAME
111
112    except (HttpError, CacheError, ValueError) as e:
113        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e
114
115    return frame
116
117
def _process_topic_urls(urls: Series) -> Series:
    """Convert topic URLs into title-cased, slash-separated topic paths.

    Parameters
    ----------
    urls : Series
        Topic hyperlinks, one per catalogue row.

    Returns
    -------
    Series
        The URL with ``ABS_STATISTICS_ROOT`` removed, hyphens replaced by
        spaces and each word title-cased. Rows whose text still contains
        "http" after the root is removed are dropped. Rows with a missing
        URL are retained (``na=False`` treats them as not matching).
    """
    # Remove root URL prefix (literal match, not a regular expression)
    snippets = urls.str.replace(ABS_STATISTICS_ROOT, "", regex=False)

    # Anything still containing "http" is not under the statistics root
    is_foreign = snippets.str.contains("http", na=False, regex=False)
    valid_snippets = snippets[~is_foreign]

    # regex=False is explicit so the hyphen is always treated literally,
    # matching the other replace calls in this module.
    return valid_snippets.str.replace("-", " ", regex=False).str.title()
126
127
def _create_topic_frame(snippets: Series) -> DataFrame:
    """Create DataFrame with topic hierarchy from URL snippets.

    Parameters
    ----------
    snippets : Series
        Slash-separated topic paths.

    Returns
    -------
    DataFrame
        One row per snippet, indexed like ``snippets``, with the columns
        named in ``EXPECTED_COLUMNS`` holding the leading path levels.
        Deeper levels are discarded; absent levels are left as missing
        values.
    """
    # Split URL paths into hierarchical components
    levels = snippets.str.split("/", expand=True)

    # reindex (rather than positional slicing) both trims surplus levels and
    # pads absent ones, so the column assignment below cannot fail with a
    # length mismatch when the input is empty or every path is shallow.
    frame = levels.reindex(columns=range(len(EXPECTED_COLUMNS)))
    frame.columns = Index(EXPECTED_COLUMNS)

    return frame
135
136
def _process_catalogue_status(cats: Series) -> tuple[Series, Series]:
    """Split raw catalogue-number text into clean IDs and a status label.

    Returns a ``(cat_index, status)`` pair, both indexed like ``cats``:
    the catalogue text with ``CEASED_MARKER`` removed and surrounding
    whitespace stripped, and "Ceased" or "Active" depending on whether
    the marker was present.
    """
    # Flag the rows carrying the ceased marker (missing values count as active)
    is_ceased = cats.str.contains(CEASED_MARKER, na=False)

    # Everything starts as "Active"; flagged rows are switched to "Ceased"
    labels = Series("Active", index=cats.index).mask(is_ceased, "Ceased")

    # Drop the marker text, then tidy up leftover whitespace
    without_marker = cats.str.replace(CEASED_MARKER, "", regex=False)
    clean_ids = without_marker.str.strip()

    return clean_ids, labels
148
149
if __name__ == "__main__":
    # Manual check: build the catalogue and print it to stdout.
    catalogue = abs_catalogue()
    print(catalogue)
# Page downloaded by abs_catalogue(); its last HTML table is the catalogue.
ABS_CATALOGUE_URL = (
    "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
)
# Common prefix removed from topic links to leave only the topic path.
ABS_STATISTICS_ROOT = "https://www.abs.gov.au/statistics/"
# Column labels given to the first three levels of the topic path.
EXPECTED_COLUMNS = ["Theme", "Parent Topic", "Topic"]
# Name assigned to the index of the returned catalogue frame.
CATALOGUE_INDEX_NAME = "Catalogue ID"
# Text inside a catalogue-number cell that flags the entry as ceased.
CEASED_MARKER = "Ceased"
# Encoding used to decode the downloaded page bytes.
DEFAULT_ENCODING = "utf-8"
class CatalogueError(builtins.Exception):
class CatalogueError(Exception):
    """Raised when the ABS catalogue cannot be retrieved or parsed."""

Error processing ABS catalogue data.

@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
 24@cache
 25def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
 26    """Return a DataFrame of ABS Catalogue numbers.
 27
 28    Downloads catalogue data from the ABS website on first call and caches
 29    for future use. The returned DataFrame contains catalogue numbers with
 30    their topics, themes, URLs, and status.
 31
 32    Parameters
 33    ----------
 34    cache_only : bool, default False
 35        If True, only use cached data and don't attempt to download.
 36    verbose : bool, default False
 37        If True, print progress messages.
 38
 39    Returns
 40    -------
 41    DataFrame
 42        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
 43        and index of catalogue IDs.
 44
 45    Raises
 46    ------
 47    CatalogueError
 48        If the catalogue data cannot be retrieved or parsed.
 49    HttpError
 50        If there's a network error downloading the catalogue.
 51    CacheError
 52        If cache_only=True but no cached data is available.
 53
 54    Example
 55    -------
 56    >>> import readabs as ra
 57    >>> catalogue = ra.abs_catalogue()
 58    >>> print(catalogue.head())
 59
 60    """
 61    try:
 62        # Download ABS catalogue page
 63        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)
 64
 65        if not abs_bytes:
 66            raise CatalogueError("No data retrieved from ABS catalogue URL")
 67
 68        # Parse HTML content
 69        try:
 70            html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")
 71        except UnicodeDecodeError as e:
 72            raise CatalogueError(f"Failed to decode HTML content: {e}") from e
 73
 74        # Extract tables from HTML
 75        try:
 76            tables = read_html(StringIO(html_content), extract_links="body")
 77            if not tables:
 78                raise CatalogueError("No tables found in HTML content")
 79            links = tables[-1]  # Get the last table
 80        except (ValueError, IndexError) as e:
 81            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e
 82
 83        # Validate required columns exist
 84        required_cols = ["Catalogue number", "Topic"]
 85        missing_cols = [col for col in required_cols if col not in links.columns]
 86        if missing_cols:
 87            raise CatalogueError(f"Missing required columns: {missing_cols}")
 88
 89        # Extract catalogue numbers and URLs
 90        try:
 91            cats = links["Catalogue number"].apply(Series)[0]
 92            urls = links["Topic"].apply(Series)[1]
 93        except (KeyError, IndexError) as e:
 94            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e
 95
 96        # Process topic URLs to create hierarchical structure
 97        url_snippets = _process_topic_urls(urls)
 98
 99        # Create main DataFrame with hierarchical topic structure
100        frame = _create_topic_frame(url_snippets)
101        frame["URL"] = urls
102
103        # Align catalogue numbers with processed frame
104        cats = cats[frame.index]
105
106        # Process catalogue status (active vs ceased)
107        cat_index, status = _process_catalogue_status(cats)
108
109        frame["Status"] = status
110        frame.index = Index(cat_index)
111        frame.index.name = CATALOGUE_INDEX_NAME
112
113    except (HttpError, CacheError, ValueError) as e:
114        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e
115
116    return frame

Return a DataFrame of ABS Catalogue numbers.

Downloads catalogue data from the ABS website on first call and caches for future use. The returned DataFrame contains catalogue numbers with their topics, themes, URLs, and status.

Parameters

cache_only : bool, default False
    If True, only use cached data and don't attempt to download.
verbose : bool, default False
    If True, print progress messages.

Returns

DataFrame
    DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status'] and index of catalogue IDs.

Raises

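CatalogueError
    If the catalogue data cannot be retrieved or parsed. A network error downloading the catalogue (HttpError), or cache_only=True with no cached data available (CacheError), is caught inside the function and re-raised as CatalogueError with the original exception chained as its cause.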

Example

>>> import readabs as ra
>>> catalogue = ra.abs_catalogue()
>>> print(catalogue.head())