readabs.abs_catalogue
Catalogue map for ABS data.
"""Catalogue map for ABS data."""

from functools import cache
from io import StringIO

from pandas import DataFrame, Index, Series, read_html

from readabs.download_cache import CacheError, HttpError, get_file

# Constants
ABS_CATALOGUE_URL = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
ABS_STATISTICS_ROOT = "https://www.abs.gov.au/statistics/"
EXPECTED_COLUMNS = ["Theme", "Parent Topic", "Topic"]
CATALOGUE_INDEX_NAME = "Catalogue ID"
CEASED_MARKER = "Ceased"
DEFAULT_ENCODING = "utf-8"


class CatalogueError(Exception):
    """Error processing ABS catalogue data."""


@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers.

    Fetches the ABS time-series directory page through ``get_file``, parses
    the last HTML table on that page, and builds a frame of catalogue
    numbers with their topics, themes, URLs, and status.

    The result is memoised with ``functools.cache``: repeated calls with the
    same arguments return the *same* DataFrame object, so take a ``.copy()``
    before mutating it.

    Parameters
    ----------
    cache_only : bool, default False
        If True, only use cached data and don't attempt to download.
    verbose : bool, default False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
        and index of catalogue IDs.

    Raises
    ------
    CatalogueError
        If the catalogue data cannot be retrieved or parsed. ``HttpError``,
        ``CacheError`` and ``ValueError`` raised while downloading or
        processing are caught here and re-raised as ``CatalogueError`` with
        the original exception chained as the cause.

    Example
    -------
    >>> import readabs as ra
    >>> catalogue = ra.abs_catalogue()
    >>> print(catalogue.head())

    """
    try:
        # Download ABS catalogue page
        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)

        if not abs_bytes:
            raise CatalogueError("No data retrieved from ABS catalogue URL")

        # errors="replace" substitutes U+FFFD for undecodable bytes, so this
        # decode cannot raise UnicodeDecodeError and needs no handler.
        html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")

        # Extract tables from HTML; with extract_links="body" every body
        # cell is parsed as a (text, link) pair.
        try:
            tables = read_html(StringIO(html_content), extract_links="body")
            if not tables:
                raise CatalogueError("No tables found in HTML content")
            links = tables[-1]  # Get the last table
        except (ValueError, IndexError) as e:
            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e

        # Validate required columns exist
        required_cols = ["Catalogue number", "Topic"]
        missing_cols = [col for col in required_cols if col not in links.columns]
        if missing_cols:
            raise CatalogueError(f"Missing required columns: {missing_cols}")

        # Extract catalogue numbers (cell text) and URLs (cell link)
        try:
            cats = links["Catalogue number"].apply(Series)[0]
            urls = links["Topic"].apply(Series)[1]
        except (KeyError, IndexError) as e:
            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e

        # Process topic URLs to create hierarchical structure
        url_snippets = _process_topic_urls(urls)

        # Create main DataFrame with hierarchical topic structure
        frame = _create_topic_frame(url_snippets)
        frame["URL"] = urls  # aligned on the row index

        # Keep only the catalogue numbers for rows that survived URL filtering
        cats = cats[frame.index]

        # Process catalogue status (active vs ceased)
        cat_index, status = _process_catalogue_status(cats)

        frame["Status"] = status
        frame.index = Index(cat_index)
        frame.index.name = CATALOGUE_INDEX_NAME

    except (HttpError, CacheError, ValueError) as e:
        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e

    return frame


def _process_topic_urls(urls: Series) -> Series:
    """Turn topic URLs into title-cased path snippets.

    Strips the ``ABS_STATISTICS_ROOT`` prefix, drops entries that still
    contain "http" afterwards (i.e. URLs outside the statistics root), then
    replaces hyphens with spaces and title-cases the remainder. Missing
    values are kept as-is (``na=False`` in the filter).
    """
    # Remove root URL prefix
    snippets = urls.str.replace(ABS_STATISTICS_ROOT, "", regex=False)

    # Filter out URLs that were not under the statistics root
    valid_snippets = snippets[~snippets.str.contains("http", na=False)]

    # Literal replacement, stated explicitly to match the call above
    return valid_snippets.str.replace("-", " ", regex=False).str.title()


def _create_topic_frame(snippets: Series) -> DataFrame:
    """Create DataFrame with topic hierarchy from URL snippets.

    Each snippet is split on "/" and the first ``len(EXPECTED_COLUMNS)``
    components become the columns named in ``EXPECTED_COLUMNS``. When the
    split yields fewer components (shallow paths or empty input) the missing
    levels are padded with NaN instead of failing on a column-length
    mismatch.
    """
    parts = snippets.str.split("/", expand=True)

    # reindex both truncates extra levels and pads absent ones
    frame = parts.reindex(columns=range(len(EXPECTED_COLUMNS)))
    frame.columns = Index(EXPECTED_COLUMNS)

    return frame


def _process_catalogue_status(cats: Series) -> tuple[Series, Series]:
    """Split raw catalogue labels into clean IDs and a status column.

    Returns a ``(cat_index, status)`` pair: ``cat_index`` is ``cats`` with
    ``CEASED_MARKER`` removed and whitespace stripped; ``status`` is
    "Ceased" where the marker was present and "Active" otherwise.
    """
    # Extract clean catalogue IDs (remove ceased marker)
    cat_index = cats.str.replace(CEASED_MARKER, "", regex=False).str.strip()

    # Determine status based on presence of ceased marker
    status = Series("Active", index=cats.index)
    ceased_mask = cats.str.contains(CEASED_MARKER, na=False)
    status.loc[ceased_mask] = "Ceased"

    return cat_index, status


if __name__ == "__main__":
    print(abs_catalogue())
Error processing ABS catalogue data.
@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers.

    Fetches the ABS time-series directory page through ``get_file``, parses
    the last HTML table on that page, and builds a frame of catalogue
    numbers with their topics, themes, URLs, and status.

    The result is memoised with ``functools.cache``: repeated calls with the
    same arguments return the *same* DataFrame object, so take a ``.copy()``
    before mutating it.

    Parameters
    ----------
    cache_only : bool, default False
        If True, only use cached data and don't attempt to download.
    verbose : bool, default False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
        and index of catalogue IDs.

    Raises
    ------
    CatalogueError
        If the catalogue data cannot be retrieved or parsed. ``HttpError``,
        ``CacheError`` and ``ValueError`` raised while downloading or
        processing are caught here and re-raised as ``CatalogueError`` with
        the original exception chained as the cause.

    Example
    -------
    >>> import readabs as ra
    >>> catalogue = ra.abs_catalogue()
    >>> print(catalogue.head())

    """
    try:
        # Download ABS catalogue page
        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)

        if not abs_bytes:
            raise CatalogueError("No data retrieved from ABS catalogue URL")

        # errors="replace" substitutes U+FFFD for undecodable bytes, so this
        # decode cannot raise UnicodeDecodeError and needs no handler.
        html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")

        # Extract tables from HTML; with extract_links="body" every body
        # cell is parsed as a (text, link) pair.
        try:
            tables = read_html(StringIO(html_content), extract_links="body")
            if not tables:
                raise CatalogueError("No tables found in HTML content")
            links = tables[-1]  # Get the last table
        except (ValueError, IndexError) as e:
            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e

        # Validate required columns exist
        required_cols = ["Catalogue number", "Topic"]
        missing_cols = [col for col in required_cols if col not in links.columns]
        if missing_cols:
            raise CatalogueError(f"Missing required columns: {missing_cols}")

        # Extract catalogue numbers (cell text) and URLs (cell link)
        try:
            cats = links["Catalogue number"].apply(Series)[0]
            urls = links["Topic"].apply(Series)[1]
        except (KeyError, IndexError) as e:
            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e

        # Process topic URLs to create hierarchical structure
        url_snippets = _process_topic_urls(urls)

        # Create main DataFrame with hierarchical topic structure
        frame = _create_topic_frame(url_snippets)
        frame["URL"] = urls  # aligned on the row index

        # Keep only the catalogue numbers for rows that survived URL filtering
        cats = cats[frame.index]

        # Process catalogue status (active vs ceased)
        cat_index, status = _process_catalogue_status(cats)

        frame["Status"] = status
        frame.index = Index(cat_index)
        frame.index.name = CATALOGUE_INDEX_NAME

    except (HttpError, CacheError, ValueError) as e:
        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e

    return frame
Return a DataFrame of ABS Catalogue numbers.
Downloads catalogue data from the ABS website on first call and caches for future use. The returned DataFrame contains catalogue numbers with their topics, themes, URLs, and status.
Parameters
cache_only : bool, default False If True, only use cached data and don't attempt to download. verbose : bool, default False If True, print progress messages.
Returns
DataFrame DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status'] and index of catalogue IDs.
Raises
CatalogueError If the catalogue data cannot be retrieved or parsed. A network error while downloading the catalogue (HttpError) or a missing cache entry when cache_only=True (CacheError) is caught and re-raised as CatalogueError, with the original exception attached as its cause.
Example
>>> import readabs as ra
>>> catalogue = ra.abs_catalogue()
>>> print(catalogue.head())