readabs
Package to download timeseries data from the Australian Bureau of Statistics and RBA.
This package provides functions to download and process timeseries data from the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).
"""Package to download timeseries data from the Australian Bureau of Statistics and RBA.

This package provides functions to download and process timeseries data from
the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).

The public names listed in ``__all__`` are imported here from the package's
submodules so they can be used directly from the top-level package.
"""

import importlib.metadata

# ABS catalogue and ABS metadata column names
from readabs.abs_catalogue import abs_catalogue
from readabs.abs_meta_data import metacol

# Datatype, ABS landing-page / ZIP readers and the ABS catalogue printer
from readabs.datatype import Datatype
from readabs.grab_abs_url import grab_abs_url, grab_abs_zip
from readabs.print_abs_catalogue import print_abs_catalogue

# RBA catalogue and metadata, followed by the remaining ABS/RBA readers,
# metadata search, recalibration and utility helpers (the imports are kept
# in alphabetical module order, so ABS and RBA modules are interleaved)
from readabs.rba_catalogue import print_rba_catalogue, rba_catalogue
from readabs.rba_meta_data import rba_metacol
from readabs.read_abs_by_desc import read_abs_by_desc
from readabs.read_abs_cat import read_abs_cat
from readabs.read_abs_series import read_abs_series
from readabs.read_rba_table import read_rba_ocr, read_rba_table
from readabs.read_support import ReadArgs
from readabs.recalibrate import recalibrate, recalibrate_value
from readabs.search_abs_meta import find_abs_id, search_abs_meta
from readabs.utilities import (
    annualise_percentages,
    annualise_rates,
    monthly_to_qtly,
    percent_change,
    qtly_to_monthly,
)

# Version and author information: the version is read from the installed
# distribution's metadata, looked up under this package's own name.
try:
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback when no installed metadata is found (development mode)
__author__ = "Bryan Palmer"


# Exposed functions and classes (kept in alphabetical order)
__all__ = (
    "Datatype",
    "ReadArgs",
    "abs_catalogue",
    "annualise_percentages",
    "annualise_rates",
    "find_abs_id",
    "grab_abs_url",
    "grab_abs_zip",
    "metacol",
    "monthly_to_qtly",
    "percent_change",
    "print_abs_catalogue",
    "print_rba_catalogue",
    "qtly_to_monthly",
    "rba_catalogue",
    "rba_metacol",
    "read_abs_by_desc",
    "read_abs_cat",
    "read_abs_series",
    "read_rba_ocr",
    "read_rba_table",
    "recalibrate",
    "recalibrate_value",
    "search_abs_meta",
)
__pdoc__ = {
    "download_cache": False,
    "get_abs_links": False,
    "read_support": False,
    "grab_abs_url": False,
}  # hide these submodules from the generated documentation
class ReadArgs(TypedDict):
    """Type definition for ABS data reading arguments.

    Every key is declared ``NotRequired``, so any subset of the keys
    (including none of them) is a valid ``ReadArgs``.

    NOTE(review): the meaning of each key is defined by the functions that
    consume these arguments, which are not visible in this class - confirm
    against those functions before relying on a particular key.
    """

    verbose: NotRequired[bool]
    ignore_errors: NotRequired[bool]
    get_zip: NotRequired[bool]
    get_excel_if_no_zip: NotRequired[bool]
    get_excel: NotRequired[bool]
    single_zip_only: NotRequired[str]
    single_excel_only: NotRequired[str]
    history: NotRequired[str]
    cache_only: NotRequired[bool]
    keep_non_ts: NotRequired[bool]
    zip_file: NotRequired[str]
    url: NotRequired[str]
Type definition for ABS data reading arguments.
@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers.

    Downloads catalogue data from the ABS website on first call and caches
    for future use. The returned DataFrame contains catalogue numbers with
    their topics, themes, URLs, and status.

    Because of the ``@cache`` decorator, repeat calls with the same
    arguments return the very same DataFrame object; take a ``.copy()``
    before modifying it.

    Parameters
    ----------
    cache_only : bool, default False
        If True, only use cached data and don't attempt to download.
    verbose : bool, default False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
        and index of catalogue IDs.

    Raises
    ------
    CatalogueError
        If the catalogue data cannot be retrieved or parsed. HttpError
        (network failure), CacheError (cache_only=True but nothing cached)
        and ValueError raised while building the catalogue are all caught
        and re-raised as CatalogueError, with the original exception
        chained as the cause.

    Example
    -------
    >>> import readabs as ra
    >>> catalogue = ra.abs_catalogue()
    >>> print(catalogue.head())

    """
    try:
        # Download ABS catalogue page
        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)

        if not abs_bytes:
            raise CatalogueError("No data retrieved from ABS catalogue URL")

        # errors="replace" substitutes undecodable bytes instead of raising,
        # so no UnicodeDecodeError handler is needed around this call.
        html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")

        # Extract tables from HTML; extract_links="body" turns each body
        # cell into a (text, link) pair, which is unpacked further below.
        try:
            tables = read_html(StringIO(html_content), extract_links="body")
            if not tables:
                raise CatalogueError("No tables found in HTML content")
            links = tables[-1]  # Get the last table
        except (ValueError, IndexError) as e:
            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e

        # Validate required columns exist
        required_cols = ["Catalogue number", "Topic"]
        missing_cols = [col for col in required_cols if col not in links.columns]
        if missing_cols:
            raise CatalogueError(f"Missing required columns: {missing_cols}")

        # Extract catalogue numbers (cell text) and URLs (cell link)
        try:
            cats = links["Catalogue number"].apply(Series)[0]
            urls = links["Topic"].apply(Series)[1]
        except (KeyError, IndexError) as e:
            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e

        # Process topic URLs to create hierarchical structure
        url_snippets = _process_topic_urls(urls)

        # Create main DataFrame with hierarchical topic structure
        frame = _create_topic_frame(url_snippets)
        frame["URL"] = urls

        # Align catalogue numbers with processed frame
        cats = cats[frame.index]

        # Process catalogue status (active vs ceased)
        cat_index, status = _process_catalogue_status(cats)

        frame["Status"] = status
        frame.index = Index(cat_index)
        frame.index.name = CATALOGUE_INDEX_NAME

    except (HttpError, CacheError, ValueError) as e:
        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e

    return frame
Return a DataFrame of ABS Catalogue numbers.
Downloads catalogue data from the ABS website on first call and caches for future use. The returned DataFrame contains catalogue numbers with their topics, themes, URLs, and status.
Parameters
cache_only : bool, default False If True, only use cached data and don't attempt to download. verbose : bool, default False If True, print progress messages.
Returns
DataFrame DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status'] and index of catalogue IDs.
Raises
CatalogueError If the catalogue data cannot be retrieved or parsed. A network error while downloading the catalogue (HttpError) and a missing cache entry when cache_only=True (CacheError) are caught and re-raised as CatalogueError, with the original exception chained as the cause.
Example
>>> import readabs as ra
>>> catalogue = ra.abs_catalogue()
>>> print(catalogue.head())
def annualise_percentages(data: DataT, *, periods_per_year: float) -> DataT:
    """Annualise a growth rate (expressed as a percentage) for a period.

    The percentages are divided by 100 to obtain rates, which are then
    passed to annualise_rates().

    Args:
        data : pandas Series or DataFrame - The growth rate (expressed as a
            percentage) to annualise. Note a growth percentage of 5% is a growth
            rate of 0.05.
        periods_per_year : int or float (required, keyword-only) - The number
            of periods in a year. For monthly data, this is 12.

    Returns:
        pandas Series or DataFrame - The annualised growth expressed as a percentage.
        For DataFrame input, the annualised growth rate is calculated for each column.

    Raises:
        InvalidParameterError - If periods_per_year is not a positive number
            (NaN is rejected as well).
        InvalidDataError - If data is not a Series or DataFrame, or if the
            calculation itself fails.

    """
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    # `not x > 0` (rather than `x <= 0`) also rejects NaN, for which every
    # ordering comparison is False.
    if not isinstance(periods_per_year, (int, float)) or not periods_per_year > 0:
        raise InvalidParameterError("periods_per_year must be a positive number")

    try:
        rates = data / 100.0
        return annualise_rates(rates, periods_per_year=periods_per_year)
    except Exception as e:
        raise InvalidDataError(f"Error annualising percentages: {e}") from e
Annualise a growth rate (expressed as a percentage) for a period.
Args: data : pandas Series or DataFrame - The growth rate (expressed as a percentage) to annualise. Note a growth percentage of 5% is a growth rate of 0.05. periods_per_year : int or float, required keyword-only argument (there is no default) - The number of periods in a year. For monthly data, this is 12.
Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage. For DataFrame input, the annualised growth rate is calculated for each column.
Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.
def annualise_rates(data: DataT, *, periods_per_year: float) -> DataT:
    """Annualise a growth rate for a period.

    Note: returns a percentage value (and not a rate)!

    The result is ``((1 + data) ** periods_per_year - 1) * 100``.

    Args:
        data : pandas Series or DataFrame - The growth rate to annualise.
            Note a growth rate of 0.05 is 5%.
        periods_per_year : int or float (required, keyword-only) - The number
            of periods in a year. For monthly data, this is 12.

    Returns:
        pandas Series or DataFrame - The annualised growth expressed as a percentage
        (not a rate). For DataFrame input, the annualised growth rate is
        calculated for each column.

    Raises:
        InvalidParameterError - If periods_per_year is not a positive number
            (NaN is rejected as well).
        InvalidDataError - If data is not a Series or DataFrame, or if the
            calculation itself fails.

    """
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    # `not x > 0` (rather than `x <= 0`) also rejects NaN, for which every
    # ordering comparison is False; a NaN exponent would otherwise silently
    # produce an all-NaN result.
    if not isinstance(periods_per_year, (int, float)) or not periods_per_year > 0:
        raise InvalidParameterError("periods_per_year must be a positive number")

    try:
        return (((1 + data) ** periods_per_year) - 1) * 100
    except Exception as e:
        raise InvalidDataError(f"Error annualising rates: {e}") from e
Annualise a growth rate for a period.
Note: returns a percentage value (and not a rate)!
Args: data : pandas Series or DataFrame - The growth rate to annualise. Note a growth rate of 0.05 is 5%. periods_per_year : int or float, required keyword-only argument (there is no default) - The number of periods in a year. For monthly data, this is 12.
Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage (not a rate). For DataFrame input, the annualised growth rate is calculated for each column.
Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate one ABS series in the metadata and return its identifiers.

    The search itself is delegated to search_abs_meta(); the first row of
    its result supplies the table, series id and units.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments, forwarded to search_abs_meta().
        validate_unique : bool = True
            Raise a ValueError if the search result is not a single
            unique match. Note: the default is True for safety. When it is
            False and several rows match, the first matching row is used.

    Returns
    -------
    tuple[str, str, str]
        A tuple of the table, series_id and units for the unique
        series_id that matches the search terms.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ; Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```

    """
    # Pop validate_unique so it is passed on exactly once, with True as the
    # default when the caller did not supply it.
    unique = kwargs.pop("validate_unique", True)
    matches = search_abs_meta(meta, search_terms, validate_unique=unique, **kwargs)
    first_match = matches.iloc[0]
    return first_match[mc.table], first_match[mc.id], first_match[mc.unit]
Find a unique ABS series identifier in the ABS metadata.
Parameters
meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.
Returns
tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.
Metacol
Because the meta data is a DataFrame, the columns can be referenced by either
their full textual name, or by the short name defined in the metacol object.
For example, if metacol is imported as mc, to refer to the
Data Item Description column, the user can refer to it as mc.did.
Example
from readabs import metacol as mc # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0" # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
"Employed total ; Persons ;": mc.did,
"Seasonally Adjusted": mc.stype,
"6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Read the Excel and ZIP files linked from an ABS web page into DataFrames.

    Each sheet of each Excel file becomes its own DataFrame; ZIP files are
    searched for Excel files, which are converted the same way. Everything
    is returned in one dictionary of DataFrames.

    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
    or `read_abs_series()` functions. This function is provided for those
    cases where the data is not available in the ABS catalogue, where the
    data is not a timeseries, or where the user wants to extract data from
    a specific ABS landing page.

    Parameters
    ----------
    cat : str = ""
        An ABS Catalogue number. If provided, and the URL is not
        provided, then the Catalogue number will be used to get the URL.

    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or
        a catalogue number must be provided. If both are provided, the
        URL will be used.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames. It is empty when no data files are
        found on the page.

    """
    # resolve the page to read, then check/get the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # get the URL links to the relevant ABS data files on that webpage
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # return an empty Dictionary

    # A single-file request takes priority: if it produced data, that is
    # the whole answer.
    found: dict[str, DataFrame] = _process_single_files({}, links, args, verbose=verbose)
    if found:
        return found

    # Otherwise process all files based on configuration
    return _process_all_files(found, links, args)
For a given URL, extract the data from the Excel and ZIP file links found on that page.
The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.
The preferred mechanism for reading ABS data is to use the read_abs_cat()
or read_abs_series() functions. This function is provided for those
cases where the data is not available in the ABS catalogue, where the
data is not a timeseries, or where the user wants to extract data from
a specific ABS landing page.
Parameters
url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.
cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.
**kwargs : Unpack[ReadArgs]
Accepts the same keyword arguments as read_abs_cat().
Returns
dict[str, DataFrame] A dictionary of DataFrames.
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Read an ABS ZIP file stored on the local filesystem into DataFrames.

    This is a convenience function for a ZIP file that is already on disk.
    It is expected to be used rarely.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # get the valid kwargs

    # accept either a Path or a plain string location
    archive: Path = zip_path if isinstance(zip_path, Path) else Path(zip_path)
    return _process_zip({}, archive.read_bytes(), **args)
Grab and process a single ABS ZIP file from a file system location.
This is a convenience function that opens an ABS ZIP file from a local filesystem path. It is expected to be used rarely.
Parameters
zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.
**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.
Returns
dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.
def monthly_to_qtly(data: DataT, q_ending: str = "DEC", f: str = "mean") -> DataT:
    """Convert monthly data to quarterly data.

    This is done by taking the mean (or sum) of the three months in each quarter.
    Ignore quarters with less than or more than three months data. Drop NA items.
    Change f to "sum" for a quarterly sum.

    Args:
        data : pandas Series or DataFrame
            The data to convert to quarterly frequency.
        q_ending : str, default "DEC"
            The month in which the quarter ends. For example, "DEC" for December.
            Matching is case-insensitive; the value is upper-cased before use.
        f : str, default "mean"
            The function to apply to the three months in each quarter.
            Change to "sum" for a quarterly sum. The default is a
            quarterly mean. One of "mean", "sum", "min", "max", "std", "var".

    Returns:
        pandas Series or DataFrame
            The data with a quarterly PeriodIndex. If a quarter has less than
            three months data, the quarter is dropped. If the quarter has more
            than three months data, the quarter is dropped. Any NA data is removed.
            For DataFrame input, the function is applied to each column.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, or if the
            conversion itself fails.
        InvalidParameterError - If q_ending or f parameters are invalid.

    """
    # Validate inputs
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    valid_endings = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    # A non-string q_ending is reported as a parameter error rather than
    # failing with AttributeError on .upper().
    if not isinstance(q_ending, str) or q_ending.upper() not in valid_endings:
        raise InvalidParameterError(f"q_ending must be one of {valid_endings}")
    # Hand the helper the same canonical form that was just validated, so a
    # value such as "dec" behaves exactly like the default "DEC".
    q_ending = q_ending.upper()

    valid_aggregations = ["mean", "sum", "min", "max", "std", "var"]
    if f not in valid_aggregations:
        raise InvalidParameterError(f"f must be one of {valid_aggregations}")

    try:
        if isinstance(data, Series):
            return _monthly_to_qtly_series(data, q_ending, f)
        # Only a DataFrame can reach this point (see validation above):
        # convert column by column and rebuild a frame of the same class.
        converted = {col: _monthly_to_qtly_series(data[col], q_ending, f) for col in data.columns}
        return data.__class__(converted)
    except Exception as e:
        raise InvalidDataError(f"Error converting monthly to quarterly data: {e}") from e
Convert monthly data to quarterly data.
This is done by taking the mean (or sum) of the three months in each quarter. Ignore quarters with less than or more than three months data. Drop NA items. Change f to "sum" for a quarterly sum.
Args: data : pandas Series or DataFrame The data to convert to quarterly frequency. q_ending : str, default "DEC" The month in which the quarter ends. For example, "DEC" for December. f : str, default "mean" The function to apply to the three months in each quarter. Change to "sum" for a quarterly sum. The default is a quarterly mean.
Returns: pandas Series or DataFrame The data with a quarterly PeriodIndex. If a quarter has less than three months data, the quarter is dropped. If the quarter has more than three months data, the quarter is dropped. Any NA data is removed. For DataFrame input, the function is applied to each column.
Raises: InvalidDataError - If data is not a Series or DataFrame. InvalidParameterError - If q_ending or f parameters are invalid.
def percent_change(data: DataT, n_periods: int) -> DataT:
    """Return the percentage change of data relative to n_periods earlier.

    The data is assumed to be a contiguous, ordered series; each value is
    compared with the value n_periods rows before it.

    Args:
        data : pandas Series or DataFrame
            The data to calculate the percentage change for.
        n_periods : int
            The number of periods to calculate the percentage change over.
            Typically 4 for quarterly data, and 12 for monthly data.

    Returns:
        pandas Series or DataFrame - The percentage change in the data over n_periods.
        For DataFrame input, the percentage change is calculated for each column.

    Raises:
        InvalidParameterError - If n_periods is not a positive integer.
        InvalidDataError - If data is not a Series or DataFrame, or if the
            calculation itself fails.

    """
    periods_ok = isinstance(n_periods, int) and n_periods > 0
    if not periods_ok:
        raise InvalidParameterError("n_periods must be a positive integer")

    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    try:
        earlier = data.shift(n_periods)  # value n_periods rows back
        growth = data / earlier - 1
        return growth * 100
    except Exception as e:
        raise InvalidDataError(f"Error calculating percentage change: {e}") from e
Calculate a percentage change in a contiguous, ordered series over n_periods.
Args: data : pandas Series or DataFrame The data to calculate the percentage change for. n_periods : int The number of periods to calculate the percentage change over. Typically 4 for quarterly data, and 12 for monthly data.
Returns: pandas Series or DataFrame - The percentage change in the data over n_periods. For DataFrame input, the percentage change is calculated for each column.
Raises: InvalidParameterError - If n_periods is not a positive integer. InvalidDataError - If data is not a Series or DataFrame.
def print_abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
    """Print a table of ABS Catalogue Numbers with their metadata.

    The catalogue is obtained from abs_catalogue() and printed as a
    markdown table, limited to the columns named in DISPLAY_COLUMNS. If
    the markdown rendering fails for any reason, a plain-text table is
    printed instead.

    This is primarily a convenience function to help users identify the
    correct catalogue number for data retrieval functions.

    Parameters
    ----------
    cache_only : bool, default False
        If True, only use cached catalogue data.
    verbose : bool, default False
        If True, print progress messages during catalogue retrieval.

    Notes
    -----
    CatalogueError, HttpError and CacheError raised while retrieving the
    catalogue are not propagated: they are caught and reported by printing
    an "Error retrieving catalogue: ..." message.

    Example
    -------
    >>> import readabs as ra
    >>> ra.print_abs_catalogue()

    """
    # Retrieve the catalogue data; retrieval problems are reported, not raised
    try:
        catalogue = abs_catalogue(cache_only=cache_only, verbose=verbose)
    except (CatalogueError, HttpError, CacheError) as e:
        print(f"Error retrieving catalogue: {e}")
        return

    # Nothing to show for an empty catalogue
    if catalogue.empty:
        print("No catalogue data available.")
        return

    # Keep only the display columns that are actually present
    shown = [name for name in DISPLAY_COLUMNS if name in catalogue.columns]
    if not shown:
        print("Catalogue data does not contain expected columns.")
        return

    table = catalogue[shown]

    # Prefer a markdown table; fall back to plain text if that fails
    try:
        print(table.to_markdown())
    except Exception:  # noqa: BLE001
        print(table.to_string())
Print a table of ABS Catalogue Numbers with their metadata.
Displays catalogue numbers that contain time-series data along with their theme, parent topic, topic, and status information. The URL column is excluded from the display for readability.
This is primarily a convenience function to help users identify the correct catalogue number for data retrieval functions.
Parameters
cache_only : bool, default False If True, only use cached catalogue data. verbose : bool, default False If True, print progress messages during catalogue retrieval.
Raises
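None of these exceptions reaches the caller; each is caught and reported by printing an "Error retrieving catalogue: ..." message. CatalogueError: the catalogue data cannot be retrieved or processed. HttpError: there is a network error downloading the catalogue. CacheError: cache_only=True but no cached data is available.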
Example
>>> import readabs as ra
>>> ra.print_abs_catalogue()
def print_rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> None:
    """Print to standard output a table of the RBA Catalogue Numbers.

    The catalogue returned by rba_catalogue() is printed with every column
    except "URL". A markdown table is used when it can be produced;
    otherwise a plain-text table is printed.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    None
        This function does not return anything.

    Example
    -------

    ```python
    import readabs as ra
    ra.print_rba_catalogue()
    ```

    """
    rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose)
    display = rba_catalog.loc[:, rba_catalog.columns != "URL"]

    # to_markdown() depends on the optional "tabulate" package; fall back to
    # a plain-text table (as print_abs_catalogue() does) when it is missing.
    try:
        text = display.to_markdown()
    except ImportError:
        text = display.to_string()
    print(text)
Print to standard output a table of the RBA Catalogue Numbers.
This function prints a formatted table of RBA catalogue numbers to standard output.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
None This function does not return anything.
Example
import readabs as ra
ra.print_rba_catalogue()
def qtly_to_monthly(
    data: DataT,
    *,
    interpolate: bool = True,
    limit: int | None = 2,  # only used if interpolate is True
    dropna: bool = True,
) -> DataT:
    """Convert data from Quarterly PeriodIndex to a Monthly PeriodIndex.

    Args:
        data: Series or DataFrame with quarterly PeriodIndex. Assumes the index is unique.
            The data to convert to monthly frequency.
        interpolate: bool, default True
            Whether to interpolate the missing monthly data.
        limit: int or None, default 2 - The maximum number of consecutive missing
            months to interpolate. None means no limit; 0 means no months
            are interpolated.
        dropna: bool, default True - Whether to drop NA data

    Returns:
        pandas Series or DataFrame - The data with a Monthly PeriodIndex.
        If interpolate is True, the missing monthly data is interpolated.
        If dropna is True, any NA data is removed.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, if its index
            is not a unique, monotonic increasing quarterly PeriodIndex, or if
            the conversion itself fails.
        InvalidParameterError - If limit parameter is invalid.

    """
    # Validate input data
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    if not isinstance(data.index, PeriodIndex):
        raise InvalidDataError("data index must be a PeriodIndex")

    if not (data.index.freqstr and data.index.freqstr[0] == "Q"):
        raise InvalidDataError("data index must have quarterly frequency")

    if not data.index.is_unique:
        raise InvalidDataError("data index must be unique")

    if not data.index.is_monotonic_increasing:
        raise InvalidDataError("data index must be monotonic increasing")

    if limit is not None and (not isinstance(limit, int) or limit < 0):
        raise InvalidParameterError("limit must be a non-negative integer or None")

    # do the heavy lifting
    try:
        data = (
            data.set_axis(labels=data.index.to_timestamp(how="end"), axis="index", copy=True)
            .resample(rule="ME")  # adds in every missing month
            .first(min_count=1)  # generates nans for new months
            # assumes only one value per quarter (ie. unique index)
            .pipe(_set_axis_monthly_periods)
        )
    except Exception as e:
        raise InvalidDataError(f"Error in quarterly to monthly conversion: {e}") from e

    # pandas requires interpolate(limit=...) to be greater than 0, so a limit
    # of 0 (accepted by the validation above) is honoured by filling nothing
    # instead of letting pandas raise a ValueError.
    if interpolate and limit != 0:
        data = data.interpolate(limit_area="inside", limit=limit)
    if dropna:
        data = data.dropna()

    return data
Convert data from Quarterly PeriodIndex to a Monthly PeriodIndex.
Args: data: Series or DataFrame with quarterly PeriodIndex. Assumes the index is unique. The data to convert to monthly frequency. interpolate: bool, default True Whether to interpolate the missing monthly data. limit: int, default 2 - The maximum number of consecutive missing months to interpolate. dropna: bool, default True - Whether to drop NA data
Returns: pandas Series or DataFrame - The data with a Monthly PeriodIndex. If interpolate is True, the missing monthly data is interpolated. If dropna is True, any NA data is removed.
Raises: InvalidDataError - If data index is not a quarterly PeriodIndex or has issues. InvalidParameterError - If limit parameter is invalid.
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return the RBA catalogue as a DataFrame.

    The catalogue is downloaded from the RBA website the first time it is
    needed, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    # All of the work is done by the link-gathering helper.
    catalogue = _get_rba_links(cache_only=cache_only, verbose=verbose)
    return catalogue
Return a DataFrame of RBA Catalogue numbers.
In the first instance, this is downloaded from the RBA website, and cached for future use.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
DataFrame A DataFrame of RBA Catalogue numbers.
Example
import readabs as ra
catalogue = ra.rba_catalogue()
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. A list is treated as a
        list of descriptions. For a dictionary, each key is a name and each
        value is either a string (the data item description to search for)
        or a dictionary of keyword arguments, one of which is the data item
        description to search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
          read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
          read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
          The following arguments, if present, will also be used (ie.
          passed to read_abs_cat()): ["ignore_errors", "get_zip",
          "get_excel_if_no_zip", "get_excel", "cache_only",
          "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
          will be used to construct the selector: "cat", "did",
          "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
          "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
          and search_abs_meta() functions: ["validate_unique", "exact_match",
          "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
      include sufficient keys from the metacol dataclass to get the data.
      Typically, the "cat" key, the "table" key, and the "stype" key would
      be required. The "did" key would be taken from the wanted list or
      dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
      must contain a "did" key. The other keys that can be used for the
      data retrieval are the same as the metacol dataclass fields, namely:
      "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
      "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
      used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
      type dict[str, dict[str, Any]] and (2) the inner dictionary must
      contain a "cat" key so the data can be retrieved. Other keys that
      can be used for the data retrieval are the same as for read_abs_cat(),
      namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
      "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # Nothing requested: hand back empty containers straight away.
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()

    # Normalise a plain list into the {name: description} mapping form.
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)

    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    base_selector = _get_search_terms(kwargs, {})
    base_search_args = _get_search_args(kwargs, {})

    found_series = {}
    found_meta = pd.DataFrame()
    for name, spec in wanted.items():
        # Every item works on its own copies, so per-item changes made by the
        # helpers cannot leak into the next item.
        shared = {
            "data_dict": abs_dict,
            "data_meta": abs_meta,
            "item_selector": base_selector.copy(),
            "search_args": base_search_args.copy(),
        }

        if isinstance(spec, str):
            series, meta = _get_item_from_str(item=spec, **shared)
        elif isinstance(spec, dict):
            series, meta = _get_item_from_dict(item_dict=spec, **shared, **kwargs)
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string or a dictionary."
            )

        # accumulate the search results
        found_series[name] = series
        found_meta = pd.concat([found_meta, meta])

    return found_series, found_meta
Get specific ABS data series by searching the ABS meta data.
Parameters
wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].
Notes
- if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The "did" key would be taken from the wanted list or dictionary. If "wanted" is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
- if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].
Returns
Returns a tuple of two items:
- A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
- A pandas DataFrame containing the metadata for the series.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "5206.0" # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)
@cache  # minimise slowness for any repeat business
def read_abs_cat(
    cat: str,
    **kwargs: Unpack[ReadArgs],
) -> tuple[dict[str, DataFrame], DataFrame]:
    """For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.

    The complete ABS Catalogue information is returned as a python
    dictionary of pandas DataFrames, together with the associated metadata
    in a separate DataFrame. The function automates the collection of zip
    and excel files from the ABS website. If necessary, these files are
    downloaded and saved into a cache directory. The files are then parsed
    to extract time series data, and the associated metadata.

    By default, the cache directory is `./.readabs_cache/`. You can change the
    default directory name by setting the shell environment variable
    `READABS_CACHE_DIR` with the name of the preferred directory.

    Parameters
    ----------
    cat : str
        The ABS Catalogue Number for the data to be downloaded and made
        available by this function. This argument must be specified in the
        function call.

    **kwargs : Unpack[ReadArgs]
        The following parameters may be passed as optional keyword arguments.

    url : str = ""
        The URL of an ABS landing page. Use this for discontinued series
        that are no longer in the ABS Time Series Directory. If provided,
        data will be retrieved from this URL instead of looking up the
        catalogue number. Example:
        `read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")`

    keep_non_ts : bool = False
        A flag for whether to keep the non-time-series tables
        that might form part of an ABS catalogue item. Normally, the
        non-time-series information is ignored, and not made available to
        the user.

    history : str = ""
        Provide a month-year string to extract historical ABS data.
        For example, you can set history="dec-2023" to get the ABS data
        for a catalogue identifier that was originally published in respect
        of Q4 of 2023. Note: not all ABS data sources are structured so that
        this technique works in every case; but most are.

    verbose : bool = False
        Setting this to true may help diagnose why something
        might be going wrong with the data retrieval process.

    ignore_errors : bool = False
        Normally, this function will cease downloading when
        an error is encountered. However, sometimes the ABS website has
        malformed links, and changing this setting is necessitated. (Note:
        if you drop a message to the ABS, they will usually fix broken
        links within a business day).

    get_zip : bool = True
        Download the excel files in .zip files.

    get_excel_if_no_zip : bool = True
        Only try to download .xlsx files if there are no zip
        files available to be downloaded. Only downloading individual excel
        files when there are no zip files to download can speed up the
        download process.

    get_excel : bool = False
        The default value means that excel files are not
        automatically downloaded. Note: at least one of `get_zip`,
        `get_excel_if_no_zip`, or `get_excel` must be true. For most ABS
        catalogue items, it is sufficient to just download the one zip
        file. But note, some catalogue items do not have a zip file.
        Others have quite a number of zip files.

    single_excel_only : str = ""
        If this argument is set to a table name (without the
        .xlsx extension), only that excel file will be downloaded. If
        set, and only a limited subset of available data is needed,
        this can speed up download times significantly. Note: overrides
        `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`.

    single_zip_only : str = ""
        If this argument is set to a zip file name (without
        the .zip extension), only that zip file will be downloaded.
        If set, and only a limited subset of available data is needed,
        this can speed up download times significantly. Note: overrides
        `get_zip`, `get_excel_if_no_zip`, and `get_excel`.

    cache_only : bool = False
        If set to True, this function will only access
        data that has been previously cached. Normally, the function
        checks the date of the cache data against the date of the data
        on the ABS website, before deciding whether the ABS has fresher
        data that needs to be downloaded to the cache.

    zip_file: str | Path = ""
        If set to a specific zip file name (with or without the .zip
        extension), this function will only extract data from that zip file
        on the local file system. This may be useful for debugging purposes.

    Returns
    -------
    tuple[dict[str, DataFrame], DataFrame]
        The function returns a tuple of two items. The first item is a
        python dictionary of pandas DataFrames (which is the primary data
        associated with the ABS catalogue item). The second item is a
        DataFrame of ABS metadata for the ABS collection.

    Note:
        You can retrieve non-timeseries data using the grab_abs_url()
        function. That takes the URL for the ABS landing page for the ABS
        collection you are interested in. The read_abs_cat function is for
        ABS catalogue identifiers which are timeseries data, for which the
        metadata can be extracted.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
    abs_dict, meta = data
    ```

    """
    # --- get the raw tables: a local zip file takes priority when one is named ---
    local_zip = kwargs.get("zip_file")
    if local_zip:
        raw_abs_dict = grab_abs_zip(local_zip, **kwargs)
    else:
        raw_abs_dict = grab_abs_url(cat=cat, **kwargs)

    # --- parse the time series data and its metadata ---
    response = _get_time_series_data(cat, raw_abs_dict, **kwargs)

    # Fall back to empty containers when the parser returns a falsy result.
    # dictionary of DataFrames, and a DataFrame of metadata
    return response if response else ({}, DataFrame())
For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.
This function returns the complete ABS Catalogue information as a python dictionary of pandas DataFrames, as well as the associated metadata in a separate DataFrame. The function automates the collection of zip and excel files from the ABS website. If necessary, these files are downloaded, and saved into a cache directory. The files are then parsed to extract time series data, and the associated metadata.
By default, the cache directory is ./.readabs_cache/. You can change the
default directory name by setting the shell environment variable
READABS_CACHE_DIR with the name of the preferred directory.
Parameters
cat : str The ABS Catalogue Number for the data to be downloaded and made available by this function. This argument must be specified in the function call.
**kwargs : Unpack[ReadArgs] The following parameters may be passed as optional keyword arguments.
url : str = ""
The URL of an ABS landing page. Use this for discontinued series
that are no longer in the ABS Time Series Directory. If provided,
data will be retrieved from this URL instead of looking up the
catalogue number. Example:
read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")
keep_non_ts : bool = False A flag for whether to keep the non-time-series tables that might form part of an ABS catalogue item. Normally, the non-time-series information is ignored, and not made available to the user.
history : str = "" Provide a month-year string to extract historical ABS data. For example, you can set history="dec-2023" to get the ABS data for a catalogue identifier that was originally published in respect of Q4 of 2023. Note: not all ABS data sources are structured so that this technique works in every case; but most are.
verbose : bool = False Setting this to true may help diagnose why something might be going wrong with the data retrieval process.
ignore_errors : bool = False Normally, this function will cease downloading when an error is encountered. However, sometimes the ABS website has malformed links, and changing this setting is necessitated. (Note: if you drop a message to the ABS, they will usually fix broken links within a business day).
get_zip : bool = True Download the excel files in .zip files.
get_excel_if_no_zip : bool = True Only try to download .xlsx files if there are no zip files available to be downloaded. Only downloading individual excel files when there are no zip files to download can speed up the download process.
get_excel : bool = False
The default value means that excel files are not
automatically downloaded. Note: at least one of get_zip,
get_excel_if_no_zip, or get_excel must be true. For most ABS
catalogue items, it is sufficient to just download the one zip
file. But note, some catalogue items do not have a zip file.
Others have quite a number of zip files.
single_excel_only : str = ""
If this argument is set to a table name (without the
.xlsx extension), only that excel file will be downloaded. If
set, and only a limited subset of available data is needed,
this can speed up download times significantly. Note: overrides
get_zip, get_excel_if_no_zip, get_excel and single_zip_only.
single_zip_only : str = ""
If this argument is set to a zip file name (without
the .zip extension), only that zip file will be downloaded.
If set, and only a limited subset of available data is needed,
this can speed up download times significantly. Note: overrides
get_zip, get_excel_if_no_zip, and get_excel.
cache_only : bool = False If set to True, this function will only access data that has been previously cached. Normally, the function checks the date of the cache data against the date of the data on the ABS website, before deciding whether the ABS has fresher data that needs to be downloaded to the cache.
zip_file: str | Path = "" If set to a specific zip file name (with or without the .zip extension), this function will only extract data from that zip file on the local file system. This may be useful for debugging purposes.
Returns
tuple[dict[str, DataFrame], DataFrame] The function returns a tuple of two items. The first item is a python dictionary of pandas DataFrames (which is the primary data associated with the ABS catalogue item). The second item is a DataFrame of ABS metadata for the ABS collection.
Note:
You can retrieve non-timeseries data using the grab_abs_url()
function. That takes the URL for the ABS landing page for the ABS
collection you are interested in. The read_abs_cat function is for
ABS catalogue identifiers which are timeseries data, for which the
metadata can be extracted.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "6202.0" # The ABS labour force survey
data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
abs_dict, meta = data
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Unpack[ReadArgs],
) -> tuple[DataFrame, DataFrame]:
    """Get specific ABS data series by their ABS catalogue and series identifiers.

    Parameters
    ----------
    cat : str
        The ABS catalogue ID.

    series_id : str | Sequence[str]
        An ABS series ID or a sequence of ABS series IDs.

    **kwargs : Any
        Keyword arguments for the read_abs_series function,
        which are the same as the keyword arguments for the
        read_abs_cat function.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        A tuple of two DataFrames, one for the primary data (one column per
        series ID found) and one for the metadata (one row per series ID
        found).

    Raises
    ------
    ValueError
        If a series ID is not in the catalogue metadata, or if its index
        frequency differs from the series already collected, unless
        `ignore_errors` is set (in which case that series is skipped).

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    unemployment_rate = "A84423050A"
    seo = "6202001"  # The ABS table name
    data, meta = ra.read_abs_series(
        cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
    )
    ```

    """
    # check for unexpected keyword arguments/get defaults
    check_kwargs(kwargs, "read_abs_series")
    args = get_args(kwargs, "read_abs_series")

    # read the ABS category data
    cat_data, cat_meta = read_abs_cat(cat, **args)

    # read_abs_cat() is memoised, so cat_meta is the very object held in its
    # cache. Re-index a private copy; assigning to .index on the original
    # would change the metadata seen by every later read_abs_cat() caller.
    cat_meta = cat_meta.copy()

    # drop repeated series_ids in the meta data,
    # make unique series_ids the index
    cat_meta.index = Index(cat_meta[metacol.id])
    cat_meta = cat_meta.groupby(cat_meta.index).first()

    # get the ABS series data
    if isinstance(series_id, str):
        series_id = [series_id]
    return_data, return_meta = DataFrame(), DataFrame()
    for identifier in series_id:
        # confirm that the series ID is in the catalogue
        if identifier not in cat_meta.index:
            if args["verbose"]:
                print(f"Series ID {identifier} not found in ABS catalogue ID {cat}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Series ID {identifier} not found in catalogue {cat}")

        # confirm that the index of the series is compatible
        table = str(cat_meta.loc[identifier, metacol.table])  # str for mypy
        data_series = cat_data[table][identifier]
        if (
            len(return_data) > 0
            and cast("PeriodIndex", return_data.index).freq != cast("PeriodIndex", data_series.index).freq
        ):
            if args["verbose"]:
                print(f"Frequency mismatch for series ID {identifier}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Frequency mismatch for series ID {identifier}")

        # add the series data and meta data to the return values
        if len(return_data) > 0:
            # widen the index first so no periods of the new series are lost
            return_data = return_data.reindex(return_data.index.union(data_series.index))
        return_data[identifier] = data_series
        return_meta = concat([return_meta, cat_meta.loc[identifier]], axis=1)

    # metadata was gathered one column per series; transpose to one row per series
    return return_data, return_meta.T
Get specific ABS data series by their ABS catalogue and series identifiers.
Parameters
cat : str The ABS catalogue ID.
series_id : str | Sequence[str] An ABS series ID or a sequence of ABS series IDs.
**kwargs : Any Keyword arguments for the read_abs_series function, which are the same as the keyword arguments for the read_abs_cat function.
Returns
tuple[DataFrame, DataFrame] A tuple of two DataFrames, one for the primary data and one for the metadata.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "6202.0" # The ABS labour force survey
unemployment_rate = "A84423050A"
seo = "6202001" # The ABS table name
data, meta = ra.read_abs_series(
cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
)
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series:  # ignore_errors
    """Read the Official Cash Rate (OCR) from the RBA website.

    Return it in a pandas Series, with either a daily or monthly PeriodIndex,
    depending on the value of the monthly parameter. The default is monthly.

    Parameters
    ----------
    monthly : bool = True
        If True, then the data will be returned with a monthly PeriodIndex.
        If False, then the data will be returned with a daily PeriodIndex.
    **kwargs : Any
        Additional keyword arguments, passed through to read_rba_table().
        The only keyword argument that is used is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return an empty Series. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    Series
        The OCR data in a pandas Series, with an index of either daily or monthly Periods.
        The series is extended to the current period by carrying the last
        observed rate forward, and gaps are forward-filled.

    Examples
    --------
    ```python
    ocr = read_rba_ocr(monthly=True)
    ```

    """
    series_name = "RBA Official Cash Rate"

    # read the OCR table from the RBA website
    rba, _rba_meta = read_rba_table("A2", **kwargs)  # should have a daily PeriodIndex

    # With ignore_errors set, read_rba_table() reports a failure by returning
    # empty DataFrames. Honour the documented contract and return an empty
    # Series rather than failing on the column lookup below.
    if kwargs.get("ignore_errors", False) and rba.empty:
        return Series(
            dtype=float,
            index=PeriodIndex([], freq="M" if monthly else "D"),
            name=series_name,
        )

    # keep the cash-rate column from 1990-08-02 onwards, make float and sort, name the series
    ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"]
    ocr = ocr_series.astype(float).sort_index()  # pyright: ignore[reportAttributeAccessIssue]
    ocr.name = series_name

    # bring up to date: carry the latest rate forward to the current period
    today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr)
    last_period = cast("Period", ocr.index[-1])
    if last_period < today:
        ocr[today] = ocr.iloc[-1]

    if not monthly:
        # fill in missing days and return daily data
        daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
        return ocr.reindex(daily_index).ffill()

    # convert to monthly data, keeping last value if duplicates in month
    # fill in missing months
    ocr.index = PeriodIndex(ocr.index, freq="M")
    ocr = ocr[~ocr.index.duplicated(keep="last")]
    monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
    return ocr.reindex(monthly_index, method="ffill")
Read the Official Cash Rate (OCR) from the RBA website.
Return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.
Parameters
monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.
Returns
Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.
Examples
ocr = read_rba_ocr(monthly=True)
def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:  # ignore_errors
    """Read a table from the RBA website and return the actual data and meta data.

    Returns the actual data and the meta data in a tuple of two DataFrames.

    Parameters
    ----------
    table : str
        The table to read from the RBA website.
    **kwargs : Any
        Additional keyword arguments, passed through to the Excel download
        helper. The only keyword argument that is used here is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return empty DataFrames. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        The primary data and the meta data in a tuple of two DataFrames.
        The data index is converted to a PeriodIndex whose frequency
        (monthly, quarterly, yearly, otherwise daily) is chosen from the
        spacing between consecutive dates.

    Examples
    --------
    ```python
    data, meta = read_rba_table("C1")
    ```

    """
    # set-up
    ignore_errors = kwargs.get("ignore_errors", False)
    data, meta = DataFrame(), DataFrame()

    # ignore_errors is handed over explicitly below, so it must not also be
    # present in the pass-through kwargs: a duplicated keyword argument makes
    # the call itself raise TypeError whenever the caller supplies it.
    passthrough = {key: value for key, value in kwargs.items() if key != "ignore_errors"}

    # get the Excel file
    excel = _get_excel_file(table, ignore_errors=ignore_errors, **passthrough)
    if excel is None:
        return data, meta

    # read Excel file into DataFrame
    try:
        raw = read_excel(BytesIO(excel), header=None, index_col=None)
    except Exception as e:
        if ignore_errors:
            print(f"Ignoring error: {e}")
            return data, meta
        raise

    # extract the meta data (sheet rows 1-10, transposed to one row per series)
    meta = raw.iloc[1:11, :].T.copy()
    meta.columns = Index(meta.iloc[0])
    renamer = {
        "Mnemonic": rm.id,
    }  # historical data is inconsistent
    meta = meta.rename(columns=renamer)
    meta = meta.iloc[1:, :]
    meta.index = Index(meta[rm.id])
    meta[rm.table] = table
    meta[rm.tdesc] = raw.iloc[0, 0]  # top-left cell is used as the table description
    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs

    # extract the data (sheet row 10 supplies the column labels)
    data = raw.iloc[10:, :].copy()
    data.columns = Index(data.iloc[0])
    data = data.iloc[1:, :]
    data.index = DatetimeIndex(data.iloc[:, 0])
    data = data.iloc[:, 1:]
    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs

    # can we make the index into a PeriodIndex?
    # classify the frequency from the gaps (in days) between consecutive dates
    days = data.index.to_series().diff(1).dropna().dt.days
    if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS:
        data.index = PeriodIndex(data.index, freq="M")
    elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS:
        data.index = PeriodIndex(data.index, freq="Q")
    elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS:
        data.index = PeriodIndex(data.index, freq="Y")
    else:
        data.index = PeriodIndex(data.index, freq="D")

    return data, meta
Read a table from the RBA website and return the actual data and meta data.
Returns the actual data and the meta data in a tuple of two DataFrames.
Parameters
table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.
Returns
tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.
Examples
data, meta = read_rba_table("C1")
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

    The units label is changed to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (a Series if a Series was provided, or a
        DataFrame if a DataFrame was provided, with the original index,
        columns and name restored) and the title-cased units label.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")

    # split the label into the part to rescale and any prefix to put back later
    base_units, prefix = _prepare_units(units)
    scaled, new_units = _recalibrate(data.to_numpy().flatten(), base_units)

    if prefix:
        new_units = f"{prefix} {new_units}"
        # remove the first of these words found in the combined label
        for word in ("numbers", "number"):
            if word in new_units:
                new_units = new_units.replace(word, "").strip()
                break
    new_units = new_units.title()

    # rebuild an object of the caller's type and restore its labels
    rebuilt = type(data)(scaled.reshape(data.shape))
    rebuilt.index = data.index
    ndim = len(data.shape)
    if ndim == NDIM_DATAFRAME:
        rebuilt.columns = data.columns
    if ndim == NDIM_SERIES:
        rebuilt.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return rebuilt, new_units
Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
Change the name of the units to reflect the recalibration.
Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.
Parameters
data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
Series or DataFrame The recalibrated data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.
Examples
from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Recalibrate a single floating point value.

    The intent is for the returned value to sit in the range -1000 to 1000,
    with the units label changed to reflect the rescaling. The work is
    delegated to recalibrate().

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)
    ```

    """
    # Wrap the scalar in a one-element Series so recalibrate() can be reused,
    # then unwrap the single result.
    rescaled, new_units = recalibrate(Series([value]), units)
    return rescaled.to_numpy()[0], new_units
Recalibrate a floating point value.
The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.
Parameters
value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.
Examples
from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)
def search_abs_meta(
    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
    *,
    exact_match: bool = False,
    regex: bool = False,
    validate_unique: bool = False,  # useful safety-net if you expect only one match
    **kwargs: Any,  # verbose flag
) -> DataFrame:
    """Extract from the ABS meta data those rows that match the search_terms.

    Iteratively search the meta data one search_term at a time, narrowing
    the selection with each term.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    exact_match : bool = False
        Whether to match using == (exact) or .str.contains() (inexact).
        The table column (mc.table) is always matched exactly.
    regex : bool = False
        Whether to use regular expressions in the (inexact) search.
    validate_unique : bool = False
        Raise a ValueError if the search result is not unique.
    **kwargs : Any
        Additional keyword arguments. The only keyword argument
        that is used is verbose.
    verbose : bool = False
        Print additional information while searching, which can
        be useful when diagnosing problems with search terms.

    Returns
    -------
    DataFrame
        Returns a pandas DataFrame of matching rows (subsetted from meta).
        Note: the index for the returned meta data will always comprise ABS
        series_ids. Duplicate indexes will be removed from the meta data
        (ie. where the same ABS series appears in more than one table, this
        function will only report the first match).

    Raises
    ------
    ValueError
        If validate_unique is True and the selection does not contain
        exactly one row.

    Notes
    -----
    Missing (NaN) cells in a searched column are treated as non-matches
    during inexact searches. If nothing is selected and verbose is off,
    the message "Nothing selected?" is printed.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, search_abs_meta
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Unemployment rate": mc.did,  # the data item description
        "Persons": mc.did,
        "Seasonally Adjusted": mc.stype,
        "Percent": mc.unit,
        "6202001": mc.table,
    }
    rows = search_abs_meta(meta, search_terms, verbose=True)
    print(rows)  # should have three rows : FT/PT/All Unemployment rates
    ```

    """
    # get the verbose-flag from kwargs
    verbose = kwargs.get("verbose", False)

    # establish the starting point
    meta_select = meta.copy()  # preserve the original meta data
    if verbose:
        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
        print(f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data.")

    # iteratively search
    for phrase, column in search_terms.items():
        if verbose:
            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")

        if exact_match or column == mc.table:
            pick_me = meta_select[column] == phrase
        else:
            # na=False: a missing cell would otherwise yield NaN in the mask,
            # and boolean indexing with a NaN-bearing mask raises ValueError.
            pick_me = meta_select[column].str.contains(phrase, regex=regex, na=False)
        meta_select = meta_select[pick_me]
        if verbose:
            print(f"In search_abs_meta() have found {len(meta_select)}")

    # search complete - index by series id, keeping only the first occurrence
    meta_select.index = Index(meta_select[mc.id])
    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]

    if verbose:
        print(f"Final selection is {len(meta_select)} rows.")

    elif len(meta_select) == 0:
        print("Nothing selected?")

    if validate_unique and len(meta_select) != 1:
        raise ValueError("The selected meta data should only contain one row.")

    return meta_select
Extract from the ABS meta data those rows that match the search_terms.
Iteratively search the meta data one search_term at a time.
Parameters
meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching; which can be useful when diagnosing problems with search terms.
Returns
DataFrame Returns a pandas DataFrame of matching rows (subsetted from meta). Note: the index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (i.e. where the same ABS series appears in more than one table, this function will only report the first match).
Metacol
Because the meta data is a DataFrame, the columns can be referenced by either
their full textual name, or by the short name defined in the metacol object.
For example, if metacol is imported as mc, to refer to the
Data Item Description column, the user can refer to it as mc.did.
Example
from readabs import metacol as mc # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0" # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
"Unemployment rate": mc.did, # the data item description
"Persons": mc.did,
"Seasonally Adjusted": mc.stype,
"Percent": mc.unit,
"6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows) # should have three rows : FT/PT/All Unemployment rates