readabs

Package to download timeseries data from the Australian Bureau of Statistics and RBA.

This package provides functions to download and process timeseries data from the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).

 1"""Package to download timeseries data from the Australian Bureau of Statistics and RBA.
 2
 3This package provides functions to download and process timeseries data from
 4the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).
 5"""
 6
 7import importlib.metadata
 8
 9# ABS related imports
10from readabs.abs_catalogue import abs_catalogue
11from readabs.abs_meta_data import metacol
12
13# Utility imports
14from readabs.datatype import Datatype
15from readabs.grab_abs_url import grab_abs_url, grab_abs_zip
16from readabs.print_abs_catalogue import print_abs_catalogue
17
18# RBA related imports
19from readabs.rba_catalogue import print_rba_catalogue, rba_catalogue
20from readabs.rba_meta_data import rba_metacol
21from readabs.read_abs_by_desc import read_abs_by_desc
22from readabs.read_abs_cat import read_abs_cat
23from readabs.read_abs_series import read_abs_series
24from readabs.read_rba_table import read_rba_ocr, read_rba_table
25from readabs.read_support import ReadArgs
26from readabs.recalibrate import recalibrate, recalibrate_value
27from readabs.search_abs_meta import find_abs_id, search_abs_meta
28from readabs.utilities import (
29    annualise_percentages,
30    annualise_rates,
31    monthly_to_qtly,
32    percent_change,
33    qtly_to_monthly,
34)
35
# Version and author information
# The version string is looked up in the installed distribution's metadata,
# using this package's import name (__name__) as the distribution name.
try:
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
    # No installed distribution metadata was found under this name.
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"


# Exposed functions and classes
# Every name listed here is imported at the top of this module; the tuple
# defines what `from readabs import *` exports.
__all__ = (
    "Datatype",
    "ReadArgs",
    "abs_catalogue",
    "annualise_percentages",
    "annualise_rates",
    "find_abs_id",
    "grab_abs_url",
    "grab_abs_zip",
    "metacol",
    "monthly_to_qtly",
    "percent_change",
    "print_abs_catalogue",
    "print_rba_catalogue",
    "qtly_to_monthly",
    "rba_catalogue",
    "rba_metacol",
    "read_abs_by_desc",
    "read_abs_cat",
    "read_abs_series",
    "read_rba_ocr",
    "read_rba_table",
    "recalibrate",
    "recalibrate_value",
    "search_abs_meta",
)
# Names mapped to False are meant to be left out of the generated docs.
# NOTE(review): "grab_abs_url" is both a submodule name and a function exported
# in __all__ above - confirm that this entry hides only the submodule.
__pdoc__ = {
    "download_cache": False,
    "get_abs_links": False,
    "read_support": False,
    "grab_abs_url": False,
}  # hide submodules from documentation
class ReadArgs(typing.TypedDict):
class ReadArgs(TypedDict):
    """Type definition for ABS data reading arguments.

    Describes the optional keyword arguments accepted by the ABS reading
    functions; it is used as ``**kwargs: Unpack[ReadArgs]`` by functions such
    as ``grab_abs_url()`` and ``grab_abs_zip()``. Every key is marked
    ``NotRequired``, so callers may supply any subset of them.
    """

    # NOTE(review): the precise meaning and default of each key is decided by
    # the code that consumes these arguments, which is not defined in this
    # class. The grouping remarks below are inferred from the key names only -
    # confirm against the consuming code before relying on them.

    # Diagnostics and error handling (presumably).
    verbose: NotRequired[bool]
    ignore_errors: NotRequired[bool]
    # Which kinds of data file to fetch (presumably).
    get_zip: NotRequired[bool]
    get_excel_if_no_zip: NotRequired[bool]
    get_excel: NotRequired[bool]
    # Restrict processing to one named file (presumably); note these are
    # strings, not flags.
    single_zip_only: NotRequired[str]
    single_excel_only: NotRequired[str]
    # Remaining retrieval options.
    history: NotRequired[str]
    cache_only: NotRequired[bool]
    keep_non_ts: NotRequired[bool]
    zip_file: NotRequired[str]
    url: NotRequired[str]

Type definition for ABS data reading arguments.

verbose: NotRequired[bool]
ignore_errors: NotRequired[bool]
get_zip: NotRequired[bool]
get_excel_if_no_zip: NotRequired[bool]
get_excel: NotRequired[bool]
single_zip_only: NotRequired[str]
single_excel_only: NotRequired[str]
history: NotRequired[str]
cache_only: NotRequired[bool]
keep_non_ts: NotRequired[bool]
zip_file: NotRequired[str]
url: NotRequired[str]
@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
 24@cache
 25def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
 26    """Return a DataFrame of ABS Catalogue numbers.
 27
 28    Downloads catalogue data from the ABS website on first call and caches
 29    for future use. The returned DataFrame contains catalogue numbers with
 30    their topics, themes, URLs, and status.
 31
 32    Parameters
 33    ----------
 34    cache_only : bool, default False
 35        If True, only use cached data and don't attempt to download.
 36    verbose : bool, default False
 37        If True, print progress messages.
 38
 39    Returns
 40    -------
 41    DataFrame
 42        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
 43        and index of catalogue IDs.
 44
 45    Raises
 46    ------
 47    CatalogueError
 48        If the catalogue data cannot be retrieved or parsed.
 49    HttpError
 50        If there's a network error downloading the catalogue.
 51    CacheError
 52        If cache_only=True but no cached data is available.
 53
 54    Example
 55    -------
 56    >>> import readabs as ra
 57    >>> catalogue = ra.abs_catalogue()
 58    >>> print(catalogue.head())
 59
 60    """
 61    try:
 62        # Download ABS catalogue page
 63        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)
 64
 65        if not abs_bytes:
 66            raise CatalogueError("No data retrieved from ABS catalogue URL")
 67
 68        # Parse HTML content
 69        try:
 70            html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")
 71        except UnicodeDecodeError as e:
 72            raise CatalogueError(f"Failed to decode HTML content: {e}") from e
 73
 74        # Extract tables from HTML
 75        try:
 76            tables = read_html(StringIO(html_content), extract_links="body")
 77            if not tables:
 78                raise CatalogueError("No tables found in HTML content")
 79            links = tables[-1]  # Get the last table
 80        except (ValueError, IndexError) as e:
 81            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e
 82
 83        # Validate required columns exist
 84        required_cols = ["Catalogue number", "Topic"]
 85        missing_cols = [col for col in required_cols if col not in links.columns]
 86        if missing_cols:
 87            raise CatalogueError(f"Missing required columns: {missing_cols}")
 88
 89        # Extract catalogue numbers and URLs
 90        try:
 91            cats = links["Catalogue number"].apply(Series)[0]
 92            urls = links["Topic"].apply(Series)[1]
 93        except (KeyError, IndexError) as e:
 94            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e
 95
 96        # Process topic URLs to create hierarchical structure
 97        url_snippets = _process_topic_urls(urls)
 98
 99        # Create main DataFrame with hierarchical topic structure
100        frame = _create_topic_frame(url_snippets)
101        frame["URL"] = urls
102
103        # Align catalogue numbers with processed frame
104        cats = cats[frame.index]
105
106        # Process catalogue status (active vs ceased)
107        cat_index, status = _process_catalogue_status(cats)
108
109        frame["Status"] = status
110        frame.index = Index(cat_index)
111        frame.index.name = CATALOGUE_INDEX_NAME
112
113    except (HttpError, CacheError, ValueError) as e:
114        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e
115
116    return frame

Return a DataFrame of ABS Catalogue numbers.

Downloads catalogue data from the ABS website on first call and caches for future use. The returned DataFrame contains catalogue numbers with their topics, themes, URLs, and status.

Parameters

cache_only : bool, default False If True, only use cached data and don't attempt to download. verbose : bool, default False If True, print progress messages.

Returns

DataFrame DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status'] and index of catalogue IDs.

Raises

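CatalogueError If the catalogue data cannot be retrieved or parsed. Network errors (HttpError), cache errors (CacheError, e.g. cache_only=True with no cached data available) and ValueError are caught inside the function and re-raised as CatalogueError, with the original exception attached as its cause.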

Example

>>> import readabs as ra
>>> catalogue = ra.abs_catalogue()
>>> print(catalogue.head())
def annualise_percentages(data: ~Datatype, *, periods_per_year: float) -> ~Datatype:
 95def annualise_percentages(data: DataT, *, periods_per_year: float) -> DataT:
 96    """Annualise a growth rate (expressed as a percentage) for a period.
 97
 98    Args:
 99        data : pandas Series or DataFrame - The growth rate (expressed as a
100            percentage) to annualise. Note a growth percentage of 5% is a growth
101            rate of 0.05.
102        periods_per_year : int or float, default 12 - The number of periods in a
103            year. For monthly data, this is 12.
104
105    Returns:
106        pandas Series or DataFrame - The annualised growth expressed as a percentage.
107            For DataFrame input, the annualised growth rate is calculated for each column.
108
109    Raises:
110        InvalidParameterError - If periods_per_year is not positive.
111        InvalidDataError - If data is not a Series or DataFrame.
112
113    """
114    if not isinstance(data, (Series, DataFrame)):
115        raise InvalidDataError("data must be a pandas Series or DataFrame")
116
117    if not isinstance(periods_per_year, (int, float)) or periods_per_year <= 0:
118        raise InvalidParameterError("periods_per_year must be a positive number")
119
120    try:
121        rates = data / 100.0
122        return annualise_rates(rates, periods_per_year=periods_per_year)
123    except Exception as e:
124        raise InvalidDataError(f"Error annualising percentages: {e}") from e

Annualise a growth rate (expressed as a percentage) for a period.

Args: data : pandas Series or DataFrame - The growth rate (expressed as a percentage) to annualise. Note a growth percentage of 5% is a growth rate of 0.05. periods_per_year : int or float - The number of periods in a year. For monthly data, this is 12. This is a required keyword-only argument; it has no default.

Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage. For DataFrame input, the annualised growth rate is calculated for each column.

Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.

def annualise_rates(data: ~Datatype, *, periods_per_year: float) -> ~Datatype:
def annualise_rates(data: DataT, *, periods_per_year: float) -> DataT:
    """Compound a per-period growth rate up to an annual percentage.

    Computes ``((1 + data) ** periods_per_year - 1) * 100``.

    Note: the input is a rate (0.05 means 5%) but the result is a
    percentage, not a rate.

    Args:
        data : pandas Series or DataFrame - The per-period growth rate.
        periods_per_year : int or float - The number of periods in a year
            (12 for monthly data). Required, keyword-only; there is no default.

    Returns:
        pandas Series or DataFrame - The annualised growth expressed as a
            percentage. The arithmetic is element-wise, so a DataFrame is
            annualised column by column.

    Raises:
        InvalidParameterError - If periods_per_year is not an int/float or is
            not greater than zero.
        InvalidDataError - If data is not a Series or DataFrame, or if the
            arithmetic itself raises.

    """
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    # Two guards, one message: wrong type first, then non-positive value.
    if not isinstance(periods_per_year, (int, float)):
        raise InvalidParameterError("periods_per_year must be a positive number")
    if periods_per_year <= 0:
        raise InvalidParameterError("periods_per_year must be a positive number")

    try:
        compounded = (1 + data) ** periods_per_year
        annualised = (compounded - 1) * 100
    except Exception as e:
        raise InvalidDataError(f"Error annualising rates: {e}") from e
    return annualised

Annualise a growth rate for a period.

Note: returns a percentage value (and not a rate)!

Args: data : pandas Series or DataFrame - The growth rate to annualise. Note a growth rate of 0.05 is 5%. periods_per_year : int or float - The number of periods in a year. For monthly data, this is 12. This is a required keyword-only argument; it has no default.

Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage (not a rate). For DataFrame input, the annualised growth rate is calculated for each column.

Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.

def find_abs_id( meta: pandas.DataFrame, search_terms: dict[str, str], **kwargs: Any) -> tuple[str, str, str]:
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate one ABS series in the metadata and return its identifying fields.

    The search itself is delegated to `search_abs_meta()`; this function takes
    the first row of that result and returns its table, series ID and unit.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of ABS metadata
        (as returned by read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Because the phrases are dictionary keys, the same phrase cannot be
        applied to two different columns.
    **kwargs : Any
        Forwarded unchanged to `search_abs_meta()`, apart from
        validate_unique, which is given a default here.
    validate_unique : bool = True
        Forwarded to `search_abs_meta()`, which is expected to raise when the
        search does not yield exactly one match. Defaults to True for safety.

    Returns
    -------
    tuple[str, str, str]
        The table, series_id and units of the first matching metadata row.

    Notes
    -----
    If the search returns no rows (possible when validate_unique is False),
    taking the first row raises an IndexError.

    Metacol
    -------
    Metadata columns can be referenced either by their full textual name or by
    the short names on the metacol object. For example, with metacol imported
    as mc, the `Data Item Description` column is mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```

    """
    # Remove validate_unique from kwargs so it is not passed twice below.
    unique_only = kwargs.pop("validate_unique", True)
    matches = search_abs_meta(meta, search_terms, validate_unique=unique_only, **kwargs)
    first_match = matches.iloc[0]

    return first_match[mc.table], first_match[mc.id], first_match[mc.unit]

Find a unique ABS series identifier in the ABS metadata.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.

Returns

tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Employed total ;  Persons ;": mc.did,
    "Seasonally Adjusted": mc.stype,
    "6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)
@cache
def grab_abs_url( cat: str = '', url: str = '', **kwargs: Unpack[ReadArgs]) -> dict[str, pandas.DataFrame]:
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Collect the data files linked from an ABS web page into DataFrames.

    The page URL is resolved from `url` / `cat`, the data-file links on that
    page are gathered, and the files are read into a dictionary of DataFrames.
    A single-file request is tried first; if it produces anything, that result
    is returned on its own. Otherwise all files are processed according to the
    supplied arguments. The downstream helpers are expected to turn each sheet
    of each Excel file (including Excel files inside ZIP files) into its own
    DataFrame.

    Prefer `read_abs_cat()` or `read_abs_series()` for ordinary use. This
    function is for cases where the data is not in the ABS catalogue, is not a
    timeseries, or must come from a specific ABS landing page.

    Results are memoised by `functools.cache`, so repeat calls with the same
    arguments return the same dictionary object without re-reading anything.

    Parameters
    ----------
    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or a catalogue
        number must be provided; resolution is delegated to `_get_url()`,
        where the URL is expected to take precedence when both are given.

    cat : str = ""
        An ABS Catalogue number, used to look up the URL when no URL is given.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames; empty if the page has no data-file links.

    """
    # resolve the page address, then vet and normalise the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # find the links to the ABS data files on that page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # return an empty Dictionary

    # a single-file request, when satisfied, is the whole answer
    collected: dict[str, DataFrame] = _process_single_files({}, links, args, verbose=verbose)
    if collected:
        return collected

    # otherwise process every file the configuration asks for
    return _process_all_files(collected, links, args)

For a given URL, extract the data from the Excel and ZIP file links found on that page.

The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.

The preferred mechanism for reading ABS data is to use the read_abs_cat() or read_abs_series() functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.

Parameters

url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.

cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.

**kwargs : Unpack[ReadArgs] Accepts the same keyword arguments as read_abs_cat().

Returns

dict[str, DataFrame] A dictionary of DataFrames.

def grab_abs_zip( zip_path: pathlib.Path | str, **kwargs: Unpack[ReadArgs]) -> dict[str, pandas.DataFrame]:
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Read one ABS ZIP file from the local filesystem into DataFrames.

    A convenience wrapper, expected to be used rarely: the file's bytes are
    read from disk and handed to `_process_zip()` with the validated arguments.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # get the valid kwargs

    # accept either a ready-made Path or a plain string
    if isinstance(zip_path, Path):
        archive = zip_path
    else:
        archive = Path(zip_path)

    # start from an empty result dictionary for _process_zip to work with
    return _process_zip({}, archive.read_bytes(), **args)

Grab and process a single ABS ZIP file from a file system location.

This is a convenience function that opens an ABS ZIP file from a local filesystem path. Expect to be used rarely.

Parameters

zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.

**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.

Returns

dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.

metacol = Metacol(did='Data Item Description', stype='Series Type', id='Series ID', start='Series Start', end='Series End', num='No. Obs.', unit='Unit', dtype='Data Type', freq='Freq.', cmonth='Collection Month', table='Table', tdesc='Table Description', cat='Catalogue number')
def monthly_to_qtly(data: ~Datatype, q_ending: str = 'DEC', f: str = 'mean') -> ~Datatype:
def monthly_to_qtly(data: DataT, q_ending: str = "DEC", f: str = "mean") -> DataT:
    """Convert monthly data to quarterly data.

    Each series is aggregated by `_monthly_to_qtly_series()` using the
    aggregation named by `f` (a mean by default; use "sum" for a quarterly
    sum). That helper is expected to drop quarters that do not contain exactly
    three months of data and to remove NA items.

    Args:
        data : pandas Series or DataFrame
            The data to convert to quarterly frequency.
        q_ending : str, default "DEC"
            The month in which the quarter ends, as a three-letter
            abbreviation. Matching is case-insensitive; the value is
            upper-cased before it is used.
        f : str, default "mean"
            The aggregation to apply to the months in each quarter. One of
            "mean", "sum", "min", "max", "std" or "var".

    Returns:
        pandas Series or DataFrame
            The quarterly data. For DataFrame input each column is converted
            separately and the results are assembled into a new frame of the
            same class.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, or if any
            exception is raised during the conversion.
        InvalidParameterError - If q_ending is not a string naming a valid
            month, or f is not a supported aggregation.

    """
    # Validate inputs
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    valid_endings = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    # A non-string would otherwise fail on .upper() with an AttributeError
    # rather than the documented InvalidParameterError.
    if not isinstance(q_ending, str) or q_ending.upper() not in valid_endings:
        raise InvalidParameterError(f"q_ending must be one of {valid_endings}")
    # Validation is case-insensitive, so pass the helper the canonical
    # upper-case form rather than whatever casing the caller used.
    q_ending = q_ending.upper()

    valid_aggregations = ["mean", "sum", "min", "max", "std", "var"]
    if f not in valid_aggregations:
        raise InvalidParameterError(f"f must be one of {valid_aggregations}")

    try:
        if isinstance(data, Series):
            return _monthly_to_qtly_series(data, q_ending, f)
        # data can only be a DataFrame here (validated above): convert each
        # column on its own, then rebuild a frame of the same class.
        return data.__class__(
            {col: _monthly_to_qtly_series(data[col], q_ending, f) for col in data.columns}
        )
    except Exception as e:
        raise InvalidDataError(f"Error converting monthly to quarterly data: {e}") from e

Convert monthly data to quarterly data.

This is done by taking the mean (or sum) of the three months in each quarter. Ignore quarters with less than or more than three months data. Drop NA items. Change f to "sum" for a quarterly sum.

Args: data : pandas Series or DataFrame The data to convert to quarterly frequency. q_ending : str, default "DEC" The month in which the quarter ends. For example, "DEC" for December. f : str, default "mean" The function to apply to the three months in each quarter. Change to "sum" for a quarterly sum. The default is a quarterly mean.

Returns: pandas Series or DataFrame The data with a quarterly PeriodIndex. If a quarter has less than three months data, the quarter is dropped. If the quarter has more than three months data, the quarter is dropped. Any NA data is removed. For DataFrame input, the function is applied to each column.

Raises: InvalidDataError - If data is not a Series or DataFrame. InvalidParameterError - If q_ending or f parameters are invalid.

def percent_change(data: ~Datatype, n_periods: int) -> ~Datatype:
def percent_change(data: DataT, n_periods: int) -> DataT:
    """Return the percentage change of each value against the value n_periods earlier.

    Computes ``(data / data.shift(n_periods) - 1) * 100``, so the data should
    be a contiguous, ordered series. The first n_periods entries have nothing
    to compare against and come out as missing values.

    Args:
        data : pandas Series or DataFrame
            The data to calculate the percentage change for.
        n_periods : int
            The number of periods to calculate the percentage change over.
            Typically 4 for quarterly data, and 12 for monthly data.

    Returns:
        pandas Series or DataFrame - The percentage change in the data over
            n_periods. The arithmetic is element-wise, so a DataFrame is
            handled column by column.

    Raises:
        InvalidParameterError - If n_periods is not a positive integer.
        InvalidDataError - If data is not a Series or DataFrame, or if the
            arithmetic itself raises.

    """
    # n_periods is checked before data, so it wins when both are invalid
    if not isinstance(n_periods, int):
        raise InvalidParameterError("n_periods must be a positive integer")
    if n_periods <= 0:
        raise InvalidParameterError("n_periods must be a positive integer")

    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    try:
        ratio = data / data.shift(n_periods)
        change = (ratio - 1) * 100
    except Exception as e:
        raise InvalidDataError(f"Error calculating percentage change: {e}") from e
    return change

Calculate a percentage change in a contiguous, ordered series over n_periods.

Args: data : pandas Series or DataFrame The data to calculate the percentage change for. n_periods : int The number of periods to calculate the percentage change over. Typically 4 for quarterly data, and 12 for monthly data.

Returns: pandas Series or DataFrame - The percentage change in the data over n_periods. For DataFrame input, the percentage change is calculated for each column.

Raises: InvalidParameterError - If n_periods is not a positive integer. InvalidDataError - If data is not a Series or DataFrame.

def qtly_to_monthly( data: ~Datatype, *, interpolate: bool = True, limit: int | None = 2, dropna: bool = True) -> ~Datatype:
def qtly_to_monthly(
    data: DataT,
    *,
    interpolate: bool = True,
    limit: int | None = 2,  # only used if interpolate is True
    dropna: bool = True,
) -> DataT:
    """Convert data from a quarterly PeriodIndex to a monthly PeriodIndex.

    The quarterly index is converted to end-of-quarter timestamps and
    resampled to month-end frequency, which inserts the missing months as NaN.
    The result is then passed to `_set_axis_monthly_periods()`, presumably to
    restore a monthly PeriodIndex. Optionally the gaps are interpolated and
    any remaining NA rows dropped.

    Args:
        data: Series or DataFrame with a quarterly PeriodIndex. The index must
            be unique and monotonic increasing.
        interpolate: bool, default True
            Whether to interpolate the missing monthly data. Only gaps that
            lie between valid observations are filled.
        limit: int or None, default 2 - The maximum number of consecutive
            missing months to interpolate. None means no limit. 0 means fill
            nothing, so interpolation is skipped.
        dropna: bool, default True - Whether to drop NA data

    Returns:
        pandas Series or DataFrame - The data with a Monthly PeriodIndex.
            If interpolate is True, the missing monthly data is interpolated.
            If dropna is True, any NA data is removed.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, if its index
            is not a unique, monotonic increasing quarterly PeriodIndex, or if
            the conversion itself raises.
        InvalidParameterError - If limit is neither None nor a non-negative
            integer.

    """
    # Validate input data
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    if not isinstance(data.index, PeriodIndex):
        raise InvalidDataError("data index must be a PeriodIndex")

    if not (data.index.freqstr and data.index.freqstr[0] == "Q"):
        raise InvalidDataError("data index must have quarterly frequency")

    if not data.index.is_unique:
        raise InvalidDataError("data index must be unique")

    if not data.index.is_monotonic_increasing:
        raise InvalidDataError("data index must be monotonic increasing")

    if limit is not None and (not isinstance(limit, int) or limit < 0):
        raise InvalidParameterError("limit must be a non-negative integer or None")

    # do the heavy lifting
    try:
        data = (
            data.set_axis(labels=data.index.to_timestamp(how="end"), axis="index", copy=True)
            .resample(rule="ME")  # adds in every missing month
            .first(min_count=1)  # generates nans for new months
            # assumes only one value per quarter (ie. unique index)
            .pipe(_set_axis_monthly_periods)
        )
    except Exception as e:
        raise InvalidDataError(f"Error in quarterly to monthly conversion: {e}") from e

    # limit=0 passes validation above, but pandas rejects any limit below 1
    # with a ValueError. Zero months to fill means no interpolation, so skip it.
    if interpolate and limit != 0:
        data = data.interpolate(limit_area="inside", limit=limit)
    if dropna:
        data = data.dropna()

    return data

Convert data from Quarterly PeriodIndex to a Monthly PeriodIndex.

Args: data: Series or DataFrame with quarterly PeriodIndex. Assumes the index is unique. The data to convert to monthly frequency. interpolate: bool, default True Whether to interpolate the missing monthly data. limit: int, default 2 - The maximum number of consecutive missing months to interpolate. dropna: bool, default True - Whether to drop NA data

Returns: pandas Series or DataFrame - The data with a Monthly PeriodIndex. If interpolate is True, the missing monthly data is interpolated. If dropna is True, any NA data is removed.

Raises: InvalidDataError - If data index is not a quarterly PeriodIndex or has issues. InvalidParameterError - If limit parameter is invalid.

@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return the catalogue of RBA tables as a DataFrame.

    The catalogue is obtained from the RBA website on first use and is
    cached for subsequent calls.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    # All of the work is delegated to the private link-collection helper.
    options = {"cache_only": cache_only, "verbose": verbose}
    return _get_rba_links(**options)

Return a DataFrame of RBA Catalogue numbers.

In the first instance, this is downloaded from the RBA website, and cached for future use.

Parameters

cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.

Returns

DataFrame A DataFrame of RBA Catalogue numbers.

Example

import readabs as ra
catalogue = ra.rba_catalogue()
rba_metacol = _RbaMetacol(title='Title', desc='Description', freq='Frequency', type='Type', unit='Units', src='Source', pub='Publication date', id='Series ID', table='Table', tdesc='Table Description')
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]], **kwargs: Any) -> tuple[dict[str, pandas.Series], pandas.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it is a list
        of descriptions to search for. If a dictionary, each key is a name
        for the series. Each dictionary value is either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which is the data item description to search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if
            present, will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the
            find_abs_id() and search_abs_meta() functions:
            ["validate_unique", "exact_match", "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The "did" key is taken from the wanted list or
        dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for
        read_abs_cat(), namely ["ignore_errors", "get_zip",
        "get_excel_if_no_zip", "get_excel", "single_excel_only",
        "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # --- nothing asked for, nothing to return
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()

    # --- normalise the request and gather the call-wide settings
    requests = _wlist_to_wdict(wanted) if isinstance(wanted, list) else wanted
    source_data = kwargs.get("abs_dict", {})
    source_meta = kwargs.get("abs_meta", pd.DataFrame())
    base_selector = _get_search_terms(kwargs, {})
    base_search_args = _get_search_args(kwargs, {})

    # --- resolve each requested item in turn
    found = {}
    found_meta = pd.DataFrame()
    for name, request in requests.items():
        # every item starts from its own copy of the call-wide settings
        shared = {
            "data_dict": source_data,
            "data_meta": source_meta,
            "item_selector": base_selector.copy(),
            "search_args": base_search_args.copy(),
        }
        if isinstance(request, str):
            series, meta = _get_item_from_str(item=request, **shared)
        elif isinstance(request, dict):
            series, meta = _get_item_from_dict(item_dict=request, **shared, **kwargs)
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string or a dictionary."
            )

        # accumulate the search results
        found[name] = series
        found_meta = pd.concat([found_meta, meta])

    return found, found_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary.
  • if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)
@cache
def read_abs_cat( cat: str, **kwargs: Unpack[ReadArgs]) -> tuple[dict[str, pandas.DataFrame], pandas.DataFrame]:
 27@cache  # minimise slowness for any repeat business
 28def read_abs_cat(
 29    cat: str,
 30    **kwargs: Unpack[ReadArgs],
 31) -> tuple[dict[str, DataFrame], DataFrame]:
 32    """For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.
 33
 34    This function returns the complete ABS Catalogue information as a
 35    python dictionary of pandas DataFrames, as well as the associated metadata
 36    in a separate DataFrame. The function automates the collection of zip and
 37    excel files from the ABS website. If necessary, these files are downloaded,
 38    and saved into a cache directory. The files are then parsed to extract time
 39    series data, and the associated metadata.
 40
 41    By default, the cache directory is `./.readabs_cache/`. You can change the
 42    default directory name by setting the shell environment variable
 43    `READABS_CACHE_DIR` with the name of the preferred directory.
 44
 45    Parameters
 46    ----------
 47    cat : str
 48        The ABS Catalogue Number for the data to be downloaded and made
 49        available by this function. This argument must be specified in the
 50        function call.
 51
 52    **kwargs : Unpack[ReadArgs]
 53        The following parameters may be passed as optional keyword arguments.
 54
 55    url : str = ""
 56        The URL of an ABS landing page. Use this for discontinued series
 57        that are no longer in the ABS Time Series Directory. If provided,
 58        data will be retrieved from this URL instead of looking up the
 59        catalogue number. Example:
 60        `read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")`
 61
 62    keep_non_ts : bool = False
 63        A flag for whether to keep the non-time-series tables
 64        that might form part of an ABS catalogue item. Normally, the
 65        non-time-series information is ignored, and not made available to
 66        the user.
 67
 68    history : str = ""
 69        Provide a month-year string to extract historical ABS data.
 70        For example, you can set history="dec-2023" to the get the ABS data
 71        for a catalogue identifier that was originally published in respect
 72        of Q4 of 2023. Note: not all ABS data sources are structured so that
 73        this technique works in every case; but most are.
 74
 75    verbose : bool = False
 76        Setting this to true may help diagnose why something
 77        might be going wrong with the data retrieval process.
 78
 79    ignore_errors : bool = False
 80        Normally, this function will cease downloading when
 81        an error in encountered. However, sometimes the ABS website has
 82        malformed links, and changing this setting is necessitated. (Note:
 83        if you drop a message to the ABS, they will usually fix broken
 84        links with a business day).
 85
 86    get_zip : bool = True
 87        Download the excel files in .zip files.
 88
 89    get_excel_if_no_zip : bool = True
 90        Only try to download .xlsx files if there are no zip
 91        files available to be downloaded. Only downloading individual excel
 92        files when there are no zip files to download can speed up the
 93        download process.
 94
 95    get_excel : bool = False
 96        The default value means that excel files are not
 97        automatically download. Note: at least one of `get_zip`,
 98        `get_excel_if_no_zip`, or `get_excel` must be true. For most ABS
 99        catalogue items, it is sufficient to just download the one zip
100        file. But note, some catalogue items do not have a zip file.
101        Others have quite a number of zip files.
102
103    single_excel_only : str = ""
104        If this argument is set to a table name (without the
105        .xlsx extension), only that excel file will be downloaded. If
106        set, and only a limited subset of available data is needed,
107        this can speed up download times significantly. Note: overrides
108        `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`.
109
110    single_zip_only : str = ""
111        If this argument is set to a zip file name (without
112        the .zip extension), only that zip file will be downloaded.
113        If set, and only a limited subset of available data is needed,
114        this can speed up download times significantly. Note: overrides
115        `get_zip`, `get_excel_if_no_zip`, and `get_excel`.
116
117    cache_only : bool = False
118        If set to True, this function will only access
119        data that has been previously cached. Normally, the function
120        checks the date of the cache data against the date of the data
121        on the ABS website, before deciding whether the ABS has fresher
122        data that needs to be downloaded to the cache.
123
124    zip_file: str | Path = ""
125        If set to a specific zip file name (with or without the .zip
126        extension), this function will only extract data from that zip file
127        on the local file system. This may be useful for debugging purposes.
128
129    Returns
130    -------
131    tuple[dict[str, DataFrame], DataFrame]
132        The function returns a tuple of two items. The first item is a
133        python dictionary of pandas DataFrames (which is the primary data
134        associated with the ABS catalogue item). The second item is a
135        DataFrame of ABS metadata for the ABS collection.
136
137        Note:
138        You can retrieve non-timeseries data using the grab_abs_url()
139        function. That takes the URL for the ABS landing page for the ABS
140        collection you are interested in. The read_abs_cat function is for
141        ABS catalogue identifiers which are timeseries data, for which the
142        metadata can be extracted.
143
144    Example
145    -------
146
147    ```python
148    import readabs as ra
149    from pandas import DataFrame
150    cat_num = "6202.0"  # The ABS labour force survey
151    data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
152    abs_dict, meta = data
153    ```
154
155    """
156    # --- get the time series data ---
157    if kwargs.get("zip_file"):
158        raw_abs_dict = grab_abs_zip(kwargs["zip_file"], **kwargs)
159    else:
160        raw_abs_dict = grab_abs_url(cat=cat, **kwargs)
161    response = _get_time_series_data(cat, raw_abs_dict, **kwargs)
162
163    if not response:
164        response = {}, DataFrame()
165
166    return response  # dictionary of DataFrames, and a DataFrame of metadata

For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.

This function returns the complete ABS Catalogue information as a python dictionary of pandas DataFrames, as well as the associated metadata in a separate DataFrame. The function automates the collection of zip and excel files from the ABS website. If necessary, these files are downloaded, and saved into a cache directory. The files are then parsed to extract time series data, and the associated metadata.

By default, the cache directory is ./.readabs_cache/. You can change the default directory name by setting the shell environment variable READABS_CACHE_DIR with the name of the preferred directory.

Parameters

cat : str The ABS Catalogue Number for the data to be downloaded and made available by this function. This argument must be specified in the function call.

**kwargs : Unpack[ReadArgs] The following parameters may be passed as optional keyword arguments.

url : str = "" The URL of an ABS landing page. Use this for discontinued series that are no longer in the ABS Time Series Directory. If provided, data will be retrieved from this URL instead of looking up the catalogue number. Example: read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")

keep_non_ts : bool = False A flag for whether to keep the non-time-series tables that might form part of an ABS catalogue item. Normally, the non-time-series information is ignored, and not made available to the user.

history : str = "" Provide a month-year string to extract historical ABS data. For example, you can set history="dec-2023" to get the ABS data for a catalogue identifier that was originally published in respect of Q4 of 2023. Note: not all ABS data sources are structured so that this technique works in every case; but most are.

verbose : bool = False Setting this to true may help diagnose why something might be going wrong with the data retrieval process.

ignore_errors : bool = False Normally, this function will cease downloading when an error is encountered. However, sometimes the ABS website has malformed links, and changing this setting is necessitated. (Note: if you drop a message to the ABS, they will usually fix broken links within a business day).

get_zip : bool = True Download the excel files in .zip files.

get_excel_if_no_zip : bool = True Only try to download .xlsx files if there are no zip files available to be downloaded. Only downloading individual excel files when there are no zip files to download can speed up the download process.

get_excel : bool = False The default value means that excel files are not automatically downloaded. Note: at least one of get_zip, get_excel_if_no_zip, or get_excel must be true. For most ABS catalogue items, it is sufficient to just download the one zip file. But note, some catalogue items do not have a zip file. Others have quite a number of zip files.

single_excel_only : str = "" If this argument is set to a table name (without the .xlsx extension), only that excel file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, get_excel and single_zip_only.

single_zip_only : str = "" If this argument is set to a zip file name (without the .zip extension), only that zip file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, and get_excel.

cache_only : bool = False If set to True, this function will only access data that has been previously cached. Normally, the function checks the date of the cache data against the date of the data on the ABS website, before deciding whether the ABS has fresher data that needs to be downloaded to the cache.

zip_file: str | Path = "" If set to a specific zip file name (with or without the .zip extension), this function will only extract data from that zip file on the local file system. This may be useful for debugging purposes.

Returns

tuple[dict[str, DataFrame], DataFrame] The function returns a tuple of two items. The first item is a python dictionary of pandas DataFrames (which is the primary data associated with the ABS catalogue item). The second item is a DataFrame of ABS metadata for the ABS collection.

Note:
You can retrieve non-timeseries data using the grab_abs_url()
function. That takes the URL for the ABS landing page for the ABS
collection you are interested in. The read_abs_cat function is for
ABS catalogue identifiers which are timeseries data, for which the
metadata can be extracted.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
abs_dict, meta = data
def read_abs_series( cat: str, series_id: str | Sequence[str], **kwargs: Unpack[ReadArgs]) -> tuple[pandas.DataFrame, pandas.DataFrame]:
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Unpack[ReadArgs],
) -> tuple[DataFrame, DataFrame]:
    """Get specific ABS data series by their ABS catalogue and series identifiers.

    Parameters
    ----------
    cat : str
        The ABS catalogue ID.

    series_id : str | Sequence[str]
        An ABS series ID or a sequence of ABS series IDs.

    **kwargs : Unpack[ReadArgs]
        Keyword arguments for the read_abs_series function,
        which are the same as the keyword arguments for the
        read_abs_cat function.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        A tuple of two DataFrames, one for the primary data (one column per
        series ID) and one for the metadata (one row per series ID).

    Raises
    ------
    ValueError
        If a series ID is not found in the catalogue, or if the index
        frequency of a series differs from that of the series already
        collected. With `ignore_errors=True` the offending series is
        skipped instead.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    unemployment_rate = "A84423050A"
    seo = "6202001"  # The ABS table name
    data, meta = ra.read_abs_series(
        cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
    )
    ```

    """
    # check for unexpected keyword arguments/get defaults
    check_kwargs(kwargs, "read_abs_series")
    args = get_args(kwargs, "read_abs_series")

    # read the ABS category data
    cat_data, cat_meta = read_abs_cat(cat, **args)

    # read_abs_cat() is cached, so the metadata it returns is shared with
    # every other caller: work on a copy rather than re-indexing it in place
    cat_meta = cat_meta.copy()

    # drop repeated series_ids in the meta data,
    # make unique series_ids the index
    cat_meta.index = Index(cat_meta[metacol.id])
    cat_meta = cat_meta.groupby(cat_meta.index).first()

    # get the ABS series data
    if isinstance(series_id, str):
        series_id = [series_id]
    return_data, return_meta = DataFrame(), DataFrame()
    for identifier in series_id:
        # confirm that the series ID is in the catalogue
        if identifier not in cat_meta.index:
            if args["verbose"]:
                print(f"Series ID {identifier} not found in ABS catalogue ID {cat}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Series ID {identifier} not found in catalogue {cat}")

        # confirm that the index of the series is compatible
        table = str(cat_meta.loc[identifier, metacol.table])  # str for mypy
        data_series = cat_data[table][identifier]
        if (
            len(return_data) > 0
            and cast("PeriodIndex", return_data.index).freq != cast("PeriodIndex", data_series.index).freq
        ):
            if args["verbose"]:
                print(f"Frequency mismatch for series ID {identifier}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Frequency mismatch for series ID {identifier}")

        # add the series data and meta data to the return values
        if len(return_data) > 0:
            # widen the index first so no periods from the new series are lost
            return_data = return_data.reindex(return_data.index.union(data_series.index))
        return_data[identifier] = data_series
        return_meta = concat([return_meta, cat_meta.loc[identifier]], axis=1)

    # the metadata was accumulated column-wise; transpose to one row per series
    return return_data, return_meta.T

Get specific ABS data series by their ABS catalogue and series identifiers.

Parameters

cat : str The ABS catalogue ID.

series_id : str | Sequence[str] An ABS series ID or a sequence of ABS series IDs.

**kwargs : Any Keyword arguments for the read_abs_series function, which are the same as the keyword arguments for the read_abs_cat function.

Returns

tuple[DataFrame, DataFrame] A tuple of two DataFrames, one for the primary data and one for the metadata.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
unemployment_rate = "A84423050A"
seo = "6202001"  # The ABS table name
data, meta = ra.read_abs_series(
    cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
)
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> pandas.Series:
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series:  # ignore_errors
    """Read the Official Cash Rate (OCR) from the RBA website.

    Return it in a pandas Series, with either a daily or monthly PeriodIndex,
    depending on the value of the monthly parameter. The default is monthly.

    Parameters
    ----------
    monthly : bool = True
        If True, then the data will be returned with a monthly PeriodIndex.
        If False, then the data will be returned with a daily PeriodIndex.
    **kwargs : Any
        Additional keyword arguments, passed on to read_rba_table().
        The only keyword argument that is used is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return an empty Series. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    Series
        The OCR data in a pandas Series, with an index of either daily or monthly Periods.

    Examples
    --------
    ```python
    ocr = read_rba_ocr(monthly=True)
    ```

    """
    ocr_id = "ARBAMPCNCRT"
    ocr_name = "RBA Official Cash Rate"

    # read the OCR table from the RBA website
    rba, _rba_meta = read_rba_table("A2", **kwargs)  # should have a daily PeriodIndex

    # with ignore_errors set, read_rba_table() hands back empty DataFrames
    # on failure; honour the documented contract instead of raising a KeyError
    if kwargs.get("ignore_errors", False) and ocr_id not in rba.columns:
        print(f"Ignoring error: series {ocr_id} not found in RBA table A2")
        return Series(dtype=float, name=ocr_name)

    # select the series, make float and sort, name the series
    ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", ocr_id]
    ocr = ocr_series.astype(float).sort_index()  # pyright: ignore[reportAttributeAccessIssue]
    ocr.name = ocr_name

    # bring up to date: carry the latest rate forward to the current period
    today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr)
    last_period = cast("Period", ocr.index[-1])
    if last_period < today:
        ocr[today] = ocr.iloc[-1]

    if not monthly:
        # fill in missing days and return daily data
        daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
        return ocr.reindex(daily_index).ffill()

    # convert to monthly data, keeping last value if duplicates in month
    # fill in missing months
    ocr.index = PeriodIndex(ocr.index, freq="M")
    ocr = ocr[~ocr.index.duplicated(keep="last")]
    monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
    return ocr.reindex(monthly_index, method="ffill")

Read the Official Cash Rate (OCR) from the RBA website.

Return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.

Parameters

monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.

Returns

Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.

Examples

ocr = read_rba_ocr(monthly=True)
def read_rba_table(table: str, **kwargs: Any) -> tuple[pandas.DataFrame, pandas.DataFrame]:
 89def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:  # ignore_errors
 90    """Read a table from the RBA website and return the actual data and meta data.
 91
 92    Returns the actual data and the meta data in a tuple of two DataFrames.
 93
 94    Parameters
 95    ----------
 96    table : str
 97        The table to read from the RBA website.
 98    **kwargs : Any
 99        Additional keyword arguments.
100        The only keyword argument that is used is ignore_errors.
101    ignore_errors : bool = False
102        If True, then any major errors encountered will be printed and the function
103        will return empty DataFrames. If False, then any major errors encountered
104        will raise an exception.
105
106    Returns
107    -------
108    tuple[DataFrame, DataFrame]
109        The primary data and the meta data in a tuple of two DataFrames.
110
111    Examples
112    --------
113    ```python
114    data, meta = read_rba_table("C1")
115    ```
116
117    """
118    # set-up
119    ignore_errors = kwargs.get("ignore_errors", False)
120    data, meta = DataFrame(), DataFrame()
121
122    # get the Excel file
123    excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs)
124    if excel is None:
125        return data, meta
126
127    # read Excel file into DataFrame
128    try:
129        raw = read_excel(BytesIO(excel), header=None, index_col=None)
130    except Exception as e:
131        if ignore_errors:
132            print(f"Ignoring error: {e}")
133            return data, meta
134        raise
135
136    # extract the meta data
137    meta = raw.iloc[1:11, :].T.copy()
138    meta.columns = Index(meta.iloc[0])
139    renamer = {
140        "Mnemonic": rm.id,
141    }  # historical data is inconsistent
142    meta = meta.rename(columns=renamer)
143    meta = meta.iloc[1:, :]
144    meta.index = Index(meta[rm.id])
145    meta[rm.table] = table
146    meta[rm.tdesc] = raw.iloc[0, 0]
147    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs
148
149    # extract the data
150    data = raw.iloc[10:, :].copy()
151    data.columns = Index(data.iloc[0])
152    data = data.iloc[1:, :]
153    data.index = DatetimeIndex(data.iloc[:, 0])
154    data = data.iloc[:, 1:]
155    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs
156
157    # can we make the index into a PeriodIndex?
158    days = data.index.to_series().diff(1).dropna().dt.days
159    if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS:
160        data.index = PeriodIndex(data.index, freq="M")
161    elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS:
162        data.index = PeriodIndex(data.index, freq="Q")
163    elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS:
164        data.index = PeriodIndex(data.index, freq="Y")
165    else:
166        data.index = PeriodIndex(data.index, freq="D")
167
168    return data, meta

Read a table from the RBA website and return the actual data and meta data.

Returns the actual data and the meta data in a tuple of two DataFrames.

Parameters

table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.

Returns

tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.

Examples

data, meta = read_rba_table("C1")
def recalibrate(data: ~Datatype, units: str) -> tuple[~Datatype, str]:
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Rescale a Series or DataFrame so its values sit in the range -1000 to 1000.

    The units label is rewritten so that it reflects the rescaling applied.

    DataT is a constrained type variable, TypeVar("DataT", Series, DataFrame):
    pass in a Series and a Series comes back; pass in a DataFrame and a
    DataFrame comes back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (same container type, index, and columns/name
        as the input) together with the recalibrated, title-cased units.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")

    # work on a one-dimensional copy of the values; the shape is restored below
    units, restore_name = _prepare_units(units)
    values, units = _recalibrate(data.to_numpy().flatten(), units)

    if restore_name:
        units = f"{restore_name} {units}"
        # strip only the first of these words that appears in the label
        redundant = next((word for word in ("numbers", "number") if word in units), None)
        if redundant is not None:
            units = units.replace(redundant, "").strip()
    units = units.title()

    # rebuild a container of the caller's own class and restore its labels
    result = data.__class__(values.reshape(data.shape))
    result.index = data.index
    n_dims = len(data.shape)
    if n_dims == NDIM_DATAFRAME:
        result.columns = data.columns
    if n_dims == NDIM_SERIES:
        result.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return result, units

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

Change the name of the units to reflect the recalibration.

Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.

Parameters

data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[Series or DataFrame, str] The recalibrated data and the recalibrated units. The data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.

Examples

from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 85def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 86    """Recalibrate a floating point value.
 87
 88    The value will be recalibrated so it is in the range -1000 to 1000.
 89    The units will be changed to reflect the recalibration.
 90
 91    Parameters
 92    ----------
 93    value : float
 94        The value to recalibrate.
 95    units : str
 96        The units of the value. This string should be in the form of
 97        "Number", "Thousands", "Millions", "Billions", etc. The units
 98        should be in title case.
 99
100    Returns
101    -------
102    tuple[float, str]
103        A tuple containing the recalibrated value and the recalibrated units.
104
105    Examples
106    --------
107    ```python
108    from readabs import recalibrate_value
109    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
110    print(recalibrated, units)
111    ```
112
113    """
114    series = Series([value])
115    output, units = recalibrate(series, units)
116    return output.to_numpy()[0], units

Recalibrate a floating point value.

The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.

Parameters

value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.

Examples

from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)
def search_abs_meta( meta: pandas.DataFrame, search_terms: dict[str, str], *, exact_match: bool = False, regex: bool = False, validate_unique: bool = False, **kwargs: Any) -> pandas.DataFrame:
 17def search_abs_meta(
 18    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 19    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 20    *,
 21    exact_match: bool = False,
 22    regex: bool = False,
 23    validate_unique: bool = False,  # useful safety-net if you expect only one match
 24    **kwargs: Any,  # verbose flag
 25) -> DataFrame:
 26    """Extract from the ABS meta data those rows that match the search_terms.
 27
 28    Iteratively search the meta data one search_term at a time.
 29
 30    Parameters
 31    ----------
 32    meta : DataFrame
 33        A pandas DataFrame of metadata from the ABS
 34        (via read_abs_cat() or read_abs_series()).
 35    search_terms : dict[str, str]
 36        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 37        Note: the search terms must be unique, as a dictionary cannot hold the
 38        same search term to be applied to different columns.
 39    exact_match : bool = False
 40        Whether to match using == (exact) or .str.contains() (inexact).
 41    regex : bool = False
 42        Whether to use regular expressions in the search.
 43    validate_unique : bool = False
 44        Raise a ValueError if the search result is not unique.
 45    **kwargs : Any
 46        Additional keyword arguments. The only keyword argument
 47        that is used is verbose.
 48    verbose : bool = False
 49        Print additional information while searching; which can
 50        be useful when diagnosing problems with search terms.
 51
 52    Returns
 53    -------
 54    DataFrame
 55        Returns a pandas DataFrame of matching rows (subseted from meta).
 56        Note, The index for the returned meta data will always comprise ABS
 57        series_ids. Duplicate indexes will be removed from the meta data
 58        (ie. where the same ABS series appears in more than one table, this
 59        function will only report the first match).
 60
 61    Metacol
 62    -------
 63    Because the meta data is a DataFrame, the columns can be referenced by either
 64    their full textual name, or by the short name defined in the metacol object.
 65    For example, if metacol is imported as mc, to refer to the
 66    `Data Item Description` column, the user can refer to it as mc.did.
 67
 68    Example
 69    -------
 70    ```python
 71    from readabs import metacol as mc  # alias for the ABS meta data column names
 72    from readabs import read_abs_cat, search_abs_meta
 73    cat_num = "6202.0"  # The ABS labour force survey
 74    data, meta = read_abs_cat(cat_num)
 75    search_terms = {
 76        "Unemployment rate": mc.did,  # the data item description
 77        "Persons": mc.did,
 78        "Seasonally Adjusted": mc.stype,
 79        "Percent": mc.unit,
 80        "6202001": mc.table,
 81    }
 82    rows = search_abs_meta(meta, search_terms, verbose=True)
 83    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 84    ```
 85
 86    """
 87    # get the verbose-flag from kwargs
 88    verbose = kwargs.get("verbose", False)
 89
 90    # establish the starting point
 91    meta_select = meta.copy()  # preserve the original meta data
 92    if verbose:
 93        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 94        print(f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data.")
 95
 96    # iteratively search
 97    for phrase, column in search_terms.items():
 98        if verbose:
 99            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
100
101        pick_me = (
102            (meta_select[column] == phrase)
103            if (exact_match or column == mc.table)
104            else meta_select[column].str.contains(phrase, regex=regex)
105        )
106        meta_select = meta_select[pick_me]
107        if verbose:
108            print(f"In find_rows() have found {len(meta_select)}")
109
110    # search complete - check results - and return
111    meta_select.index = Index(meta_select[mc.id])
112    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
113
114    if verbose:
115        print(f"Final selection is {len(meta_select)} rows.")
116
117    elif len(meta_select) == 0:
118        print("Nothing selected?")
119
120    if validate_unique and len(meta_select) != 1:
121        raise ValueError("The selected meta data should only contain one row.")
122
123    return meta_select

Extract from the ABS meta data those rows that match the search_terms.

Iteratively search the meta data one search_term at a time.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching; which can be useful when diagnosing problems with search terms.

Returns

DataFrame Returns a pandas DataFrame of matching rows (a subset of meta). Note: the index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (i.e. where the same ABS series appears in more than one table, this function will only report the first match).

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Unemployment rate": mc.did,  # the data item description
    "Persons": mc.did,
    "Seasonally Adjusted": mc.stype,
    "Percent": mc.unit,
    "6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows)  # should have three rows : FT/PT/All Unemployment rates