sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

 1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""
 2
 3from importlib.metadata import PackageNotFoundError, version
 4
 5from .download_cache import (
 6    CacheError,
 7    GetFileKwargs,
 8    HttpError,
 9    ModalityType,
10)
11from .fetch import fetch
12from .fetch_gdp import fetch_gdp
13from .fetch_multi import fetch_multi
14from .fetch_pop import fetch_pop, fetch_state_pop
15from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
16from .flow_metadata import FlowMetaDict, code_list_for_dim, code_lists, data_dimensions, data_flows, frame
17from .measures import measure_names, recalibrate, recalibrate_series
18
19# --- version and author
20try:
21    __version__ = version(__name__)
22except PackageNotFoundError:
23    __version__ = "0.0.0"  # Fallback for development mode
24__author__ = "Bryan Palmer"
25
26# --- establish the package contents
27__all__ = [
28    "CacheError",
29    "FlowMetaDict",
30    "GetFileKwargs",
31    "HttpError",
32    "MatchCriteria",
33    "MatchItem",
34    "MatchType",
35    "ModalityType",
36    "__author__",
37    "__version__",
38    "code_list_for_dim",
39    "code_lists",
40    "data_dimensions",
41    "data_flows",
42    "fetch",
43    "fetch_gdp",
44    "fetch_multi",
45    "fetch_pop",
46    "fetch_selection",
47    "fetch_state_pop",
48    "frame",
49    "make_wanted",
50    "match_item",
51    "measure_names",
52    "recalibrate",
53    "recalibrate_series",
54]
class CacheError(builtins.Exception):
class CacheError(Exception):
    """A problem retrieving data from the cache."""

A problem retrieving data from the cache.

FlowMetaDict = dict[str, dict[str, str]]
class GetFileKwargs(typing.TypedDict):
45class GetFileKwargs(TypedDict):
46    """TypedDict for acqure_url function arguments."""
47
48    verbose: NotRequired[bool]
49    """If True, print information about the data retrieval process."""
50    modality: NotRequired[ModalityType]
51    """Kind of retrieval: "prefer_cache", "prefer_url"."""

TypedDict for acquire_url function arguments.

verbose: NotRequired[bool]

If True, print information about the data retrieval process.

modality: NotRequired[Literal['prefer-cache', 'prefer-url']]

Kind of retrieval: "prefer-cache", "prefer-url".

class HttpError(builtins.Exception):
class HttpError(Exception):
    """A problem retrieving data using HTTP."""

A problem retrieving data using HTTP.

MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
MatchItem = tuple[str, str, MatchType]
class MatchType(enum.Enum):
class MatchType(Enum):
    """Enumeration for match types."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
ModalityType = typing.Literal['prefer-cache', 'prefer-url']
__author__ = 'Bryan Palmer'
__version__ = '0.1.0'
@cache
def code_list_for_dim( flow_id: str, dim_name: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
@cache
def code_list_for_dim(flow_id: str, dim_name: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list for a specific dimension or attribute in a dataflow.

    Args:
        flow_id (str): The ID of the dataflow.
        dim_name (str): The dimension ID to retrieve the code list for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and their metadata.

    Raises:
        ValueError: If the dimension/attribute is not found in the dataflow.

    """
    dimensions = data_dimensions(flow_id, **kwargs)
    if dim_name not in dimensions:
        raise ValueError(f"Dimension '{dim_name}' not found in flow '{flow_id}'")

    # the dimension's "id" attribute names the codelist that enumerates its values
    codelist_id = dimensions[dim_name].get("id", "")
    if not codelist_id:
        raise ValueError(f"No codelist found for dimension/attribute '{dim_name}' in flow '{flow_id}'")

    return code_lists(codelist_id, **kwargs)

Get the code list for a specific dimension or attribute in a dataflow.

Args: flow_id (str): The ID of the dataflow. dim_name (str): The dimension ID to retrieve the code list for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their metadata.

Raises: ValueError: If the dimension/attribute is not found in the dataflow.

@cache
def code_lists( cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
 96@cache
 97def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
 98    """Get the code list metadata from the ABS SDMX API.
 99
100    Args:
101        cl_id (str): The ID of the code list to retrieve.
102        **kwargs: Additional keyword arguments passed to acquire_url().
103
104    Returns:
105        FlowMetaDict: A dictionary containing the codes and
106            their associated key=value pairs. A "name" key should always
107            be present. A "parent" key may also be present.
108
109    Raises:
110        HttpError: If there is an issue with the HTTP request.
111        CacheError: If there is an issue with the cache.
112        ValueError: If no XML root is found in the response.
113
114    Note:
115        You will get a CacheError if the codelist is not found on the ABS SDMX API.
116        (This package tries the website first, then the cache.)
117
118    Guarantees for the inner dictionary:
119        - The inner dictionary will always have a "name" key.
120        - The inner dictionary may have a "parent" key if the code has a parent.
121
122    """
123    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)
124
125    codes: FlowMetaDict = {}
126    for code in tree.findall(".//str:Code", NAME_SPACES):
127        code_id = code.get("id", None)
128        if code_id is None:
129            continue
130        elements: dict[str, str] = {}
131
132        # - get the name
133        name = code.find("com:Name", NAME_SPACES)
134        if name is None or not name.text:
135            # guarantee that we name key and value pair
136            print(f"Warning: Code {code_id} in {cl_id}has no name, skipping.")
137            continue  # skip if no name
138        elements["name"] = name.text
139
140        # - get the parent
141        parent = code.find("str:Parent", NAME_SPACES)
142        parent_id = ""
143        if parent is not None:
144            ref = parent.find("Ref", NAME_SPACES)
145            if ref is not None:
146                parent_id = str(ref.get("id", ""))
147        if parent_id:  # Only add if not empty
148            elements["parent"] = parent_id
149
150        codes[code_id] = elements
151
152    return codes

Get the code list metadata from the ABS SDMX API.

Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)

Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.

@cache
def data_dimensions( flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions and attributes metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
            their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        The dimensions metadata includes a "position" for each dimension.
        The attributes metadata does not have "position" information.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    elements = {}
    for ident in ["Dimension", "Attribute"]:
        for elem in tree.findall(f".//str:{ident}", NAME_SPACES):
            element_id = elem.get("id")
            if element_id is None:
                continue
            contents = {}
            if ident == "Dimension":
                contents["position"] = elem.get("position", "")
            # merge in the codelist reference attributes (id, agencyID, ...) if present
            if (lr := elem.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
                enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
            ) is not None:
                contents = contents | enumer.attrib
            elements[element_id] = contents
    return elements

Get the data dimensions and attributes metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Note: The dimensions metadata includes a "position" for each dimension. The attributes metadata does not have "position" information.

@cache
def data_flows( flow_id: str = 'all', **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
            and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(missing name)"
        attributes["name"] = str(df_name)  # str(...) because pylance complains about it being None
        d_flows[df_id] = attributes
    return d_flows

Get the toplevel metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

def fetch( flow_id: str, dims: dict[str, str] | None = None, parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
            to the data request. Supported parameters include:
            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
            If None, no parameters are applied.
        validate (bool, optional): If True, validate `dims` against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        ValueError: If invalid parameter values are provided.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}")

    # --- validate parameters
    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
    if parameters:
        detail_value = parameters.get("detail")
        if detail_value and detail_value not in valid_detail_values:
            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")

    # --- prepare to get the XML root from the ABS SDMX API
    # prefer fresh data every time
    kwargs.setdefault("modality", "prefer-url")
    key = build_key(flow_id, dims, validate=validate)

    # --- build URL with optional parameters (only the supported ones are forwarded)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        url_params = [
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        ]
        if url_params:
            url += "?" + "&".join(url_params)

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply to the data request. Supported parameters include: - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') - 'endPeriod': End period for data filtering (e.g., '2023-Q4') - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') If None, no parameters are applied. validate (bool, optional): If True, validate dims against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.

Notes: If the dims argument is not valid you should get a CacheError or HttpError. If the flow_id is not valid, you should get a ValueError.

def fetch_gdp( seasonality: Literal['o', 's', 't'] = 'o', price_measure: Literal['cp', 'cvm'] = 'cp', parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            - "o": Original data without seasonal adjustment (default)
            - "s": Seasonally adjusted data
            - "t": Trend data
        price_measure (str): Price measure type:
            - "cp": Current prices (default)
            - "cvm": Chain volume measures
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # Validate inputs (runtime check in addition to the Literal type hints)
    if seasonality not in SEAS_MAP:
        error = f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}"
        raise ValueError(error)
    if price_measure not in PRICE_MAP:
        error = f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}"
        raise ValueError(error)

    # build a selection criteria
    selection_criteria = [
        (SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        (PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        ("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]
    # return the data
    flow_id = "ANA_AGG"
    return fetch_selection(flow_id, selection_criteria, parameters, validate=validate, **kwargs)

Fetch quarterly GDP data in $ from the ABS SDMX API.

Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

Raises: ValueError: If invalid seasonality or price_measure values are provided

def fetch_multi( wanted: pandas.core.frame.DataFrame, parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
116def fetch_multi(
117    wanted: pd.DataFrame,
118    parameters: dict[str, str] | None = None,
119    *,
120    validate: bool = False,
121    **kwargs: Unpack[GetFileKwargs],
122) -> tuple[pd.DataFrame, pd.DataFrame]:
123    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
124
125    Args:
126        wanted: A DataFrame with rows for each desired data set (of one or more series).
127                Each row should contain the necessary identifiers to fetch the dataset.
128                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
129                The 'flow_id' column is mandatory, and the rest are optional.
130                Note: the DataFrame index is not used in the fetching process.
131        parameters: A dictionary of additional parameters to pass to the fetch function.
132        validate: If True, the function will validate dimensions and values against
133                  the ABS SDMX API codelists. Defaults to False.
134        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
135
136    Returns:
137        A tuple containing two DataFrames:
138        - The first DataFrame contains the fetched data.
139        - The second DataFrame contains metadata about the fetched datasets.
140
141    Raises:
142        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.
143
144    Note:
145        CacheError and HttpError are raised by the fetch function.
146        These will be caught and reported to standard output.
147
148    Note:
149        The function validates that all datasets have compatible index types.
150        A ValueError will be raised if incompatible index types are detected
151        (e.g., mixing quarterly and monthly data).
152
153    """
154    # --- report the parameters used if requested
155    verbose = kwargs.get("verbose", False)
156    if verbose:
157        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")
158
159    # --- quick sanity checks
160    if wanted.empty:
161        print("wanted DataFrame is empty, returning empty DataFrames.")
162        return pd.DataFrame(), pd.DataFrame()
163    if "flow_id" not in wanted.columns:
164        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")
165
166    # --- do the work
167    return _extract(wanted, parameters, validate=validate, **kwargs)

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.

Raises: ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.

Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).

def fetch_pop( source: Literal['erp', 'na'] = 'erp', parameters: dict[str, str] | None = None, *, projection: bool = False, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
135def fetch_pop(
136    source: Literal["erp", "na"] = "erp",
137    parameters: dict[str, str] | None = None,
138    *,
139    projection: bool = False,
140    validate: bool = False,
141    **kwargs: Unpack[GetFileKwargs],
142) -> tuple[pd.DataFrame, pd.DataFrame]:
143    """Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.
144
145    Args:
146        source (str): Source of the population data:
147            - "erp": ABS published Estimated Resident Population (default)
148            - "na": Implied population from the ABS National Accounts
149        parameters (dict[str, str] | None): Additional parameters for the API request,
150            such as 'startPeriod'.
151        projection (bool, optional): If True, and data is available for the most recent year,
152            make a projection forward to the current quarter, based on growth over the last 4 quarters.
153        validate (bool, optional): If True, validate the selection against the flow's
154            required dimensions when generating the URL key. Defaults to False.
155        **kwargs: Additional arguments passed to the fetch_selection() function
156
157    Returns:
158        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
159
160    """
161    # report the parameters used if requested
162    verbose = kwargs.get("verbose", False)
163    if verbose:
164        print(f"fetch_pop(): {source=} {validate=} {kwargs=}")
165
166    # build a selection criteria and fetch the relevant data
167    match source:
168        case "erp":
169            data, meta = _erp_population("Australia", parameters, validate=validate, **kwargs)
170        case "na":
171            data, meta = _na_population(parameters, validate=validate, **kwargs)
172        case _:
173            raise ValueError(f"Invalid source '{source}'. Must be one of: ['erp', 'na']")
174
175    # if requested, make a projection of the data
176    if projection:
177        data = _make_projection(data)
178
179    return data, meta

Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.

Args: source (str): Source of the population data: - "erp": ABS published Estimated Resident Population (default) - "na": Implied population from the ABS National Accounts parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, and data is available for the most recent year, make a projection forward to the current quarter, based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the match criteria into a one-row `wanted` frame, then fetch it
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

def fetch_state_pop( state: str, parameters: dict[str, str] | None = None, *, projection: bool = False, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_state_pop(
    state: str,
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch state-level ERP population data from the ABS SDMX API.

    Args:
        state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.).
            [Note: Use "" or "all" for the population estimates for all states.]
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        projection (bool, optional): If True, make a projection forward to the current quarter
            based on growth over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch_state_pop(): {state=} {validate=} {kwargs=}")

    # an empty selection name means "all states"
    if state.lower() in ("", "all"):
        full_state_name: str = ""
    else:
        full_state_name = _state_name_from_abbrev(state)

    data, meta = _erp_population(full_state_name, parameters, validate=validate, **kwargs)

    if projection:
        data = _make_projection(data)

    return data, meta

Fetch state-level ERP population data from the ABS SDMX API.

Args: state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.). [Note: Use "" or "all" for the population estimates for all states.] parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, make a projection forward to the current quarter based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

def frame(f: dict[str, dict[str, str]]) -> pandas.core.frame.DataFrame:
206def frame(f: FlowMetaDict) -> pd.DataFrame:
207    """Convert a FlowMetaDict to a pandas DataFrame.
208
209    Args:
210        f (FlowMetaDict): The flow metadata dictionary to convert.
211
212    Returns:
213        pd.DataFrame: A DataFrame representation of the flow metadata.
214
215    Note: This is a utility function to help visualize the flow metadata.
216
217    """
218    return pd.DataFrame(f).T

Convert a FlowMetaDict to a pandas DataFrame.

Args: f (FlowMetaDict): The flow metadata dictionary to convert.

Returns: pd.DataFrame: A DataFrame representation of the flow metadata.

Note: This is a utility function to help visualize the flow metadata.

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
172def make_wanted(
173    flow_id: str,
174    criteria: MatchCriteria,
175) -> pd.DataFrame:
176    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.
177
178    Args:
179        flow_id (str): The ID of the data flow to select items from.
180        criteria (MatchCriteria): A sequence of tuples containing the pattern,
181            dimension name, and match-type (exact, partial, or regex).
182
183    Returns:
184        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
185            into the call of the function fetch_multi().
186
187    Raises:
188        ValueError: If the flow_id is not valid or if no items match the criteria.
189
190    Notes:
191    -   Should build a one line DataFrame. This Frame may select multiple data series,
192        when passed to fetch_multi. It also can be concatenated with other DataFrames
193        to build a larger selection.
194    -   If two match elements refer to the same dimension, only the `intersection` of the
195        matches will be returned.
196
197    """
198    dimensions = _validate_flow_and_dimensions(flow_id)
199    result_dict = _process_match_criteria(criteria, flow_id, dimensions)
200
201    # Add flow_id and return as DataFrame
202    result_dict["flow_id"] = flow_id
203    return pd.DataFrame([result_dict]).astype(str)

Build a wanted DataFrame for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no items match the criteria.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
150def match_item(
151    pattern: str,
152    dimension: str,
153    match_type: MatchType = MatchType.PARTIAL,
154) -> MatchItem:
155    """Create a new MatchItem for use in select_items() and fetch_selection().
156
157    Args:
158        pattern (str): The pattern to match.
159        dimension (str): The dimension to match against.
160        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
161
162    Returns:
163        MatchItem: A tuple representing the match item.
164
165    Note:
166        This function is of little value. It is easier to create the tuple directly.
167
168    """
169    return (pattern, dimension, match_type)

Create a new MatchItem for use in select_items() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple representing the match item.

Note: This function is of little value. It is easier to create the tuple directly.

def measure_names(meta: pandas.core.frame.DataFrame) -> pandas.core.series.Series:
111def measure_names(meta: pd.DataFrame) -> pd.Series:
112    """Get the measure names for each row in the metadata DataFrame - (for y-axis labels).
113
114    Args:
115        meta (pd.DataFrame): The metadata DataFrame.
116
117    Returns:
118        pd.Series: A Series containing the measure names, indexed by the row labels.
119
120    """
121    series = pd.Series(dtype=str)
122    duplicate_number: str = " Number"  # the space before 'Number' is important
123    for label, row in meta.iterrows():
124        name: str = str(label)  # worst case scenario
125        if "UNIT_MEASURE" in row:
126            name = str(row["UNIT_MEASURE"])  # a better base case
127        if row.get("UNIT_MULT"):
128            try:
129                index = int(row["UNIT_MULT"])
130                if index in INDICIES and index > 0:
131                    name = f"{INDICIES[index]} {name}"  # best case
132            except ValueError:
133                pass
134        name = name.removesuffix(duplicate_number)  # Just in case it is 'Number Number'
135        series[label] = name
136    return series

Get the measure names for each row in the metadata DataFrame - (for y-axis labels).

Args: meta (pd.DataFrame): The metadata DataFrame.

Returns: pd.Series: A Series containing the measure names, indexed by the row labels.

def recalibrate( data: pandas.core.frame.DataFrame, units: pandas.core.series.Series, *, as_a_whole: bool = False) -> tuple[pandas.core.frame.DataFrame, pandas.core.series.Series]:
139def recalibrate(
140    data: pd.DataFrame, units: pd.Series, *, as_a_whole: bool = False
141) -> tuple[pd.DataFrame, pd.Series]:
142    """Recalibrate the data so that its maximum value is between 1 and 1000.
143
144    Args:
145        data (pd.DataFrame): The data to recalibrate.
146        units (pd.Series): The units of measure (as returned by measure_names()).
147        as_a_whole (bool): If True, recalibrate the data as a whole, otherwise
148            recalibrate each column separately.
149
150    Returns:
151        tuple[pd.DataFrame, pd.Series]: The recalibrated data and recalibrated units.
152
153    Why recalibrate?
154        So that the chart is easier to read and interpret, in units that are more familiar.
155
156    """
157    # --- data/argument validation
158    if units.empty:
159        raise ValueError("The units Series is empty.")
160    if len(units) != len(data.columns):
161        raise ValueError("The units Series must have the same length as the data DataFrame's columns.")
162    if as_a_whole and not _is_all_the_same(units):
163        raise ValueError("Cannot recalibrate as a whole when there are multiple units of measure.")
164    if not all(x in data.columns for x in units.index):
165        raise ValueError("The units Series must all be indexed by the data DataFrame's columns.")
166
167    if as_a_whole:
168        str_label: str = units.iloc[0]
169        datax, str_label = _refactor(data, str_label)
170        new_units = pd.Series([str_label] * len(data.columns), index=data.columns)
171        return pd.DataFrame(datax), new_units
172
173    for column in data.columns:
174        str_label = units[column]
175        series = data[column]
176        seriesx, str_label = _refactor(series, str_label)
177        data[column] = cast("pd.Series", seriesx)
178        units[column] = str_label
179
180    return data, units

Recalibrate the data so that its maximum value is between 1 and 1000.

Args: data (pd.DataFrame): The data to recalibrate. units (pd.Series): The units of measure (as returned by measure_names()). as_a_whole (bool): If True, recalibrate the data as a whole, otherwise recalibrate each column separately.

Returns: tuple[pd.DataFrame, pd.Series]: The recalibrated data and recalibrated units.

Why recalibrate? So that the chart is easier to read and interpret, in units that are more familiar.

def recalibrate_series( series: pandas.core.series.Series, label: str) -> tuple[pandas.core.series.Series, str]:
183def recalibrate_series(series: pd.Series, label: str) -> tuple[pd.Series, str]:
184    """Recalibrate a Series with a label.
185
186    Args:
187        series (pd.Series): The Series to recalibrate.
188        label (str): The label for the Series.
189
190    Returns:
191        tuple[pd.Series, str]: The recalibrated Series and label.
192
193    """
194    seriesx, label = _refactor(series, label)
195    return cast("pd.Series", seriesx), label

Recalibrate a Series with a label.

Args: series (pd.Series): The Series to recalibrate. label (str): The label for the Series.

Returns: tuple[pd.Series, str]: The recalibrated Series and label.