sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_gdp import fetch_gdp
from .fetch_multi import fetch_multi
from .fetch_pop import fetch_pop, fetch_state_pop
from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
from .flow_metadata import (
    FlowMetaDict,
    code_list_for,
    code_lists,
    data_flows,
    data_structures,
    frame,
    structure_from_flow_id,
    structure_ident,
)
from .measures import measure_names, recalibrate, recalibrate_series

# --- version and author
try:
    # Resolve the version from the installed distribution's metadata.
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode (package not installed)
__author__ = "Bryan Palmer"

# --- establish the package contents (the explicit public API)
__all__ = [
    "CacheError",
    "FlowMetaDict",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_list_for",
    "code_lists",
    "data_flows",
    "data_structures",
    "fetch",
    "fetch_gdp",
    "fetch_multi",
    "fetch_pop",
    "fetch_selection",
    "fetch_state_pop",
    "frame",
    "make_wanted",
    "match_item",
    "measure_names",
    "recalibrate",
    "recalibrate_series",
    "structure_from_flow_id",
    "structure_ident",
]
class CacheError(Exception):
    """A problem retrieving data from the cache."""

A problem retrieving data from the cache.

FlowMetaDict = dict[str, dict[str, str]]
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url() function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer-cache", "prefer-url"."""

TypedDict for acquire_url function arguments.

verbose: NotRequired[bool]

If True, print information about the data retrieval process.

modality: NotRequired[Literal['prefer-cache', 'prefer-url']]

Kind of retrieval: "prefer-cache", "prefer-url".

class HttpError(Exception):
    """A problem retrieving data using HTTP."""

A problem retrieving data using HTTP.

MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
MatchItem = tuple[str, str, MatchType]
class MatchType(Enum):
    """Enumeration for match types (used in selection/match criteria tuples)."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
ModalityType = typing.Literal['prefer-cache', 'prefer-url']
__author__ = 'Bryan Palmer'
__version__ = '0.1.0'
@cache
def code_list_for(struct_id: str, dim_name: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list for a specific dimension or attribute in a data structure.

    Args:
        struct_id (str): The data structure ID.
        dim_name (str): The dimension or attribute ID to retrieve the code list for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and their metadata.

    Raises:
        ValueError: If the structure is unknown, the dimension/attribute is not in
            the structure, or no codelist is associated with it.

    """
    struct_meta = data_structures(struct_id, **kwargs)

    # --- guard clauses: both the structure and the dimension/attribute must exist
    if not struct_meta:
        raise ValueError(f"No structure found for structure ID '{struct_id}'")
    if dim_name not in struct_meta:
        raise ValueError(f"Dimension/Attribute '{dim_name}' not found in structure: '{struct_id}'")

    # --- the dimension/attribute must reference a codelist
    if not (cl_ident := struct_meta[dim_name].get(CODE_LIST_ID, "")):
        raise ValueError(
            f"No codelist found for dimension/attribute '{dim_name}' in structure ID '{struct_id}'"
        )

    return code_lists(cl_ident, **kwargs)

Get the code list for a specific dimension or attribute in a data structure.

Args: struct_id (str): The data structure ID. dim_name (str): The dimension or attribute ID to retrieve the code list for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their metadata.

Raises: ValueError: If the dimension/attribute is not found in the structure.

@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
            their associated key=value pairs. A "name" key should always
            be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    Guarantees for the inner dictionary:
        - The inner dictionary will always have a "name" key.
        - The inner dictionary may have a "parent" key if the code has a parent.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue
        elements: dict[str, str] = {}

        # - get the name (skipping nameless codes guarantees the "name" key)
        name = code.find("com:Name", NAME_SPACES)
        if name is None or not name.text:
            # bug fix: the warning previously read "{cl_id}has" (missing space)
            print(f"Warning: Code {code_id} in {cl_id} has no name, skipping.")
            continue  # skip if no name
        elements["name"] = name.text

        # - get the parent (optional; only recorded when non-empty)
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        if parent_id:  # Only add if not empty
            elements["parent"] = parent_id

        codes[code_id] = elements

    return codes

Get the code list metadata from the ABS SDMX API.

Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)

Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.

@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the top-level dataflow metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary keyed by dataflow ID, whose
            values are key=value metadata pairs. Importantly, this includes
            the data-structure identifier used to retrieve the dimensions
            and attributes metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Guarantees:
        - data_flows(): the returned inner dictionary from data_flows() will always contain the keys
          "flow_name" and "data_structure_id" for each dataflow. Any XML without these
          keys from the ABS is ignored.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    flows: FlowMetaDict = {}
    for flow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attrs: dict[str, str] = flow.attrib.copy()
        if "id" not in attrs:
            continue  # an unidentified dataflow cannot be indexed
        flow_key = attrs.pop("id")

        # guarantee a flow name (placeholder when the XML lacks one)
        name_elem = flow.find("com:Name", NAME_SPACES)
        attrs[FLOW_NAME] = str(name_elem.text if name_elem is not None else "(missing name)")

        # guarantee a data-structure identifier; skip flows without one
        ds_elem = flow.find("str:Structure/Ref", NAME_SPACES)
        if ds_elem is None:
            continue
        if not (ds_ident := ds_elem.get("id", "")):
            continue
        attrs[DATA_STRUCT_ID] = ds_ident

        flows[flow_key] = attrs
    return flows

Get the top-level metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs. Importantly, it includes the DATASTRUCTURE identifier, which is used to retrieve the dimensions and attributes metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Guarantees: - data_flows(): the returned inner dictionary from data_flows() will always contain the keys "flow_name" and "data_structure_id" for each dataflow. Any XML without these keys from the ABS is ignored.

@cache
def data_structures(struct_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data structure for a specific structure ID from the ABS SDMX API.

    Args:
        struct_id (str): The ID of the data structure to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
            their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        The dimensions metadata includes a "position" for each dimension.
        The attributes metadata does not have "position" information.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{struct_id}", **kwargs)

    found = {}
    for kind in ("Dimension", "Attribute"):
        for node in tree.findall(f".//str:{kind}", NAME_SPACES):
            node_id = node.get("id")
            if node_id is None:
                continue
            # only dimensions carry a "position" attribute
            meta = {POSITION: node.get(POSITION, "")} if kind == "Dimension" else {}
            local_rep = node.find("str:LocalRepresentation", NAME_SPACES)
            if local_rep is not None:
                enum_ref = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
                if enum_ref is not None:
                    meta = meta | enum_ref.attrib
            # --- check we have a code list, and re-key it under a clearer name
            cl_ident = meta.pop("id", "")
            if not cl_ident or meta.get("package") != "codelist":
                continue
            meta[CODE_LIST_ID] = cl_ident
            found[node_id] = meta
    return found

Get the data structure for a specific structure ID from the ABS SDMX API.

Args: struct_id (str): The ID of the data structure to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Note: The dimensions metadata includes a "position" for each dimension. The attributes metadata does not have "position" information.

def fetch( flow_id: str, selection: dict[str, str] | None = None, parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
202def fetch(
203    flow_id: str,
204    selection: dict[str, str] | None = None,
205    parameters: dict[str, str] | None = None,
206    *,
207    validate: bool = False,
208    **kwargs: Unpack[GetFileKwargs],
209) -> tuple[pd.DataFrame, pd.DataFrame]:
210    """Fetch data from the ABS SDMX API.
211
212    Args:
213        flow_id (str): The ID of the data flow from which to retrieve data items.
214        selection (dict[str, str], optional): A dictionary of dimension=value pairs
215            to select the data items. If None, the ABS fetch request will be for all
216            data items, which can be slow.
217        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
218            to the data request. Supported parameters include:
219            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
220            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
221            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
222            If None, no parameters are applied.
223        validate (bool, optional): If True, validate  against the flow's
224            required dimensions when generating the URL key. Defaults to False.
225        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
226
227    Returns: a tuple of two DataFrames:
228        - The first DataFrame contains the fetched data.
229        - The second DataFrame contains the metadata.
230
231    Raises:
232        HttpError: If there is an issue with the HTTP request.
233        CacheError: If there is an issue with the cache.
234        ValueError: If no XML root is found in the response.
235        ValueError: If invalid parameter values are provided.
236
237    Notes:
238        If the `dims` argument is not valid you should get a CacheError or HttpError.
239        If the `flow_id` is not valid, you should get a ValueError.
240
241    """
242    # --- report the parameters used if requested
243    verbose = kwargs.get("verbose", False)
244    if verbose:
245        print(f"fetch(): {flow_id=} {selection=} {parameters=} {validate=} {kwargs=}")
246
247    # --- validate parameters
248    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
249    if parameters:
250        detail_value = parameters.get("detail")
251        if detail_value and detail_value not in valid_detail_values:
252            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")
253
254    # --- prepare to get the XML root from the ABS SDMX API
255    # prefer fresh data every time
256    kwargs["modality"] = kwargs.get("modality", "prefer-url")
257    key = build_key(flow_id, selection, validate=validate)
258
259    # --- build URL with optional parameters
260    url = f"{URL_STEM}/data/{flow_id}/{key}"
261    if parameters:
262        url_params = []
263        if "startPeriod" in parameters:
264            url_params.append(f"startPeriod={parameters['startPeriod']}")
265        if "endPeriod" in parameters:
266            url_params.append(f"endPeriod={parameters['endPeriod']}")
267        if "detail" in parameters:
268            url_params.append(f"detail={parameters['detail']}")
269        if url_params:
270            url += "?" + "&".join(url_params)
271
272    xml_root = acquire_xml(url, **kwargs)
273    return _extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. selection (dict[str, str], optional): A dictionary of dimension=value pairs to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply to the data request. Supported parameters include: - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') - 'endPeriod': End period for data filtering (e.g., '2023-Q4') - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') If None, no parameters are applied. validate (bool, optional): If True, validate against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.

Notes: If the dims argument is not valid you should get a CacheError or HttpError. If the flow_id is not valid, you should get a ValueError.

def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            - "o": Original data without seasonal adjustment (default)
            - "s": Seasonally adjusted data
            - "t": Trend data
        price_measure (str): Price measure type:
            - "cp": Current prices (default)
            - "cvm": Chain volume measures
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # echo the call arguments when requested
    if kwargs.get("verbose", False):
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # validate the inputs against the known mappings
    if seasonality not in SEAS_MAP:
        raise ValueError(f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}")
    if price_measure not in PRICE_MAP:
        raise ValueError(f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}")

    # assemble the selection criteria and delegate the fetch
    criteria = [
        (SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        (PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        ("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]
    return fetch_selection("ANA_AGG", criteria, parameters, validate=validate, **kwargs)

Fetch quarterly GDP data in $ from the ABS SDMX API.

Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

Raises: ValueError: If invalid seasonality or price_measure values are provided

def fetch_multi( wanted: pandas.core.frame.DataFrame, parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
118def fetch_multi(
119    wanted: pd.DataFrame,
120    parameters: dict[str, str] | None = None,
121    *,
122    validate: bool = False,
123    **kwargs: Unpack[GetFileKwargs],
124) -> tuple[pd.DataFrame, pd.DataFrame]:
125    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
126
127    Args:
128        wanted: A DataFrame with rows for each desired data set (of one or more series).
129                Each row should contain the necessary identifiers to fetch the dataset.
130                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
131                The 'flow_id' column is mandatory, and the rest are optional.
132                Note: the DataFrame index is not used in the fetching process.
133        parameters: A dictionary of additional parameters to pass to the fetch function.
134        validate: If True, the function will validate dimensions and values against
135                  the ABS SDMX API codelists. Defaults to False.
136        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
137
138    Returns:
139        A tuple containing two DataFrames:
140        - The first DataFrame contains the fetched data.
141        - The second DataFrame contains metadata about the fetched datasets.
142
143    Raises:
144        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.
145
146    Note:
147        CacheError and HttpError are raised by the fetch function.
148        These will be caught and reported to standard output.
149
150    Note:
151        The function validates that all datasets have compatible index types.
152        A ValueError will be raised if incompatible index types are detected
153        (e.g., mixing quarterly and monthly data).
154
155    """
156    # --- report the parameters used if requested
157    verbose = kwargs.get("verbose", False)
158    if verbose:
159        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")
160
161    # --- quick sanity checks
162    if wanted.empty:
163        print("wanted DataFrame is empty, returning empty DataFrames.")
164        return pd.DataFrame(), pd.DataFrame()
165    if "flow_id" not in wanted.columns:
166        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")
167
168    # --- do the work
169    return _extract(wanted, parameters, validate=validate, **kwargs)

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.

Raises: ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.

Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).

def fetch_pop( source: Literal['erp', 'na'] = 'erp', parameters: dict[str, str] | None = None, *, projection: bool = False, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
136def fetch_pop(
137    source: Literal["erp", "na"] = "erp",
138    parameters: dict[str, str] | None = None,
139    *,
140    projection: bool = False,
141    validate: bool = False,
142    **kwargs: Unpack[GetFileKwargs],
143) -> tuple[pd.DataFrame, pd.DataFrame]:
144    """Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.
145
146    Args:
147        source (str): Source of the population data:
148            - "erp": ABS published Estimated Resident Population (default)
149            - "na": Implied population from the ABS National Accounts
150        parameters (dict[str, str] | None): Additional parameters for the API request,
151            such as 'startPeriod'.
152        projection (bool, optional): If True, and data is available for the most recent year,
153            make a projection forward to the current quarter, based on growth over the last 4 quarters.
154        validate (bool, optional): If True, validate the selection against the flow's
155            required dimensions when generating the URL key. Defaults to False.
156        **kwargs: Additional arguments passed to the fetch_selection() function
157
158    Returns:
159        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
160
161    """
162    # report the parameters used if requested
163    verbose = kwargs.get("verbose", False)
164    if verbose:
165        print(f"fetch_pop(): {source=} {validate=} {kwargs=}")
166
167    # build a selection criteria and fetch the relevant data
168    match source:
169        case "erp":
170            data, meta = _erp_population("Australia", parameters, validate=validate, **kwargs)
171        case "na":
172            data, meta = _na_population(parameters, validate=validate, **kwargs)
173        case _:
174            raise ValueError(f"Invalid source '{source}'. Must be one of: ['erp', 'na']")
175
176    # if requested, make a projection of the data
177    if projection:
178        data = _make_projection(data)
179
180    return data, meta

Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.

Args: source (str): Source of the population data: - "erp": ABS published Estimated Resident Population (default) - "na": Implied population from the ABS National Accounts parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, and data is available for the most recent year, make a projection forward to the current quarter, based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # echo the call arguments when requested
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into a "wanted" DataFrame, then fetch it
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

def fetch_state_pop( state: str, parameters: dict[str, str] | None = None, *, projection: bool = False, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
183def fetch_state_pop(
184    state: str,
185    parameters: dict[str, str] | None = None,
186    *,
187    projection: bool = False,
188    validate: bool = False,
189    **kwargs: Unpack[GetFileKwargs],
190) -> tuple[pd.DataFrame, pd.DataFrame]:
191    """Fetch state-level ERP population data from the ABS SDMX API.
192
193    Args:
194        state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.).
195            [Note: Use "" or "all" for the population estimates for all states.]
196        parameters (dict[str, str] | None): Additional parameters for the API request,
197            such as 'startPeriod'.
198        projection (bool, optional): If True, make a projection forward to the current quarter
199            based on growth over the last 4 quarters.
200        validate (bool, optional): If True, validate the selection against the flow's
201            required dimensions when generating the URL key. Defaults to False.
202        **kwargs: Additional arguments passed to the fetch_selection() function
203
204    Returns:
205        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
206
207    """
208    # report the parameters used if requested
209    verbose = kwargs.get("verbose", False)
210    if verbose:
211        print(f"fetch_state_pop(): {state=} {validate=} {kwargs=}")
212
213    if state.lower() in ("", "all"):
214        full_state_name: str = ""
215    else:
216        full_state_name = _state_name_from_abbrev(state)
217
218    data, meta = _erp_population(full_state_name, parameters, validate=validate, **kwargs)
219
220    if projection:
221        data = _make_projection(data)
222
223    return data, meta

Fetch state-level ERP population data from the ABS SDMX API.

Args: state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.). [Note: Use "" or "all" for the population estimates for all states.] parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, make a projection forward to the current quarter based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

def frame(f: dict[str, dict[str, str]]) -> pandas.core.frame.DataFrame:
270def frame(f: FlowMetaDict) -> pd.DataFrame:
271    """Convert a FlowMetaDict to a pandas DataFrame.
272
273    Args:
274        f (FlowMetaDict): The flow metadata dictionary to convert.
275
276    Returns:
277        pd.DataFrame: A DataFrame representation of the flow metadata.
278
279    Note: This is a utility function to help visualize the flow metadata.
280
281    """
282    return pd.DataFrame(f).T

Convert a FlowMetaDict to a pandas DataFrame.

Args: f (FlowMetaDict): The flow metadata dictionary to convert.

Returns: pd.DataFrame: A DataFrame representation of the flow metadata.

Note: This is a utility function to help visualize the flow metadata.

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
135def make_wanted(
136    flow_id: str,
137    criteria: MatchCriteria,
138) -> pd.DataFrame:
139    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.
140
141    Args:
142        flow_id (str): The ID of the data flow to select items from.
143        criteria (MatchCriteria): A sequence of tuples containing the pattern,
144            dimension name, and match-type (exact, partial, or regex).
145
146    Returns:
147        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
148            into the call of the function fetch_multi().
149
150    Raises:
151        ValueError: If the flow_id is not valid or if no items match the criteria.
152
153    Notes:
154    -   Should build a one line DataFrame. This Frame may select multiple data series,
155        when passed to fetch_multi. It also can be concatenated with other DataFrames
156        to build a larger selection.
157    -   If two match elements refer to the same dimension, only the `intersection` of the
158        matches will be returned.
159
160    """
161    structure = structure_from_flow_id(flow_id)
162    result_dict = _process_match_criteria(criteria, structure)
163
164    # Add flow_id and return as DataFrame
165    result_dict["flow_id"] = flow_id
166    return pd.DataFrame([result_dict]).astype(str)

Build a wanted DataFrame for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no items match the criteria.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
113def match_item(
114    pattern: str,
115    dimension: str,
116    match_type: MatchType = MatchType.PARTIAL,
117) -> MatchItem:
118    """Create a new MatchItem for use in select_items() and fetch_selection().
119
120    Args:
121        pattern (str): The pattern to match.
122        dimension (str): The dimension to match against.
123        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
124
125    Returns:
126        MatchItem: A tuple representing the match element.
127
128    Note:
129        This function is of little value. It is much easier to create the tuple directly.
130
131    """
132    return (pattern, dimension, match_type)

Create a new MatchItem for use in select_items() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple representing the match element.

Note: This function is of little value. It is much easier to create the tuple directly.

def measure_names(meta: pandas.core.frame.DataFrame) -> pandas.core.series.Series:
111def measure_names(meta: pd.DataFrame) -> pd.Series:
112    """Get the measure names for each row in the metadata DataFrame - (for y-axis labels).
113
114    Args:
115        meta (pd.DataFrame): The metadata DataFrame.
116
117    Returns:
118        pd.Series: A Series containing the measure names, indexed by the row labels.
119
120    """
121    series = pd.Series(dtype=str)
122    duplicate_number: str = " Number"  # the space before 'Number' is important
123    for label, row in meta.iterrows():
124        name: str = str(label)  # worst case scenario
125        if "UNIT_MEASURE" in row:
126            name = str(row["UNIT_MEASURE"])  # a better base case
127        if row.get("UNIT_MULT"):
128            try:
129                index = int(row["UNIT_MULT"])
130                if index in INDICIES and index > 0:
131                    name = f"{INDICIES[index]} {name}"  # best case
132            except ValueError:
133                pass
134    name = name.removesuffix(duplicate_number)  # Just in case it is 'Number Number'
135        series[label] = name
136    return series

Get the measure names for each row in the metadata DataFrame - (for y-axis labels).

Args: meta (pd.DataFrame): The metadata DataFrame.

Returns: pd.Series: A Series containing the measure names, indexed by the row labels.

def recalibrate( data: pandas.core.frame.DataFrame, units: pandas.core.series.Series, *, as_a_whole: bool = False) -> tuple[pandas.core.frame.DataFrame, pandas.core.series.Series]:
139def recalibrate(
140    data: pd.DataFrame, units: pd.Series, *, as_a_whole: bool = False
141) -> tuple[pd.DataFrame, pd.Series]:
142    """Recalibrate the data so that its maximum value is between 1 and 1000.
143
144    Args:
145        data (pd.DataFrame): The data to recalibrate.
146        units (pd.Series): The units of measure (as returned by measure_names()).
147        as_a_whole (bool): If True, recalibrate the data as a whole, otherwise
148            recalibrate each column separately.
149
150    Returns:
151        tuple[pd.DataFrame, pd.Series]: The recalibrated data and recalibrated units.
152
153    Why recalibrate?
154        So that the chart is easier to read and interpret, in units that are more familiar.
155
156    """
157    # --- data/argument validation
158    if units.empty:
159        raise ValueError("The units Series is empty.")
160    if len(units) != len(data.columns):
161        raise ValueError("The units Series must have the same length as the data DataFrame's columns.")
162    if as_a_whole and not _is_all_the_same(units):
163        raise ValueError("Cannot recalibrate as a whole when there are multiple units of measure.")
164    if not all(x in data.columns for x in units.index):
165        raise ValueError("The units Series must all be indexed by the data DataFrame's columns.")
166
167    if as_a_whole:
168        str_label: str = units.iloc[0]
169        datax, str_label = _refactor(data, str_label)
170        new_units = pd.Series([str_label] * len(data.columns), index=data.columns)
171        return pd.DataFrame(datax), new_units
172
173    for column in data.columns:
174        str_label = units[column]
175        series = data[column]
176        seriesx, str_label = _refactor(series, str_label)
177        data[column] = cast("pd.Series", seriesx)
178        units[column] = str_label
179
180    return data, units

Recalibrate the data so that its maximum value is between 1 and 1000.

Args: data (pd.DataFrame): The data to recalibrate. units (pd.Series): The units of measure (as returned by measure_names()). as_a_whole (bool): If True, recalibrate the data as a whole, otherwise recalibrate each column separately.

Returns: tuple[pd.DataFrame, pd.Series]: The recalibrated data and recalibrated units.

Why recalibrate? So that the chart is easier to read and interpret, in units that are more familiar.

def recalibrate_series( series: pandas.core.series.Series, label: str) -> tuple[pandas.core.series.Series, str]:
183def recalibrate_series(series: pd.Series, label: str) -> tuple[pd.Series, str]:
184    """Recalibrate a Series with a label.
185
186    Args:
187        series (pd.Series): The Series to recalibrate.
188        label (str): The label for the Series.
189
190    Returns:
191        tuple[pd.Series, str]: The recalibrated Series and label.
192
193    """
194    seriesx, label = _refactor(series, label)
195    return cast("pd.Series", seriesx), label

Recalibrate a Series with a label.

Args: series (pd.Series): The Series to recalibrate. label (str): The label for the Series.

Returns: tuple[pd.Series, str]: The recalibrated Series and label.

@cache
def structure_from_flow_id( flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
245@cache
246def structure_from_flow_id(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
247    """Get the data structure directly from the flow identifier.
248
249    Args:
250        flow_id (str): The ID of the data flow to validate.
251        **kwargs: Additional keyword arguments, ultimately passed to acquire_url().
252
253    Returns:
254        FlowMetaDict: Dictionary containing the flow's structure.
255
256    Raises:
257        ValueError: If the flow_id is not valid.
258        ValueError: If the structure_id or structure cannot be found.
259
260    """
261    if flow_id not in data_flows(**kwargs):
262        raise ValueError(f"Invalid flow_id: {flow_id}.")
263    structure_id = structure_ident(flow_id, **kwargs)
264    structure = data_structures(structure_id, **kwargs)
265    if not structure:
266        raise ValueError(f"No structure found for structure ID: {structure_id}.")
267    return structure

Get the data structure directly from the flow identifier.

Args: flow_id (str): The ID of the data flow to validate. **kwargs: Additional keyword arguments, ultimately passed to acquire_url().

Returns: FlowMetaDict: Dictionary containing the flow's structure.

Raises: ValueError: If the flow_id is not valid. ValueError: If the structure_id or structure cannot be found.

@cache
def structure_ident( flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> str:
 88@cache
 89def structure_ident(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> str:
 90    """Get the data structure ID for a specific dataflow.
 91
 92    Args:
 93        flow_id (str): The ID of the dataflow to retrieve the structure ID for.
 94        **kwargs: Additional keyword arguments passed to acquire_url().
 95
 96    Returns:
 97        str: The data structure ID for the specified dataflow.
 98
 99    Raises:
100        ValueError: If the flow_id is not found or has no associated structure ID.
101
102    """
103    flow = data_flows(flow_id, **kwargs)
104    if flow_id not in flow or DATA_STRUCT_ID not in flow[flow_id]:
105        raise ValueError(f"No data structure found for flow '{flow_id}'")
106    return flow[flow_id][DATA_STRUCT_ID]

Get the data structure ID for a specific dataflow.

Args: flow_id (str): The ID of the dataflow to retrieve the structure ID for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: str: The data structure ID for the specified dataflow.

Raises: ValueError: If the flow_id is not found or has no associated structure ID.