sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

 1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""
 2
 3from importlib.metadata import PackageNotFoundError, version
 4
 5from .download_cache import (
 6    CacheError,
 7    GetFileKwargs,
 8    HttpError,
 9    ModalityType,
10)
11from .fetch import fetch
12from .fetch_multi import fetch_multi
13from .flow_metadata import code_lists, data_dimensions, data_flows
14from .select_items import MatchCriteria, MatchItem, MatchType, fetch_selection, match_criterion, select_items
15
16# --- version and author
17try:
18    __version__ = version(__name__)
19except PackageNotFoundError:
20    __version__ = "0.0.0"  # Fallback for development mode
21__author__ = "Bryan Palmer"
22
23# --- establish the package contents
24__all__ = [
25    "CacheError",
26    "GetFileKwargs",
27    "HttpError",
28    "MatchCriteria",
29    "MatchItem",
30    "MatchType",
31    "ModalityType",
32    "__author__",
33    "__version__",
34    "code_lists",
35    "data_dimensions",
36    "data_flows",
37    "fetch",
38    "fetch_multi",
39    "fetch_selection",
40    "match_criterion",
41    "select_items",
42]
class CacheError(builtins.Exception):
30class CacheError(Exception):
31    """A problem retrieving data from the cache."""

A problem retrieving data from the cache.

class GetFileKwargs(typing.TypedDict):
37class GetFileKwargs(TypedDict):
38    """TypedDict for acquire_url function arguments."""
39
40    verbose: NotRequired[bool]
41    """If True, print information about the data retrieval process."""
42    modality: NotRequired[ModalityType]
43    """Kind of retrieval: "prefer-cache", "prefer-url"."""

TypedDict for acquire_url function arguments.

verbose: NotRequired[bool]

If True, print information about the data retrieval process.

modality: NotRequired[Literal['prefer-cache', 'prefer-url']]

Kind of retrieval: "prefer-cache", "prefer-url".

class HttpError(builtins.Exception):
26class HttpError(Exception):
27    """A problem retrieving data from HTTP."""

A problem retrieving data from HTTP.

MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
MatchItem = tuple[str, str, MatchType]
class MatchType(enum.Enum):
15class MatchType(Enum):
16    """Enumeration for match types."""
17
18    EXACT = 1
19    PARTIAL = 2
20    REGEX = 3

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
ModalityType = typing.Literal['prefer-cache', 'prefer-url']
__author__ = 'Bryan Palmer'
__version__ = '0.1.0'
@cache
def code_lists( cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
 90@cache
 91def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
 92    """Get the code list metadata from the ABS SDMX API.
 93
 94    Args:
 95        cl_id (str): The ID of the code list to retrieve.
 96        **kwargs: Additional keyword arguments passed to acquire_url().
 97
 98    Returns:
 99        FlowMetaDict: A dictionary containing the codes and
100            their associated key=value pairs. A "name" key should always
101            be present. A "parent" key may also be present.
102
103    Raises:
104        HttpError: If there is an issue with the HTTP request.
105        CacheError: If there is an issue with the cache.
106        ValueError: If no XML root is found in the response.
107
108    """
109    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)
110
111    codes: FlowMetaDict = {}
112    for code in tree.findall(".//str:Code", NAME_SPACES):
113        code_id = code.get("id", None)
114        if code_id is None:
115            continue
116        elements: dict[str, str] = {}
117        name = code.find("com:Name", NAME_SPACES)
118        elements["name"] = str(name.text) if name is not None else "(missing)"
119        parent = code.find("str:Parent", NAME_SPACES)
120        parent_id = ""
121        if parent is not None:
122            ref = parent.find("Ref", NAME_SPACES)
123            if ref is not None:
124                parent_id = str(ref.get("id", ""))
125            elements["parent"] = parent_id
126        codes[code_id] = elements
127
128    return codes

Get the code list metadata from the ABS SDMX API.

Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

@cache
def data_dimensions( flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
55@cache
56def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
57    """Get the data dimensions metadata from the ABS SDMX API.
58
59    Args:
60        flow_id (str): The ID of the dataflow to retrieve dimensions for.
61        **kwargs: Additional keyword arguments passed to acquire_url().
62
63    Returns:
64        dict[str, dict[str, str]]: A dictionary containing the dimensions and
65            their metadata in key=value pairs.
66
67    Raises:
68        HttpError: If there is an issue with the HTTP request.
69        CacheError: If there is an issue with the cache.
70        ValueError: If no XML root is found in the response.
71
72    """
73    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)
74
75    dimensions = {}
76    for dim in tree.findall(".//str:Dimension", NAME_SPACES):
77        dim_id = dim.get("id")
78        dim_pos = dim.get("position")
79        if dim_id is None or dim_pos is None:
80            continue
81        contents = {"position": dim_pos}
82        if (lr := dim.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
83            enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
84        ) is not None:
85            contents = contents | enumer.attrib
86        dimensions[dim_id] = contents
87    return dimensions

Get the data dimensions metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

@cache
def data_flows( flow_id: str = 'all', **kwargs: Unpack[GetFileKwargs]) -> dict[str, dict[str, str]]:
22@cache
23def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
24    """Get the toplevel metadata from the ABS SDMX API.
25
26    Args:
27        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
28        **kwargs: Additional keyword arguments passed to acquire_url().
29
30    Returns:
31        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
 32            and their metadata in key=value pairs.
33
34    Raises:
35        HttpError: If there is an issue with the HTTP request.
36        CacheError: If there is an issue with the cache.
37        ValueError: If no XML root is found in the response.
38
39    """
40    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)
41
42    d_flows: FlowMetaDict = {}
43    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
44        attributes: dict[str, str] = dataflow.attrib.copy()
45        if "id" not in attributes:
46            continue
47        df_id = attributes.pop("id")
48        name_elem = dataflow.find("com:Name", NAME_SPACES)
49        df_name = name_elem.text if name_elem is not None else "(no name)"
50        attributes["name"] = str(df_name)
51        d_flows[df_id] = attributes
52    return d_flows

Get the toplevel metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().

Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

def fetch( flow_id: str, dims: dict[str, str] | None = None, constraints: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
135def fetch(
136    flow_id: str,
137    dims: dict[str, str] | None = None,
138    constraints: dict[str, str] | None = None,  # not implemented yet
139    *,
140    validate: bool = False,
141    **kwargs: Unpack[GetFileKwargs],
142) -> tuple[pd.DataFrame, pd.DataFrame]:
143    """Fetch data from the ABS SDMX API.
144
145    Args:
146        flow_id (str): The ID of the data flow from which to retrieve data items.
147        dims (dict[str, str], optional): A dictionary of dimensions to select the
148            data items. If None, the ABS fetch request will be for all data items,
149            which can be slow.
150        constraints (dict[str, str], optional): A dictionary of constraints to apply
151            to the data items. If None, no constraints are applied.
152        validate (bool): If True, print validation diagnostics for the proposed
153            dimensions against the metadata requirements. Defaults to False.
154        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
155
156    Returns: a tuple of two DataFrames:
157        - The first DataFrame contains the fetched data.
158        - The second DataFrame contains the metadata.
159
160    Raises:
161        HttpError: If there is an issue with the HTTP request.
162        CacheError: If there is an issue with the cache.
163        ValueError: If no XML root is found in the response.
164
165    """
166    # --- prepare to get the XML root from the ABS SDMX API
167    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
168    key = build_key(
169        flow_id,
170        dims,
171        validate=validate,
172    )
173    _not_implemented = constraints
174    url = f"{URL_STEM}/data/{flow_id}/{key}"
175    xml_root = acquire_xml(url, **kwargs)
176    return extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

def fetch_multi( wanted: pandas.core.frame.DataFrame, *, validate: bool = False, **kwargs: Unpack[GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
 75def fetch_multi(
 76    wanted: pd.DataFrame,
 77    *,
 78    validate: bool = False,
 79    **kwargs: Unpack[GetFileKwargs],
 80) -> tuple[pd.DataFrame, pd.DataFrame]:
 81    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
 82
 83    Args:
 84        wanted: A DataFrame with rows for each desired data set (of one or more series).
 85                Each row should contain the necessary identifiers to fetch the dataset.
 86                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
 87                The 'flow_id' column is mandatory, and the rest are optional.
 88                Note: the DataFrame index is not used in the fetching process.
 89        validate: If True, the function will validate dimensions and values against
 90                  the ABS SDMX API codelists. Defaults to False.
 91        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
 92
 93    Returns:
 94        A tuple containing two DataFrames:
 95        - The first DataFrame contains the fetched data.
 96        - The second DataFrame contains metadata about the fetched datasets.
 97
 98    Raises:
 99        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.
100
101    Note:
102        CacheError and HttpError are raised by the fetch function.
103        These will be caught and reported to standard output.
104
105    Caution:
106        The selected data should all have the same index. You cannot mix (for example)
107        Quarterly and Monthly data in the same DataFrame.
108
109    """
110    # --- quick sanity checks
111    if wanted.empty:
112        print("wanted DataFrame is empty, returning empty DataFrames.")
113        return pd.DataFrame(), pd.DataFrame()
114    if "flow_id" not in wanted.columns:
115        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")
116
117    # --- do the work
118    return extract(wanted, validate=validate, **kwargs)

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.

Raises: ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.

Caution: The selected data should all have the same index. You cannot mix (for example) Quarterly and Monthly data in the same DataFrame.

def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
139def fetch_selection(
140    flow_id: str,
141    criteria: MatchCriteria,
142) -> tuple[pd.DataFrame, pd.DataFrame]:
143    """Fetch data based on a selection criteria for items.
144
145    Args:
146        flow_id (str): The ID of the data flow to fetch.
147        criteria (MatchCriteria): A sequence of match criteria to filter the data.
148
149    Returns:
150        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
151
152    """
153    # --- select items based on the criteria
154    selection = select_items(flow_id, criteria)
155
156    # --- fetch the data using the selected items
157    return fetch_multi(selection)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

def match_criterion( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
60def match_criterion(
61    pattern: str,
62    dimension: str,
63    match_type: MatchType = MatchType.PARTIAL,
64) -> MatchItem:
65    """Create a new match criterion for use in selection.
66
67    Args:
68        pattern (str): The pattern to match.
69        dimension (str): The dimension to match against.
70        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
71
72    Returns:
73        MatchItem: A tuple representing the match criterion.
74
75    """
76    return (pattern, dimension, match_type)

Create a new match criterion for use in selection.

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple representing the match criterion.

def select_items( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
 79def select_items(
 80    flow_id: str,
 81    criteria: MatchCriteria,
 82) -> pd.DataFrame:
 83    """Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.
 84
 85    Args:
 86        flow_id (str): The ID of the data flow to select items from.
 87        criteria (MatchCriteria): A sequence of tuples containing the element name,
 88            the value to match, and the match type (exact, partial, or regex).
 89
 90    Returns:
 91        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
 92            into the call of the function fetch_multi().
 93
 94    Raises:
 95        ValueError: If the flow_id is not valid or if no dimensions are found for it.
 96
 97    Notes:
 98    -   Should build a one line DataFrame. This Frame may select multiple data series,
 99        when passed to fetch_multi. It also can be concatenated with other DataFrames
100        to build a larger selection.
101    -   If two match elements refer to the same dimension, only the `intersection` of the
102        matches will be returned.
103
104    """
105    # --- some sanity checks
106    if flow_id not in data_flows():
107        raise ValueError(f"Invalid flow_id: {flow_id}.")
108    dimensions = data_dimensions(flow_id)
109    if not dimensions:
110        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
111
112    # --- lets build the codelist dictionary
113    return_dict: dict[str, str] = {}
114    for pattern, dimension, match_type in criteria:
115        if dimension not in dimensions:
116            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
117            continue
118        dim_dict = dimensions[dimension]
119        code_list_dict = get_code_list_dict(dimension, dim_dict)
120        if not code_list_dict:
121            continue
122
123        codes = get_codes(code_list_dict, pattern, match_type)
124
125        # --- combine (as an intersection) with previous matches for this dimension
126        if dimension in return_dict:
127            previous = return_dict[dimension].split("+")
128            codes = list(set(previous).intersection(set(codes)))
129            if not codes:
130                del return_dict[dimension]  # no matches, remove dimension
131        if codes:
132            return_dict[dimension] = "+".join(list(set(codes)))
133
134    # --- return a DataFrame
135    return_dict["flow_id"] = flow_id
136    return pd.DataFrame([return_dict])

Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the element name, the value to match, and the match type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no dimensions are found for it.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.