sdmxabs

Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.

 1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""
 2
 3from importlib.metadata import PackageNotFoundError, version
 4
 5from .data import fetch
 6from .download_cache import (
 7    CacheError,
 8    GetFileKwargs,
 9    HttpError,
10    ModalityType,
11)
12from .metadata import code_lists, data_dimensions, data_flows
13
14# --- version and author
15try:
16    __version__ = version(__name__)
17except PackageNotFoundError:
18    __version__ = "0.0.0"  # Fallback for development mode
19__author__ = "Bryan Palmer"
20
21# --- establish the package contents
22__all__ = [
23    "CacheError",
24    "GetFileKwargs",
25    "HttpError",
26    "ModalityType",
27    "__author__",
28    "__version__",
29    "code_lists",
30    "data_dimensions",
31    "data_flows",
32    "fetch",
33]
class CacheError(Exception):
    """Raised when there is a problem retrieving data from the cache."""

A problem retrieving data from the cache.

class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""

TypedDict for acquire_url function arguments.

verbose: NotRequired[bool]

If True, print information about the retrieval process.

modality: NotRequired[Literal['prefer_cache', 'prefer_url']]

Kind of retrieval: "prefer_cache", "prefer_url".

class HttpError(Exception):
    """Raised when there is a problem retrieving data over HTTP."""

A problem retrieving data from HTTP.

# Allowed retrieval modalities for the download cache.
ModalityType = Literal["prefer_cache", "prefer_url"]

__author__ = "Bryan Palmer"
__version__ = "0.1.0"
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: one row per code, indexed by the code ID, with the
            code's name and (where present) its parent code as columns.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    xml_root = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    extracted = {}
    for node in xml_root.findall(".//str:Code", NAME_SPACES):
        key = node.get("id")
        if key is None:
            # a code without an id cannot be indexed; skip it
            continue
        row = {}
        name_node = node.find("com:Name", NAME_SPACES)
        row["name"] = None if name_node is None else name_node.text
        parent_node = node.find("str:Parent", NAME_SPACES)
        if parent_node is not None:
            # a "parent" column entry is only recorded when a <Parent>
            # element exists (it may still be None if the Ref is missing)
            ref_node = parent_node.find("Ref", NAME_SPACES)
            row["parent"] = None if ref_node is None else ref_node.get("id")
        extracted[key] = row

    return pd.DataFrame(extracted).T.sort_index().rename_axis(index=cl_id)

Get the code list metadata from the ABS SDMX API.

Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: one row per dimension, indexed by dimension ID, with
            the dimension position and any enumeration attributes as columns.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    root = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    collected = {}
    for node in root.findall(".//str:Dimension", NAME_SPACES):
        ident = node.get("id")
        position = node.get("position")
        if ident is None or position is None:
            # both the id and position are required to describe a dimension
            continue
        details = {"position": position}
        local_rep = node.find("str:LocalRepresentation", NAME_SPACES)
        if local_rep is not None:
            enum_ref = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
            if enum_ref is not None:
                # merge in the enumeration reference attributes (codelist id, etc.)
                details = details | enum_ref.attrib
        collected[ident] = details
    return pd.DataFrame(collected).T.rename_axis(index="dimensions")

Get the data dimensions metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the top-level dataflow metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: indexed by the dataflow IDs, with the dataflow
            attributes (including the dataflow name) as columns.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    df = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes = dataflow.attrib.copy()
        if "id" not in attributes:
            # a dataflow without an id cannot be indexed; skip it
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(no name)"
        attributes["name"] = str(df_name)
        df[df_id] = attributes
    return pd.DataFrame(df).T.sort_index().rename_axis(index="dataflows")
    # Note: The returned DataFrame has the dataflow IDs as the index and
    # the attributes (like name, etc.) as columns.

Get the toplevel metadata from the ABS SDMX API.

Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().

Returns: pd.DataFrame: A DataFrame indexed by the dataflow IDs, with the dataflow attributes (including name) as columns.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML tree is found in the response.

    """
    # --- prepare to get the XML tree from the ABS SDMX API
    # when the caller has not chosen a modality, default to the cache
    kwargs.setdefault("modality", "prefer_cache")
    selection = build_key(flow_id, dims, validate=validate)

    # --- get the XML tree from the ABS SDMX API
    _not_implemented = constraints  # accepted but ignored for now
    xml_tree = acquire_xml(f"{URL_STEM}/data/{flow_id}/{selection}", **kwargs)

    # --- extract and return metadata and data from the XML tree
    return populate(flow_id, xml_tree)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML tree is found in the response.