sdmxabs.fetch

Obtain data from the ABS SDMX API.

  1"""Obtain data from the ABS SDMX API."""
  2
  3from typing import Unpack
  4from xml.etree.ElementTree import Element
  5
  6import numpy as np
  7import pandas as pd
  8
  9from sdmxabs.download_cache import GetFileKwargs
 10from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions, data_flows
 11from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml
 12
 13
 14def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
 15    """Extract observed data from the XML tree for a given single series."""
 16    series_elements = {}
 17    for item in xml_series.findall("gen:Obs", NAME_SPACES):
 18        # --- get the index and value from the XML item, or nan if not found
 19        index_container = item.find("gen:ObsDimension", NAME_SPACES)
 20        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
 21        value_container = item.find("gen:ObsValue", NAME_SPACES)
 22        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
 23        if index_obs is None or value_obs is None:
 24            continue
 25        series_elements[index_obs] = value_obs
 26    series: pd.Series = pd.Series(series_elements).sort_index()
 27
 28    # --- if we can, make the series values numeric
 29    series = series.replace("", np.nan)
 30    try:
 31        series = pd.to_numeric(series)
 32    except ValueError:
 33        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
 34        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
 35
 36    # --- if we can, make the index a PeriodIndex based on the frequency
 37    if "FREQ" in meta.index:
 38        freq = meta["FREQ"]
 39        if freq == "Annual":
 40            series.index = pd.PeriodIndex(series.index, freq="Y")
 41        elif freq == "Quarterly":
 42            series.index = pd.PeriodIndex(series.index, freq="Q")
 43        elif freq == "Monthly":
 44            series.index = pd.PeriodIndex(series.index, freq="M")
 45        elif freq in ("Daily", "Daily or businessweek"):
 46            series.index = pd.PeriodIndex(series.index, freq="D")
 47        else:
 48            print(f"Unknown frequency {freq}, leaving index as is.")
 49
 50    return series
 51
 52
 53def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
 54    """Decode a metadata value based on its ID and the relevant ABS codelist."""
 55    return_value = meta_value  # default to returning the raw value
 56    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
 57        cl_id = dims[meta_id]["id"]
 58        cl_package_type = dims[meta_id]["package"]
 59        if cl_id and cl_package_type == "codelist":
 60            cl = code_lists(cl_id)
 61            if meta_value in cl and "name" in cl[meta_value]:
 62                return_value = cl[meta_value]["name"]
 63    return return_value
 64
 65
 66def get_series_meta_data(
 67    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
 68) -> tuple[str, pd.Series]:
 69    """Extract and decode metadata from the XML tree for one given series.
 70
 71    Args:
 72        xml_series (Element): The XML element representing the series.
 73        series_count (int): The index of the series in the XML tree.
 74        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 75            their associated codelist names.
 76
 77    Returns:
 78        tuple[str, pd.Series]: A tuple containing the series label and a Series
 79            of metadata items for the series.
 80
 81    """
 82    item_count = 0
 83    keys = [flow_id]
 84    flow_name = data_flows().get(flow_id, {"name": flow_id})["name"]
 85    meta_items = {"DATAFLOW": flow_name}  # start with the flow ID
 86    key_sets = ("SeriesKey", "Attributes")
 87    for key_set in key_sets:
 88        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 89        if attribs is None:
 90            print(f"No {key_set} found in series, skipping.")
 91            continue
 92        for item in attribs.findall("gen:Value", NAME_SPACES):
 93            # --- get the metadata item ID and value, or create a placeholder if missing
 94            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 95            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 96            keys.append(meta_value)
 97            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
 98            meta_items[meta_id] = decoded_meta_value
 99            item_count += 1
100
101    final_key = ".".join(keys)  # create a unique label for the series
102
103    return final_key, pd.Series(meta_items).rename(final_key)
104
105
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract data and metadata from the XML tree.

    Args:
        flow_id (str): The ID of the data flow the tree was fetched from.
        tree (Element): The XML root returned by the ABS SDMX API.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The fetched data (one column per
            series) and the associated metadata (one row per series).

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    meta: dict[str, pd.Series] = {}
    data: dict[str, pd.Series] = {}
    # NOTE: findall() only yields Element instances, so no None-check is
    # needed inside the loop (the previous `if xml_series is None` was dead code)
    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
        label, dataset = get_series_meta_data(flow_id, xml_series, series_count, dims)
        if label in meta:
            # this can happen if you implicitly select the same series multiple times
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
135
136
137# === public functions ===
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions used to
            select data items. When None, all data items for the flow are
            requested, which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to
            apply to the data items. Currently ignored (not implemented).
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Notes:
        If the `dims` argument is not valid you will get a CacheError or HttpError.
        If the `flow_id` is not valid, you will get a ValueError.
    """
    _not_implemented = constraints  # reserved for a future release

    # --- prepare to get the XML root from the ABS SDMX API
    kwargs.setdefault("modality", "prefer-cache")  # default to cached data
    key = build_key(flow_id, dims, validate=validate)
    url = f"{URL_STEM}/data/{flow_id}/{key}"

    # --- download/parse, then convert the XML into data + metadata frames
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
183
184
# --- quick and dirty testing
if __name__ == "__main__":
    # Example usage: one quarterly WPI series for Australia
    WPI_FLOW_ID = "WPI"
    WPI_SELECTION = {
        "MEASURE": "3",
        "INDEX": "OHRPEB",
        "SECTOR": "7",
        "INDUSTRY": "TOT",
        "TSEST": "10",
        "REGION": "AUS",
        "FREQ": "Q",
    }

    wpi_data, wpi_meta = fetch(
        WPI_FLOW_ID,
        dims=WPI_SELECTION,
        validate=True,
        modality="prefer-url",
    )
    # Note: The transpose (.T) is used here to make the output more readable
    print("\nFetched Data:\n", wpi_data.T, sep="")
    print("\nFetched Metadata:\n", wpi_meta.T, sep="")
def get_series_data( xml_series: xml.etree.ElementTree.Element, meta: pandas.core.series.Series) -> pandas.core.series.Series:
15def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
16    """Extract observed data from the XML tree for a given single series."""
17    series_elements = {}
18    for item in xml_series.findall("gen:Obs", NAME_SPACES):
19        # --- get the index and value from the XML item, or nan if not found
20        index_container = item.find("gen:ObsDimension", NAME_SPACES)
21        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
22        value_container = item.find("gen:ObsValue", NAME_SPACES)
23        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
24        if index_obs is None or value_obs is None:
25            continue
26        series_elements[index_obs] = value_obs
27    series: pd.Series = pd.Series(series_elements).sort_index()
28
29    # --- if we can, make the series values numeric
30    series = series.replace("", np.nan)
31    try:
32        series = pd.to_numeric(series)
33    except ValueError:
34        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
35        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
36
37    # --- if we can, make the index a PeriodIndex based on the frequency
38    if "FREQ" in meta.index:
39        freq = meta["FREQ"]
40        if freq == "Annual":
41            series.index = pd.PeriodIndex(series.index, freq="Y")
42        elif freq == "Quarterly":
43            series.index = pd.PeriodIndex(series.index, freq="Q")
44        elif freq == "Monthly":
45            series.index = pd.PeriodIndex(series.index, freq="M")
46        elif freq in ("Daily", "Daily or businessweek"):
47            series.index = pd.PeriodIndex(series.index, freq="D")
48        else:
49            print(f"Unknown frequency {freq}, leaving index as is.")
50
51    return series

Extract observed data from the XML tree for a given single series.

def decode_meta_value(meta_value: str, meta_id: str, dims: dict[str, dict[str, str]]) -> str:
54def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
55    """Decode a metadata value based on its ID and the relevant ABS codelist."""
56    return_value = meta_value  # default to returning the raw value
57    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
58        cl_id = dims[meta_id]["id"]
59        cl_package_type = dims[meta_id]["package"]
60        if cl_id and cl_package_type == "codelist":
61            cl = code_lists(cl_id)
62            if meta_value in cl and "name" in cl[meta_value]:
63                return_value = cl[meta_value]["name"]
64    return return_value

Decode a metadata value based on its ID and the relevant ABS codelist.

def get_series_meta_data( flow_id: str, xml_series: xml.etree.ElementTree.Element, series_count: int, dims: dict[str, dict[str, str]]) -> tuple[str, pandas.core.series.Series]:
 67def get_series_meta_data(
 68    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
 69) -> tuple[str, pd.Series]:
 70    """Extract and decode metadata from the XML tree for one given series.
 71
 72    Args:
 73        xml_series (Element): The XML element representing the series.
 74        series_count (int): The index of the series in the XML tree.
 75        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 76            their associated codelist names.
 77
 78    Returns:
 79        tuple[str, pd.Series]: A tuple containing the series label and a Series
 80            of metadata items for the series.
 81
 82    """
 83    item_count = 0
 84    keys = [flow_id]
 85    flow_name = data_flows().get(flow_id, {"name": flow_id})["name"]
 86    meta_items = {"DATAFLOW": flow_name}  # start with the flow ID
 87    key_sets = ("SeriesKey", "Attributes")
 88    for key_set in key_sets:
 89        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 90        if attribs is None:
 91            print(f"No {key_set} found in series, skipping.")
 92            continue
 93        for item in attribs.findall("gen:Value", NAME_SPACES):
 94            # --- get the metadata item ID and value, or create a placeholder if missing
 95            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 96            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 97            keys.append(meta_value)
 98            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
 99            meta_items[meta_id] = decoded_meta_value
100            item_count += 1
101
102    final_key = ".".join(keys)  # create a unique label for the series
103
104    return final_key, pd.Series(meta_items).rename(final_key)

Extract and decode metadata from the XML tree for one given series.

Args: flow_id (str): The ID of the data flow the series belongs to. xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.

Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.

def extract( flow_id: str, tree: xml.etree.ElementTree.Element) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
107def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
108    """Extract data from the XML tree."""
109    # Get the data dimensions for the flow_id, it provides entree to the metadata
110    dims = data_dimensions(flow_id)
111
112    meta = {}
113    data = {}
114    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
115        if xml_series is None:
116            print("No Series found in XML tree, skipping.")
117            continue
118        label, dataset = get_series_meta_data(
119            flow_id,
120            # python typing is not smart enough to know that
121            # xml_series is an ElementTree
122            xml_series,
123            series_count,
124            dims,
125        )
126        if label in meta:
127            # this can happen if you implicitly select the same series multiple times
128            print(f"Duplicate series {label} in {flow_id} found, skipping.")
129            continue
130        meta[label] = dataset
131        series = get_series_data(xml_series, dataset)
132        series.name = label
133        data[label] = series
134
135    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta

Extract data from the XML tree.

def fetch( flow_id: str, dims: dict[str, str] | None = None, constraints: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
139def fetch(
140    flow_id: str,
141    dims: dict[str, str] | None = None,
142    constraints: dict[str, str] | None = None,  # not implemented yet
143    *,
144    validate: bool = False,
145    **kwargs: Unpack[GetFileKwargs],
146) -> tuple[pd.DataFrame, pd.DataFrame]:
147    """Fetch data from the ABS SDMX API.
148
149    Args:
150        flow_id (str): The ID of the data flow from which to retrieve data items.
151        dims (dict[str, str], optional): A dictionary of dimensions to select the
152            data items. If None, the ABS fetch request will be for all data items,
153            which can be slow.
154        constraints (dict[str, str], optional): A dictionary of constraints to apply
155            to the data items. If None, no constraints are applied.
156        validate (bool): If True, print validation diagnostics for the proposed
157            dimensions against the metadata requirements. Defaults to False.
158        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
159
160    Returns: a tuple of two DataFrames:
161        - The first DataFrame contains the fetched data.
162        - The second DataFrame contains the metadata.
163
164    Raises:
165        HttpError: If there is an issue with the HTTP request.
166        CacheError: If there is an issue with the cache.
167        ValueError: If no XML root is found in the response.
168
169    Notes:
170        If the `dims` argument is not valid you will get a CacheError or HttpError.
171        If the `flow_id` is not valid, you will get a ValueError.
172    """
173    # --- prepare to get the XML root from the ABS SDMX API
174    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
175    key = build_key(
176        flow_id,
177        dims,
178        validate=validate,
179    )
180    _not_implemented = constraints
181    url = f"{URL_STEM}/data/{flow_id}/{key}"
182    xml_root = acquire_xml(url, **kwargs)
183    return extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.

Notes: If the dims argument is not valid you will get a CacheError or HttpError. If the flow_id is not valid, you will get a ValueError.