sdmxabs.fetch

Obtain data from the ABS SDMX API.

  1"""Obtain data from the ABS SDMX API."""
  2
  3from typing import Unpack
  4from xml.etree.ElementTree import Element
  5
  6import numpy as np
  7import pandas as pd
  8
  9from sdmxabs.download_cache import GetFileKwargs
 10from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions
 11from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml
 12
 13
 14def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
 15    """Extract observed data from the XML tree for a given single series."""
 16    series_elements = {}
 17    for item in xml_series.findall("gen:Obs", NAME_SPACES):
 18        # --- get the index and value from the XML item, or nan if not found
 19        index_container = item.find("gen:ObsDimension", NAME_SPACES)
 20        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
 21        value_container = item.find("gen:ObsValue", NAME_SPACES)
 22        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
 23        if index_obs is None or value_obs is None:
 24            continue
 25        series_elements[index_obs] = value_obs
 26    series: pd.Series = pd.Series(series_elements).sort_index()
 27
 28    # --- if we can, make the series values numeric
 29    series = series.replace("", np.nan)
 30    try:
 31        series = pd.to_numeric(series)
 32    except ValueError:
 33        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
 34        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
 35
 36    # --- if we can, make the index a PeriodIndex based on the frequency
 37    if "FREQ" in meta.index:
 38        freq = meta["FREQ"]
 39        if freq == "Annual":
 40            series.index = pd.PeriodIndex(series.index, freq="Y")
 41        elif freq == "Quarterly":
 42            series.index = pd.PeriodIndex(series.index, freq="Q")
 43        elif freq == "Monthly":
 44            series.index = pd.PeriodIndex(series.index, freq="M")
 45        elif freq in ("Daily", "Daily or businessweek"):
 46            series.index = pd.PeriodIndex(series.index, freq="D")
 47        else:
 48            print(f"Unknown frequency {freq}, leaving index as is.")
 49
 50    return series
 51
 52
 53def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
 54    """Decode a metadata value based on its ID and the relevant ABS codelist."""
 55    return_value = meta_value  # default to returning the raw value
 56    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
 57        cl_id = dims[meta_id]["id"]
 58        cl_package_type = dims[meta_id]["package"]
 59        if cl_id and cl_package_type == "codelist":
 60            cl = code_lists(cl_id)
 61            if meta_value in cl and "name" in cl[meta_value]:
 62                return_value = cl[meta_value]["name"]
 63    return return_value
 64
 65
 66def get_series_meta_data(xml_series: Element, series_count: int, dims: FlowMetaDict) -> tuple[str, pd.Series]:
 67    """Extract and decode metadata from the XML tree for one given series.
 68
 69    Args:
 70        xml_series (Element): The XML element representing the series.
 71        series_count (int): The index of the series in the XML tree.
 72        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 73            their associated codelist names.
 74
 75    Returns:
 76        tuple[str, pd.Series]: A tuple containing the series label and a Series
 77            of metadata items for the series.
 78
 79    """
 80    item_count = 0
 81    keys = []
 82    meta_items = {}
 83    key_sets = ("SeriesKey", "Attributes")
 84    for key_set in key_sets:
 85        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 86        if attribs is None:
 87            print(f"No {key_set} found in series, skipping.")
 88            continue
 89        for item in attribs.findall("gen:Value", NAME_SPACES):
 90            # --- get the metadata item ID and value, or create a placeholder if missing
 91            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 92            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 93            keys.append(meta_value)
 94            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
 95            meta_items[meta_id] = decoded_meta_value
 96            item_count += 1
 97
 98    final_key = ".".join(keys)  # create a unique label for the series
 99
100    return final_key, pd.Series(meta_items).rename(final_key)
101
102
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract data and metadata from the XML tree.

    Args:
        flow_id (str): The ID of the data flow the tree was fetched from.
        tree (Element): The XML root returned by the ABS SDMX API.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The data (one column per series)
            and the metadata (one row per series).

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    meta: dict[str, pd.Series] = {}
    data: dict[str, pd.Series] = {}
    # Note: findall() never yields None, so no per-item None check is needed
    # (the previous version had an unreachable "is None" branch here).
    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
        label, dataset = get_series_meta_data(xml_series, series_count, dims)
        if label in meta:
            # this should not happen, but if it does, skip the series
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
131
132
133# === public functions ===
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    # --- prepare to get the XML root from the ABS SDMX API
    # setdefault() is the idiomatic way to apply a default modality
    kwargs.setdefault("modality", "prefer-cache")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )
    _not_implemented = constraints  # accepted but not yet applied
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
176
177
# --- quick and dirty testing
if __name__ == "__main__":
    # Example usage: a single quarterly WPI series
    FLOW_ID = "WPI"
    DIMS = {
        "MEASURE": "3",
        "INDEX": "OHRPEB",
        "SECTOR": "7",
        "INDUSTRY": "TOT",
        "TSEST": "10",
        "REGION": "AUS",
        "FREQ": "Q",
    }

    FETCHED_DATA, FETCHED_META = fetch(FLOW_ID, dims=DIMS, validate=True, modality="prefer-url")

    # Note: The transpose (.T) is used here to make the output more readable
    print("\nFetched Data:\n", FETCHED_DATA.T, sep="")
    print("\nFetched Metadata:\n", FETCHED_META.T, sep="")
def get_series_data( xml_series: xml.etree.ElementTree.Element, meta: pandas.core.series.Series) -> pandas.core.series.Series:
15def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
16    """Extract observed data from the XML tree for a given single series."""
17    series_elements = {}
18    for item in xml_series.findall("gen:Obs", NAME_SPACES):
19        # --- get the index and value from the XML item, or nan if not found
20        index_container = item.find("gen:ObsDimension", NAME_SPACES)
21        index_obs = index_container.attrib.get("value", None) if index_container is not None else None
22        value_container = item.find("gen:ObsValue", NAME_SPACES)
23        value_obs = value_container.attrib.get("value", None) if value_container is not None else None
24        if index_obs is None or value_obs is None:
25            continue
26        series_elements[index_obs] = value_obs
27    series: pd.Series = pd.Series(series_elements).sort_index()
28
29    # --- if we can, make the series values numeric
30    series = series.replace("", np.nan)
31    try:
32        series = pd.to_numeric(series)
33    except ValueError:
34        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
35        print(f"Could not convert series {meta.name} to numeric, keeping as is.")
36
37    # --- if we can, make the index a PeriodIndex based on the frequency
38    if "FREQ" in meta.index:
39        freq = meta["FREQ"]
40        if freq == "Annual":
41            series.index = pd.PeriodIndex(series.index, freq="Y")
42        elif freq == "Quarterly":
43            series.index = pd.PeriodIndex(series.index, freq="Q")
44        elif freq == "Monthly":
45            series.index = pd.PeriodIndex(series.index, freq="M")
46        elif freq in ("Daily", "Daily or businessweek"):
47            series.index = pd.PeriodIndex(series.index, freq="D")
48        else:
49            print(f"Unknown frequency {freq}, leaving index as is.")
50
51    return series

Extract observed data from the XML tree for a given single series.

def decode_meta_value(meta_value: str, meta_id: str, dims: dict[str, dict[str, str]]) -> str:
54def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
55    """Decode a metadata value based on its ID and the relevant ABS codelist."""
56    return_value = meta_value  # default to returning the raw value
57    if meta_id in dims and "id" in dims[meta_id] and "package" in dims[meta_id]:
58        cl_id = dims[meta_id]["id"]
59        cl_package_type = dims[meta_id]["package"]
60        if cl_id and cl_package_type == "codelist":
61            cl = code_lists(cl_id)
62            if meta_value in cl and "name" in cl[meta_value]:
63                return_value = cl[meta_value]["name"]
64    return return_value

Decode a metadata value based on its ID and the relevant ABS codelist.

def get_series_meta_data( xml_series: xml.etree.ElementTree.Element, series_count: int, dims: dict[str, dict[str, str]]) -> tuple[str, pandas.core.series.Series]:
 67def get_series_meta_data(xml_series: Element, series_count: int, dims: FlowMetaDict) -> tuple[str, pd.Series]:
 68    """Extract and decode metadata from the XML tree for one given series.
 69
 70    Args:
 71        xml_series (Element): The XML element representing the series.
 72        series_count (int): The index of the series in the XML tree.
 73        dims (FlowMetaDict): Dictionary containing metadata dimensions and
 74            their associated codelist names.
 75
 76    Returns:
 77        tuple[str, pd.Series]: A tuple containing the series label and a Series
 78            of metadata items for the series.
 79
 80    """
 81    item_count = 0
 82    keys = []
 83    meta_items = {}
 84    key_sets = ("SeriesKey", "Attributes")
 85    for key_set in key_sets:
 86        attribs = xml_series.find(f"gen:{key_set}", NAME_SPACES)
 87        if attribs is None:
 88            print(f"No {key_set} found in series, skipping.")
 89            continue
 90        for item in attribs.findall("gen:Value", NAME_SPACES):
 91            # --- get the metadata item ID and value, or create a placeholder if missing
 92            meta_id = item.attrib.get("id", f"missing meta_id {series_count}-{item_count}")
 93            meta_value = item.attrib.get("value", f"missing meta_value {series_count}-{item_count}")
 94            keys.append(meta_value)
 95            decoded_meta_value = decode_meta_value(meta_value, meta_id, dims)
 96            meta_items[meta_id] = decoded_meta_value
 97            item_count += 1
 98
 99    final_key = ".".join(keys)  # create a unique label for the series
100
101    return final_key, pd.Series(meta_items).rename(final_key)

Extract and decode metadata from the XML tree for one given series.

Args: xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.

Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.

def extract( flow_id: str, tree: xml.etree.ElementTree.Element) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
104def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
105    """Extract data from the XML tree."""
106    # Get the data dimensions for the flow_id, it provides entree to the metadata
107    dims = data_dimensions(flow_id)
108
109    meta = {}
110    data = {}
111    for series_count, xml_series in enumerate(tree.findall(".//gen:Series", NAME_SPACES)):
112        if xml_series is None:
113            print("No Series found in XML tree, skipping.")
114            continue
115        label, dataset = get_series_meta_data(
116            # python typing is not smart enough to know that
117            # xml_series is an ElementTree
118            xml_series,
119            series_count,
120            dims,
121        )
122        if label in meta:
124            # this should not happen, but if it does, skip the series
124            print(f"Duplicate series {label} in {flow_id} found, skipping.")
125            continue
126        meta[label] = dataset
127        series = get_series_data(xml_series, dataset)
128        series.name = label
129        data[label] = series
130
131    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta

Extract data from the XML tree.

def fetch( flow_id: str, dims: dict[str, str] | None = None, constraints: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
135def fetch(
136    flow_id: str,
137    dims: dict[str, str] | None = None,
138    constraints: dict[str, str] | None = None,  # not implemented yet
139    *,
140    validate: bool = False,
141    **kwargs: Unpack[GetFileKwargs],
142) -> tuple[pd.DataFrame, pd.DataFrame]:
143    """Fetch data from the ABS SDMX API.
144
145    Args:
146        flow_id (str): The ID of the data flow from which to retrieve data items.
147        dims (dict[str, str], optional): A dictionary of dimensions to select the
148            data items. If None, the ABS fetch request will be for all data items,
149            which can be slow.
150        constraints (dict[str, str], optional): A dictionary of constraints to apply
151            to the data items. If None, no constraints are applied.
152        validate (bool): If True, print validation diagnostics for the proposed
153            dimensions against the metadata requirements. Defaults to False.
154        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
155
156    Returns: a tuple of two DataFrames:
157        - The first DataFrame contains the fetched data.
158        - The second DataFrame contains the metadata.
159
160    Raises:
161        HttpError: If there is an issue with the HTTP request.
162        CacheError: If there is an issue with the cache.
163        ValueError: If no XML root is found in the response.
164
165    """
166    # --- prepare to get the XML root from the ABS SDMX API
167    kwargs["modality"] = kwargs.get("modality", "prefer-cache")
168    key = build_key(
169        flow_id,
170        dims,
171        validate=validate,
172    )
173    _not_implemented = constraints
174    url = f"{URL_STEM}/data/{flow_id}/{key}"
175    xml_root = acquire_xml(url, **kwargs)
176    return extract(flow_id, xml_root)

Fetch data from the ABS SDMX API.

Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.

Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.