sdmxabs.fetch
Obtain data from the ABS SDMX API.
"""Obtain data from the ABS SDMX API."""

from typing import Unpack
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

from sdmxabs.download_cache import GetFileKwargs
from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions, data_flows
from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml


def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
    """Extract the observations of a single series from its XML element.

    Args:
        xml_series (Element): The series element whose ``gen:Obs`` children
            hold the observations.
        meta (pd.Series): Metadata for the series. Its ``name`` is used in
            diagnostics and its ``FREQ`` item (if present) selects the
            frequency of the resulting PeriodIndex.

    Returns:
        pd.Series: The observations sorted by their period label. Values are
            numeric when conversion succeeds, otherwise left as found. The
            index is a PeriodIndex when ``FREQ`` is recognised and the labels
            parse, otherwise the original labels.

    """
    # display name of the frequency (as found in meta["FREQ"]) -> pandas period alias
    period_aliases = {
        "Annual": "Y",
        "Quarterly": "Q",
        "Monthly": "M",
        "Daily": "D",
        "Daily or businessweek": "D",
    }

    # --- collect {period label: raw value}; observations lacking either part are skipped
    observations: dict[str, str] = {}
    for obs in xml_series.findall("gen:Obs", NAME_SPACES):
        period_node = obs.find("gen:ObsDimension", NAME_SPACES)
        value_node = obs.find("gen:ObsValue", NAME_SPACES)
        if period_node is None or value_node is None:
            continue
        period = period_node.attrib.get("value")
        value = value_node.attrib.get("value")
        if period is None or value is None:
            continue
        observations[period] = value
    series: pd.Series = pd.Series(observations).sort_index()

    # --- if we can, make the series values numeric (empty strings become NaN)
    series = series.replace("", np.nan)
    try:
        series = pd.to_numeric(series)
    except ValueError:
        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
        print(f"Could not convert series {meta.name} to numeric, keeping as is.")

    # --- if we can, make the index a PeriodIndex based on the frequency
    if "FREQ" not in meta.index:
        return series
    freq = meta["FREQ"]
    alias = period_aliases.get(freq)
    if alias is None:
        print(f"Unknown frequency {freq}, leaving index as is.")
        return series
    try:
        series.index = pd.PeriodIndex(series.index, freq=alias)
    except ValueError:
        # best effort, like the value conversion: unparseable labels must not abort the fetch
        print(
            f"Could not convert the index of series {meta.name} "
            f"to {freq} periods, leaving index as is."
        )

    return series


def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
    """Translate a raw metadata code into its codelist name when one is available.

    Args:
        meta_value (str): The raw value found in the XML.
        meta_id (str): The identifier of the metadata item the value belongs to.
        dims (FlowMetaDict): Per-item metadata; an entry is only used when it
            has both an "id" and a "package" key.

    Returns:
        str: The "name" recorded for ``meta_value`` in the codelist named by
            the entry's "id", or ``meta_value`` unchanged when the entry is
            missing, is not a codelist, or has no name for this value.

    """
    if meta_id not in dims:
        return meta_value
    dimension = dims[meta_id]
    if "id" not in dimension or "package" not in dimension:
        return meta_value

    codelist_id = dimension["id"]
    if not codelist_id or dimension["package"] != "codelist":
        return meta_value

    codes = code_lists(codelist_id)
    if meta_value in codes and "name" in codes[meta_value]:
        return codes[meta_value]["name"]
    return meta_value


def get_series_meta_data(
    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
) -> tuple[str, pd.Series]:
    """Extract and decode the metadata of one series from the XML tree.

    Args:
        flow_id (str): The ID of the data flow. It is the first component of
            the series label and is used to look up the flow's name.
        xml_series (Element): The XML element representing the series.
        series_count (int): The index of the series in the XML tree; only
            used to build placeholders for missing ids or values.
        dims (FlowMetaDict): Metadata dimensions, passed to decode_meta_value().

    Returns:
        tuple[str, pd.Series]: The series label (the flow ID and every raw
            metadata value joined with ".") and a Series of decoded metadata
            items named with that label.

    """
    flow_entry = data_flows().get(flow_id, {"name": flow_id})
    meta_items = {"DATAFLOW": flow_entry["name"]}  # the flow name (the ID if unknown)
    label_parts = [flow_id]
    value_index = 0  # runs across both sections so placeholders stay distinct

    for section in ("SeriesKey", "Attributes"):
        container = xml_series.find(f"gen:{section}", NAME_SPACES)
        if container is None:
            print(f"No {section} found in series, skipping.")
            continue
        for value_node in container.findall("gen:Value", NAME_SPACES):
            attrs = value_node.attrib
            meta_id = attrs.get("id", f"missing meta_id {series_count}-{value_index}")
            raw_value = attrs.get("value", f"missing meta_value {series_count}-{value_index}")
            label_parts.append(raw_value)
            meta_items[meta_id] = decode_meta_value(raw_value, meta_id, dims)
            value_index += 1

    label = ".".join(label_parts)
    return label, pd.Series(meta_items).rename(label)


def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract the data and metadata of every series in the XML tree.

    Args:
        flow_id (str): The ID of the data flow the tree was fetched for.
        tree (Element): The root of the XML response.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The data (one column per series
            label) and the metadata (one row per series label). Both are
            empty when the tree contains no series.

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    all_series = tree.findall(".//gen:Series", NAME_SPACES)
    if not all_series:
        # findall() never yields None, so an empty result is the only "no series" case
        print("No Series found in XML tree, skipping.")

    meta = {}
    data = {}
    for series_count, xml_series in enumerate(all_series):
        label, series_meta = get_series_meta_data(flow_id, xml_series, series_count, dims)
        if label in meta:
            # this can happen if you implicitly select the same series multiple times
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = series_meta
        series = get_series_data(xml_series, series_meta)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta


# === public functions ===
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): Reserved for constraints on the
            data items. Not implemented yet: the argument is currently ignored.
        validate (bool): Passed to build_key(); if True, print validation
            diagnostics for the proposed dimensions. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to
            acquire_xml(). "modality" defaults to "prefer-cache".

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Notes:
        If the `dims` argument is not valid you will get a CacheError or HttpError.
        If the `flow_id` is not valid, you will get a ValueError.

    """
    _not_implemented = constraints

    # --- prepare to get the XML root from the ABS SDMX API
    if "modality" not in kwargs:
        kwargs["modality"] = "prefer-cache"
    key = build_key(flow_id, dims, validate=validate)
    xml_root = acquire_xml(f"{URL_STEM}/data/{flow_id}/{key}", **kwargs)
    return extract(flow_id, xml_root)


# --- quick and dirty testing
if __name__ == "__main__":
    # Example usage
    FLOW_ID = "WPI"
    DIMS = {
        "MEASURE": "3",
        "INDEX": "OHRPEB",
        "SECTOR": "7",
        "INDUSTRY": "TOT",
        "TSEST": "10",
        "REGION": "AUS",
        "FREQ": "Q",
    }

    FETCHED_DATA, FETCHED_META = fetch(
        FLOW_ID,
        dims=DIMS,
        validate=True,
        modality="prefer-url",
    )
    # Note: The transpose (.T) is used here to make the output more readable
    print("\nFetched Data:\n", FETCHED_DATA.T, sep="")
    print("\nFetched Metadata:\n", FETCHED_META.T, sep="")
def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
    """Extract the observations of a single series from its XML element.

    Args:
        xml_series (Element): The series element whose ``gen:Obs`` children
            hold the observations.
        meta (pd.Series): Metadata for the series. Its ``name`` is used in
            diagnostics and its ``FREQ`` item (if present) selects the
            frequency of the resulting PeriodIndex.

    Returns:
        pd.Series: The observations sorted by their period label. Values are
            numeric when conversion succeeds, otherwise left as found. The
            index is a PeriodIndex when ``FREQ`` is recognised and the labels
            parse, otherwise the original labels.

    """
    # display name of the frequency (as found in meta["FREQ"]) -> pandas period alias
    period_aliases = {
        "Annual": "Y",
        "Quarterly": "Q",
        "Monthly": "M",
        "Daily": "D",
        "Daily or businessweek": "D",
    }

    # --- collect {period label: raw value}; observations lacking either part are skipped
    observations: dict[str, str] = {}
    for obs in xml_series.findall("gen:Obs", NAME_SPACES):
        period_node = obs.find("gen:ObsDimension", NAME_SPACES)
        value_node = obs.find("gen:ObsValue", NAME_SPACES)
        if period_node is None or value_node is None:
            continue
        period = period_node.attrib.get("value")
        value = value_node.attrib.get("value")
        if period is None or value is None:
            continue
        observations[period] = value
    series: pd.Series = pd.Series(observations).sort_index()

    # --- if we can, make the series values numeric (empty strings become NaN)
    series = series.replace("", np.nan)
    try:
        series = pd.to_numeric(series)
    except ValueError:
        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
        print(f"Could not convert series {meta.name} to numeric, keeping as is.")

    # --- if we can, make the index a PeriodIndex based on the frequency
    if "FREQ" not in meta.index:
        return series
    freq = meta["FREQ"]
    alias = period_aliases.get(freq)
    if alias is None:
        print(f"Unknown frequency {freq}, leaving index as is.")
        return series
    try:
        series.index = pd.PeriodIndex(series.index, freq=alias)
    except ValueError:
        # best effort, like the value conversion: unparseable labels must not abort the fetch
        print(
            f"Could not convert the index of series {meta.name} "
            f"to {freq} periods, leaving index as is."
        )

    return series
Extract observed data from the XML tree for a given single series.
def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
    """Translate a raw metadata code into its codelist name when one is available.

    Args:
        meta_value (str): The raw value found in the XML.
        meta_id (str): The identifier of the metadata item the value belongs to.
        dims (FlowMetaDict): Per-item metadata; an entry is only used when it
            has both an "id" and a "package" key.

    Returns:
        str: The "name" recorded for ``meta_value`` in the codelist named by
            the entry's "id", or ``meta_value`` unchanged when the entry is
            missing, is not a codelist, or has no name for this value.

    """
    if meta_id not in dims:
        return meta_value
    dimension = dims[meta_id]
    if "id" not in dimension or "package" not in dimension:
        return meta_value

    codelist_id = dimension["id"]
    if not codelist_id or dimension["package"] != "codelist":
        return meta_value

    codes = code_lists(codelist_id)
    if meta_value in codes and "name" in codes[meta_value]:
        return codes[meta_value]["name"]
    return meta_value
Decode a metadata value based on its ID and the relevant ABS codelist.
def get_series_meta_data(
    flow_id: str, xml_series: Element, series_count: int, dims: FlowMetaDict
) -> tuple[str, pd.Series]:
    """Extract and decode the metadata of one series from the XML tree.

    Args:
        flow_id (str): The ID of the data flow. It is the first component of
            the series label and is used to look up the flow's name.
        xml_series (Element): The XML element representing the series.
        series_count (int): The index of the series in the XML tree; only
            used to build placeholders for missing ids or values.
        dims (FlowMetaDict): Metadata dimensions, passed to decode_meta_value().

    Returns:
        tuple[str, pd.Series]: The series label (the flow ID and every raw
            metadata value joined with ".") and a Series of decoded metadata
            items named with that label.

    """
    flow_entry = data_flows().get(flow_id, {"name": flow_id})
    meta_items = {"DATAFLOW": flow_entry["name"]}  # the flow name (the ID if unknown)
    label_parts = [flow_id]
    value_index = 0  # runs across both sections so placeholders stay distinct

    for section in ("SeriesKey", "Attributes"):
        container = xml_series.find(f"gen:{section}", NAME_SPACES)
        if container is None:
            print(f"No {section} found in series, skipping.")
            continue
        for value_node in container.findall("gen:Value", NAME_SPACES):
            attrs = value_node.attrib
            meta_id = attrs.get("id", f"missing meta_id {series_count}-{value_index}")
            raw_value = attrs.get("value", f"missing meta_value {series_count}-{value_index}")
            label_parts.append(raw_value)
            meta_items[meta_id] = decode_meta_value(raw_value, meta_id, dims)
            value_index += 1

    label = ".".join(label_parts)
    return label, pd.Series(meta_items).rename(label)
Extract and decode metadata from the XML tree for one given series.
Args: flow_id (str): The ID of the data flow the series belongs to; it forms the first part of the series label. xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.
Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract the data and metadata of every series in the XML tree.

    Args:
        flow_id (str): The ID of the data flow the tree was fetched for.
        tree (Element): The root of the XML response.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The data (one column per series
            label) and the metadata (one row per series label). Both are
            empty when the tree contains no series.

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    all_series = tree.findall(".//gen:Series", NAME_SPACES)
    if not all_series:
        # findall() never yields None, so an empty result is the only "no series" case
        print("No Series found in XML tree, skipping.")

    meta = {}
    data = {}
    for series_count, xml_series in enumerate(all_series):
        label, series_meta = get_series_meta_data(flow_id, xml_series, series_count, dims)
        if label in meta:
            # this can happen if you implicitly select the same series multiple times
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = series_meta
        series = get_series_data(xml_series, series_meta)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
Extract data from the XML tree.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): Reserved for constraints on the
            data items. Not implemented yet: the argument is currently ignored.
        validate (bool): Passed to build_key(); if True, print validation
            diagnostics for the proposed dimensions. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to
            acquire_xml(). "modality" defaults to "prefer-cache".

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Notes:
        If the `dims` argument is not valid you will get a CacheError or HttpError.
        If the `flow_id` is not valid, you will get a ValueError.

    """
    _not_implemented = constraints

    # --- prepare to get the XML root from the ABS SDMX API
    if "modality" not in kwargs:
        kwargs["modality"] = "prefer-cache"
    key = build_key(flow_id, dims, validate=validate)
    xml_root = acquire_xml(f"{URL_STEM}/data/{flow_id}/{key}", **kwargs)
    return extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): Reserved for a dictionary of constraints to apply to the data items. Not implemented yet: the argument is currently ignored. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Notes:
If the dims argument is not valid you will get a CacheError or HttpError.
If the flow_id is not valid, you will get a ValueError.