sdmxabs.fetch
Obtain data from the ABS SDMX API.
"""Obtain data from the ABS SDMX API."""

import warnings
from typing import Unpack
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

from sdmxabs.download_cache import GetFileKwargs
from sdmxabs.flow_metadata import FlowMetaDict, build_key, code_lists, data_dimensions
from sdmxabs.xml_base import NAME_SPACES, URL_STEM, acquire_xml

# Decoded "FREQ" metadata values mapped to the pandas period alias used for the index.
_PERIOD_ALIASES = {
    "Annual": "Y",
    "Quarterly": "Q",
    "Monthly": "M",
    "Daily": "D",
    "Daily or businessweek": "D",
}


def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
    """Extract observed data from the XML tree for a given single series.

    Args:
        xml_series (Element): The XML element for one series; its "gen:Obs"
            children hold the observations.
        meta (pd.Series): The metadata for the series. Its "FREQ" item, when
            present, selects the period frequency of the index, and its name
            is used in diagnostic messages.

    Returns:
        pd.Series: The observations, sorted by their period label. Values are
            numeric when every value converts; otherwise they are left as
            extracted. The index is a PeriodIndex when the frequency is
            recognised and every label parses; otherwise it is left as is.

    """
    observations = {}
    for obs in xml_series.findall("gen:Obs", NAME_SPACES):
        period_element = obs.find("gen:ObsDimension", NAME_SPACES)
        value_element = obs.find("gen:ObsValue", NAME_SPACES)
        # Compare against None explicitly: an Element without children is falsy.
        if period_element is None or value_element is None:
            continue
        period_label = period_element.attrib.get("value")
        observed = value_element.attrib.get("value")
        if period_label is None or observed is None:
            continue
        observations[period_label] = observed
    series: pd.Series = pd.Series(observations).sort_index()

    # --- if we can, make the series values numeric
    series = series.replace("", np.nan)
    try:
        series = pd.to_numeric(series)
    except ValueError:
        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
        print(f"Could not convert series {meta.name} to numeric, keeping as is.")

    # --- if we can, make the index a PeriodIndex based on the frequency
    if "FREQ" in meta.index:
        freq = meta["FREQ"]
        alias = _PERIOD_ALIASES.get(freq)
        if alias is None:
            print(f"Unknown frequency {freq}, leaving index as is.")
        else:
            try:
                series.index = pd.PeriodIndex(series.index, freq=alias)
            except ValueError:
                # One unparseable period label should not abort the whole
                # extraction; fall back to the raw labels instead.
                print(f"Could not convert index of series {meta.name} to {freq} periods, leaving index as is.")

    return series


def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
    """Decode a metadata value based on its ID and the relevant ABS codelist.

    The raw value is returned unchanged unless dims has an entry for meta_id
    with a non-empty "id" and a "package" of "codelist", and that codelist
    holds a "name" for the value.
    """
    if meta_id not in dims:
        return meta_value
    dimension = dims[meta_id]
    if "id" not in dimension or "package" not in dimension:
        return meta_value
    codelist_id = dimension["id"]
    if not codelist_id or dimension["package"] != "codelist":
        return meta_value

    codes = code_lists(codelist_id)
    if meta_value not in codes:
        return meta_value
    entry = codes[meta_value]
    return entry["name"] if "name" in entry else meta_value


def get_series_meta_data(xml_series: Element, series_count: int, dims: FlowMetaDict) -> tuple[str, pd.Series]:
    """Extract and decode metadata from the XML tree for one given series.

    Args:
        xml_series (Element): The XML element representing the series.
        series_count (int): The index of the series in the XML tree; only used
            to build placeholder text for missing IDs or values.
        dims (FlowMetaDict): Dictionary containing metadata dimensions and
            their associated codelist names.

    Returns:
        tuple[str, pd.Series]: A tuple containing the series label (the raw
            metadata values joined with ".") and a Series of decoded metadata
            items, named with that label.

    """
    label_parts = []
    decoded_items = {}
    position = 0  # running item number across both containers, used in placeholders
    for container_name in ("SeriesKey", "Attributes"):
        container = xml_series.find(f"gen:{container_name}", NAME_SPACES)
        if container is None:
            print(f"No {container_name} found in series, skipping.")
            continue
        for value_element in container.findall("gen:Value", NAME_SPACES):
            # --- get the metadata item ID and value, or create a placeholder if missing
            meta_id = value_element.attrib.get("id", f"missing meta_id {series_count}-{position}")
            raw_value = value_element.attrib.get("value", f"missing meta_value {series_count}-{position}")
            label_parts.append(raw_value)
            decoded_items[meta_id] = decode_meta_value(raw_value, meta_id, dims)
            position += 1

    label = ".".join(label_parts)  # create a unique label for the series
    return label, pd.Series(decoded_items).rename(label)


def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract the data and metadata for every series in the XML tree.

    Args:
        flow_id (str): The ID of the data flow; used to look up the data
            dimensions that give access to the metadata codelists.
        tree (Element): The XML element to search for "gen:Series" elements.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The data (one column per series
            label) and the metadata (one row per series label).

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    all_series = tree.findall(".//gen:Series", NAME_SPACES)
    if not all_series:
        # findall() never yields None, so an empty result is the only way a
        # missing series can show up; empty frames are returned below.
        print(f"No Series found in XML tree for {flow_id}.")

    meta = {}
    data = {}
    for series_count, xml_series in enumerate(all_series):
        label, dataset = get_series_meta_data(xml_series, series_count, dims)
        if label in meta:
            # this should not happen, but if it does, skip the series
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta


# === public functions ===
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): Not implemented yet. The
            argument is ignored; a warning is issued when it is non-empty.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to
            acquire_xml(). "modality" defaults to "prefer-cache".

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises (carried over from the original documentation; these come from the
    download helpers, which are not visible in this module):
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    if constraints:
        # Make the gap visible rather than silently returning unconstrained data.
        warnings.warn(
            "fetch(): constraints are not implemented yet and will be ignored.",
            stacklevel=2,
        )

    # --- prepare to get the XML root from the ABS SDMX API
    kwargs.setdefault("modality", "prefer-cache")
    key = build_key(flow_id, dims, validate=validate)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)


# --- quick and dirty testing
if __name__ == "__main__":
    # Example usage
    FLOW_ID = "WPI"
    DIMS = {
        "MEASURE": "3",
        "INDEX": "OHRPEB",
        "SECTOR": "7",
        "INDUSTRY": "TOT",
        "TSEST": "10",
        "REGION": "AUS",
        "FREQ": "Q",
    }

    FETCHED_DATA, FETCHED_META = fetch(
        FLOW_ID,
        dims=DIMS,
        validate=True,
        modality="prefer-url",
    )
    # Note: The transpose (.T) is used here to make the output more readable
    print("\nFetched Data:\n", FETCHED_DATA.T, sep="")
    print("\nFetched Metadata:\n", FETCHED_META.T, sep="")
# Decoded "FREQ" metadata values mapped to the pandas period alias used for the index.
_PERIOD_ALIASES = {
    "Annual": "Y",
    "Quarterly": "Q",
    "Monthly": "M",
    "Daily": "D",
    "Daily or businessweek": "D",
}


def get_series_data(xml_series: Element, meta: pd.Series) -> pd.Series:
    """Extract observed data from the XML tree for a given single series.

    Args:
        xml_series (Element): The XML element for one series; its "gen:Obs"
            children hold the observations.
        meta (pd.Series): The metadata for the series. Its "FREQ" item, when
            present, selects the period frequency of the index, and its name
            is used in diagnostic messages.

    Returns:
        pd.Series: The observations, sorted by their period label. Values are
            numeric when every value converts; otherwise they are left as
            extracted. The index is a PeriodIndex when the frequency is
            recognised and every label parses; otherwise it is left as is.

    """
    observations = {}
    for obs in xml_series.findall("gen:Obs", NAME_SPACES):
        period_element = obs.find("gen:ObsDimension", NAME_SPACES)
        value_element = obs.find("gen:ObsValue", NAME_SPACES)
        # Compare against None explicitly: an Element without children is falsy.
        if period_element is None or value_element is None:
            continue
        period_label = period_element.attrib.get("value")
        observed = value_element.attrib.get("value")
        if period_label is None or observed is None:
            continue
        observations[period_label] = observed
    series: pd.Series = pd.Series(observations).sort_index()

    # --- if we can, make the series values numeric
    series = series.replace("", np.nan)
    try:
        series = pd.to_numeric(series)
    except ValueError:
        # If conversion fails, keep the series as is (it may contain useful non-numeric data)
        print(f"Could not convert series {meta.name} to numeric, keeping as is.")

    # --- if we can, make the index a PeriodIndex based on the frequency
    if "FREQ" in meta.index:
        freq = meta["FREQ"]
        alias = _PERIOD_ALIASES.get(freq)
        if alias is None:
            print(f"Unknown frequency {freq}, leaving index as is.")
        else:
            try:
                series.index = pd.PeriodIndex(series.index, freq=alias)
            except ValueError:
                # One unparseable period label should not abort the whole
                # extraction; fall back to the raw labels instead.
                print(f"Could not convert index of series {meta.name} to {freq} periods, leaving index as is.")

    return series
Extract observed data from the XML tree for a given single series.
def decode_meta_value(meta_value: str, meta_id: str, dims: FlowMetaDict) -> str:
    """Decode a metadata value based on its ID and the relevant ABS codelist.

    The raw value is returned unchanged unless dims has an entry for meta_id
    with a non-empty "id" and a "package" of "codelist", and that codelist
    holds a "name" for the value.
    """
    if meta_id not in dims:
        return meta_value
    dimension = dims[meta_id]
    if "id" not in dimension or "package" not in dimension:
        return meta_value
    codelist_id = dimension["id"]
    if not codelist_id or dimension["package"] != "codelist":
        return meta_value

    # Only reached for a usable codelist reference.
    codes = code_lists(codelist_id)
    if meta_value not in codes:
        return meta_value
    entry = codes[meta_value]
    return entry["name"] if "name" in entry else meta_value
Decode a metadata value based on its ID and the relevant ABS codelist.
def get_series_meta_data(xml_series: Element, series_count: int, dims: FlowMetaDict) -> tuple[str, pd.Series]:
    """Extract and decode metadata from the XML tree for one given series.

    Args:
        xml_series (Element): The XML element representing the series.
        series_count (int): The index of the series in the XML tree; only used
            to build placeholder text for missing IDs or values.
        dims (FlowMetaDict): Dictionary containing metadata dimensions and
            their associated codelist names.

    Returns:
        tuple[str, pd.Series]: A tuple containing the series label (the raw
            metadata values joined with ".") and a Series of decoded metadata
            items, named with that label.

    """
    label_parts = []
    decoded_items = {}
    position = 0  # running item number across both containers, used in placeholders
    for container_name in ("SeriesKey", "Attributes"):
        container = xml_series.find(f"gen:{container_name}", NAME_SPACES)
        if container is None:
            print(f"No {container_name} found in series, skipping.")
            continue
        for value_element in container.findall("gen:Value", NAME_SPACES):
            # Fall back to a placeholder when the element lacks an id or a value.
            meta_id = value_element.attrib.get("id", f"missing meta_id {series_count}-{position}")
            raw_value = value_element.attrib.get("value", f"missing meta_value {series_count}-{position}")
            label_parts.append(raw_value)
            decoded_items[meta_id] = decode_meta_value(raw_value, meta_id, dims)
            position += 1

    # The label is built from the raw (undecoded) values, in document order.
    label = ".".join(label_parts)
    meta_series = pd.Series(decoded_items)
    return label, meta_series.rename(label)
Extract and decode metadata from the XML tree for one given series.
Args: xml_series (Element): The XML element representing the series. series_count (int): The index of the series in the XML tree. dims (FlowMetaDict): Dictionary containing metadata dimensions and their associated codelist names.
Returns: tuple[str, pd.Series]: A tuple containing the series label and a Series of metadata items for the series.
def extract(flow_id: str, tree: Element) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Extract the data and metadata for every series in the XML tree.

    Args:
        flow_id (str): The ID of the data flow; used to look up the data
            dimensions that give access to the metadata codelists.
        tree (Element): The XML element to search for "gen:Series" elements.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The data (one column per series
            label) and the metadata (one row per series label).

    """
    # Get the data dimensions for the flow_id, it provides entree to the metadata
    dims = data_dimensions(flow_id)

    all_series = tree.findall(".//gen:Series", NAME_SPACES)
    if not all_series:
        # findall() never yields None, so an empty result is the only way a
        # missing series can show up; empty frames are returned below.
        print(f"No Series found in XML tree for {flow_id}.")

    meta = {}
    data = {}
    for series_count, xml_series in enumerate(all_series):
        label, dataset = get_series_meta_data(xml_series, series_count, dims)
        if label in meta:
            # this should not happen, but if it does, skip the series
            print(f"Duplicate series {label} in {flow_id} found, skipping.")
            continue
        meta[label] = dataset
        series = get_series_data(xml_series, dataset)
        series.name = label
        data[label] = series

    return pd.DataFrame(data), pd.DataFrame(meta).T  # data, meta
Extract data from the XML tree.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): Not implemented yet. The
            argument is ignored; a warning is issued when it is non-empty.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to
            acquire_xml(). "modality" defaults to "prefer-cache".

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises (carried over from the original documentation; these come from the
    download helpers, which are not visible in this block):
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    if constraints:
        # Make the gap visible rather than silently returning unconstrained data.
        import warnings

        warnings.warn(
            "fetch(): constraints are not implemented yet and will be ignored.",
            stacklevel=2,
        )

    # --- prepare to get the XML root from the ABS SDMX API
    kwargs.setdefault("modality", "prefer-cache")
    key = build_key(flow_id, dims, validate=validate)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. Not implemented yet: the argument is currently accepted but ignored. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.