sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

# --- re-export the public names from each submodule so callers can use
#     the package as a single flat namespace (e.g. `sdmxabs.fetch(...)`).
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_multi import fetch_multi
from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
from .flow_metadata import code_lists, data_dimensions, data_flows

# --- version and author
# Read the version from the installed package metadata; an uninstalled
# source checkout has no metadata, so fall back to a sentinel version.
try:
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
    "fetch_multi",
    "fetch_selection",
    "make_wanted",
    "match_item",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
TypedDict for acquire_url function arguments.
A problem retrieving data using HTTP.
class MatchType(Enum):
    """The supported ways of comparing a pattern against metadata values."""

    EXACT = 1  # the pattern must equal the candidate value
    PARTIAL = 2  # the pattern may appear anywhere within the value
    REGEX = 3  # the pattern is a regular expression
Enumeration for match types.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary keyed by code ID; each value holds the
        code's key=value pairs. A "name" key should always be present, and
        a "parent" key may be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    result: FlowMetaDict = {}
    for node in tree.findall(".//str:Code", NAME_SPACES):
        # codes without an id cannot be keyed, so skip them
        key = node.get("id", None)
        if key is None:
            continue

        entry: dict[str, str] = {}
        name_node = node.find("com:Name", NAME_SPACES)
        entry["name"] = "(missing)" if name_node is None or not name_node.text else name_node.text

        # a code may reference a parent code via <str:Parent><Ref id=.../>
        parent_node = node.find("str:Parent", NAME_SPACES)
        if parent_node is not None:
            ref_node = parent_node.find("Ref", NAME_SPACES)
            if ref_node is not None and (parent_key := str(ref_node.get("id", ""))):
                entry["parent"] = parent_key

        result[key] = entry

    return result
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary keyed by dimension ID; each
        value holds the dimension's metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    found = {}
    for node in tree.findall(".//str:Dimension", NAME_SPACES):
        ident = node.get("id")
        position = node.get("position")
        # both an id and a position are required to record the dimension
        if ident is None or position is None:
            continue

        details = {"position": position}
        # merge in the codelist reference attributes, when present
        local_rep = node.find("str:LocalRepresentation", NAME_SPACES)
        if local_rep is not None:
            enum_ref = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
            if enum_ref is not None:
                details = details | enum_ref.attrib

        found[ident] = details

    return found
Get the data dimensions metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
        and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        # Guard against a missing element AND a present-but-empty one:
        # previously an empty <com:Name/> stored the literal string "None".
        # The fallback text mirrors the sibling code_lists() handling.
        attributes["name"] = name_elem.text if name_elem is not None and name_elem.text else "(missing name)"
        d_flows[df_id] = attributes
    return d_flows
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
196def fetch( 197 flow_id: str, 198 dims: dict[str, str] | None = None, 199 parameters: dict[str, str] | None = None, 200 *, 201 validate: bool = False, 202 **kwargs: Unpack[GetFileKwargs], 203) -> tuple[pd.DataFrame, pd.DataFrame]: 204 """Fetch data from the ABS SDMX API. 205 206 Args: 207 flow_id (str): The ID of the data flow from which to retrieve data items. 208 dims (dict[str, str], optional): A dictionary of dimensions to select the 209 data items. If None, the ABS fetch request will be for all data items, 210 which can be slow. 211 parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply 212 to the data request. Supported parameters include: 213 - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') 214 - 'endPeriod': End period for data filtering (e.g., '2023-Q4') 215 - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') 216 If None, no parameters are applied. 217 validate (bool): If True, print validation diagnostics for the proposed 218 dimensions against the metadata requirements. Defaults to False. 219 **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml(). 220 221 Returns: a tuple of two DataFrames: 222 - The first DataFrame contains the fetched data. 223 - The second DataFrame contains the metadata. 224 225 Raises: 226 HttpError: If there is an issue with the HTTP request. 227 CacheError: If there is an issue with the cache. 228 ValueError: If no XML root is found in the response. 229 ValueError: If invalid parameter values are provided. 230 231 Notes: 232 If the `dims` argument is not valid you should get a CacheError or HttpError. 233 If the `flow_id` is not valid, you should get a ValueError. 
234 235 """ 236 # --- debugging output 237 verbose = kwargs.get("verbose", False) 238 if verbose: 239 print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}") 240 241 # --- validate parameters 242 valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"} 243 if parameters: 244 detail_value = parameters.get("detail") 245 if detail_value and detail_value not in valid_detail_values: 246 raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}") 247 248 # --- prepare to get the XML root from the ABS SDMX API 249 # prefer fresh data every time 250 kwargs["modality"] = kwargs.get("modality", "prefer-url") 251 key = build_key( 252 flow_id, 253 dims, 254 validate=validate, 255 ) 256 257 # --- build URL with optional parameters 258 url = f"{URL_STEM}/data/{flow_id}/{key}" 259 if parameters: 260 url_params = [] 261 if "startPeriod" in parameters: 262 url_params.append(f"startPeriod={parameters['startPeriod']}") 263 if "endPeriod" in parameters: 264 url_params.append(f"endPeriod={parameters['endPeriod']}") 265 if "detail" in parameters: 266 url_params.append(f"detail={parameters['detail']}") 267 if url_params: 268 url += "?" + "&".join(url_params) 269 270 xml_root = acquire_xml(url, **kwargs) 271 return _extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply to the data request. Supported parameters include: - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') - 'endPeriod': End period for data filtering (e.g., '2023-Q4') - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') If None, no parameters are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.
Notes:
If the dims argument is not valid you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
114def fetch_multi( 115 wanted: pd.DataFrame, 116 parameters: dict[str, str] | None = None, 117 *, 118 validate: bool = False, 119 **kwargs: Unpack[GetFileKwargs], 120) -> tuple[pd.DataFrame, pd.DataFrame]: 121 """Fetch multiple SDMX datasets based on a DataFrame of desired datasets. 122 123 Args: 124 wanted: A DataFrame with rows for each desired data set (of one or more series). 125 Each row should contain the necessary identifiers to fetch the dataset. 126 The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. 127 The 'flow_id' column is mandatory, and the rest are optional. 128 Note: the DataFrame index is not used in the fetching process. 129 parameters: A dictionary of additional parameters to pass to the fetch function. 130 validate: If True, the function will validate dimensions and values against 131 the ABS SDMX API codelists. Defaults to False. 132 **kwargs: Additional keyword arguments passed to the underlying data fetching function. 133 134 Returns: 135 A tuple containing two DataFrames: 136 - The first DataFrame contains the fetched data. 137 - The second DataFrame contains metadata about the fetched datasets. 138 139 Raises: 140 ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame. 141 142 Note: 143 CacheError and HttpError are raised by the fetch function. 144 These will be caught and reported to standard output. 145 146 Note: 147 The function validates that all datasets have compatible index types. 148 A ValueError will be raised if incompatible index types are detected 149 (e.g., mixing quarterly and monthly data). 
150 151 """ 152 # --- debugging output 153 verbose = kwargs.get("verbose", False) 154 if verbose: 155 print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}") 156 157 # --- quick sanity checks 158 if wanted.empty: 159 print("wanted DataFrame is empty, returning empty DataFrames.") 160 return pd.DataFrame(), pd.DataFrame() 161 if "flow_id" not in wanted.columns: 162 raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.") 163 164 # --- do the work 165 return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into a one-row "wanted" frame, then delegate
    return fetch_multi(make_wanted(flow_id, criteria), parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, suitable for
        passing directly to fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Builds a one-line DataFrame. That single row may still select multiple
          data series when passed to fetch_multi, and it can be concatenated
          with other DataFrames to build a larger selection.
        - If two match elements refer to the same dimension, only the
          `intersection` of the matches will be returned.

    """
    # resolve the flow's dimensions, then reduce the criteria against them
    dims = _validate_flow_and_dimensions(flow_id)
    selected = _process_match_criteria(criteria, flow_id, dims)

    # tag the row with its flow and return everything as strings
    selected["flow_id"] = flow_id
    return pd.DataFrame([selected]).astype(str)
Build a wanted DataFrame for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The (pattern, dimension, match_type) tuple.

    """
    # NOTE: the docstring previously claimed a default of MatchType.EXACT and
    # a "MatchElement" return; both are corrected above to match the code.
    return (pattern, dimension, match_type)
Create a new MatchItem for use in select_items() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A tuple representing the match element.