sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

# --- re-export the public names from the package submodules
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_gdp import fetch_gdp
from .fetch_multi import fetch_multi
from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
from .flow_metadata import code_lists, data_dimensions, data_flows

# --- version and author
try:
    # read the version recorded by the installer for this distribution
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
    "fetch_gdp",
    "fetch_multi",
    "fetch_selection",
    "make_wanted",
    "match_item",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
    # NOTE(review): fetch() defaults modality to "prefer-url" (hyphen) while the
    # attribute docstring above lists "prefer_url" (underscore) -- confirm the
    # canonical spelling against the ModalityType definition.
TypedDict for acquire_url function arguments.
A problem retrieving data using HTTP.
class MatchType(Enum):
    """The supported kinds of pattern matching."""

    # integer values are part of the public contract; do not renumber
    EXACT = 1
    PARTIAL = 2
    REGEX = 3
Enumeration for match types.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    Guarantees for the inner dictionary:
        - The inner dictionary will always have a "name" key.
        - The inner dictionary may have a "parent" key if the code has a parent.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue  # a code without an id cannot be keyed
        elements: dict[str, str] = {}

        # - get the name (mandatory: guarantees every inner dict has a "name" key)
        name = code.find("com:Name", NAME_SPACES)
        if name is None or not name.text:
            # bug fix: the original message read "{cl_id}has" with no space
            print(f"Warning: Code {code_id} in {cl_id} has no name, skipping.")
            continue  # skip if no name
        elements["name"] = name.text

        # - get the parent (optional)
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        if parent_id:  # Only add if not empty
            elements["parent"] = parent_id

        codes[code_id] = elements

    return codes
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    dimensions = {}
    for dimension in tree.findall(".//str:Dimension", NAME_SPACES):
        identifier = dimension.get("id")
        position = dimension.get("position")
        # both an id and a position are required to record the dimension
        if identifier is None or position is None:
            continue
        details = {"position": position}
        local_rep = dimension.find("str:LocalRepresentation", NAME_SPACES)
        if local_rep is not None:
            enumeration = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
            if enumeration is not None:
                # fold the enumeration reference attributes into the metadata
                details = details | enumeration.attrib
        dimensions[identifier] = details
    return dimensions
Get the data dimensions metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
        and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        # copy so that popping "id" below does not mutate the element's attributes
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue  # an id is mandatory; skip malformed entries
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(missing name)"
        attributes["name"] = str(df_name)  # str(...) because pylance complains about it being None
        d_flows[df_id] = attributes
    return d_flows
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
            to the data request. Supported parameters include:
            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
            If None, no parameters are applied.
        validate (bool, optional): If True, validate `dims` against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        ValueError: If invalid parameter values are provided.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}")

    # --- only a closed set of values is acceptable for the 'detail' parameter
    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
    detail_value = parameters.get("detail") if parameters else None
    if detail_value and detail_value not in valid_detail_values:
        raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")

    # --- prepare to get the XML root from the ABS SDMX API
    # prefer fresh data every time
    # NOTE(review): "prefer-url" (hyphen) vs "prefer_url" in the GetFileKwargs
    # docstring -- confirm against the ModalityType definition.
    kwargs["modality"] = kwargs.get("modality", "prefer-url")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )

    # --- build URL, appending only the recognised query parameters, in a fixed order
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        url_params = [
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        ]
        if url_params:
            url = f"{url}?{'&'.join(url_params)}"

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args:
flow_id (str): The ID of the data flow from which to retrieve data items.
dims (dict[str, str], optional): A dictionary of dimensions to select the
data items. If None, the ABS fetch request will be for all data items,
which can be slow.
parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
to the data request. Supported parameters include:
- 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
- 'endPeriod': End period for data filtering (e.g., '2023-Q4')
- 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
If None, no parameters are applied.
validate (bool, optional): If True, validate dims against the flow's
required dimensions when generating the URL key. Defaults to False.
**kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.
Notes:
If the dims argument is not valid you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            - "o": Original data without seasonal adjustment (default)
            - "s": Seasonally adjusted data
            - "t": Trend data
        price_measure (str): Price measure type:
            - "cp": Current prices (default)
            - "cvm": Chain volume measures
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # guard against arguments outside the supported maps
    if seasonality not in SEAS_MAP:
        raise ValueError(f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}")
    if price_measure not in PRICE_MAP:
        raise ValueError(f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}")

    # express the GDP request as exact-match selection criteria
    selection_criteria = [
        match_item(SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        match_item(PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        match_item("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]

    # return the data from the national accounts aggregates flow
    return fetch_selection("ANA_AGG", selection_criteria, parameters, validate=validate, **kwargs)
Fetch quarterly GDP data in $ from the ABS SDMX API.
Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata
Raises: ValueError: If invalid seasonality or price_measure values are provided
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with rows for each desired data set (of one or more series).
            Each row should contain the necessary identifiers to fetch the dataset.
            The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
            The 'flow_id' column is mandatory, and the rest are optional.
            Note: the DataFrame index is not used in the fetching process.
        parameters: A dictionary of additional parameters to pass to the fetch function.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError are raised by the fetch function.
        These will be caught and reported to standard output.

    Note:
        The function validates that all datasets have compatible index types.
        A ValueError will be raised if incompatible index types are detected
        (e.g., mixing quarterly and monthly data).

    """
    # --- report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- quick sanity checks: nothing requested means nothing returned
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into a one-row "wanted" frame, then delegate
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
        into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Should build a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned.

    """
    # confirm the flow exists and obtain its dimension metadata
    flow_dimensions = _validate_flow_and_dimensions(flow_id)
    # resolve every criterion against the flow's dimensions
    selection = _process_match_criteria(criteria, flow_id, flow_dimensions)

    # record the flow and deliver a single-row, all-string DataFrame
    selection["flow_id"] = flow_id
    return pd.DataFrame([selection]).astype(str)
Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the
intersection of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple representing the match element.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in select_items() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A tuple representing the match element.