sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
1"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.""" 2 3from importlib.metadata import PackageNotFoundError, version 4 5from .download_cache import ( 6 CacheError, 7 GetFileKwargs, 8 HttpError, 9 ModalityType, 10) 11from .fetch import fetch 12from .fetch_gdp import fetch_gdp 13from .fetch_multi import fetch_multi 14from .fetch_pop import fetch_pop, fetch_state_pop 15from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item 16from .flow_metadata import ( 17 FlowMetaDict, 18 code_list_for, 19 code_lists, 20 data_flows, 21 data_structures, 22 frame, 23 structure_from_flow_id, 24 structure_ident, 25) 26from .measures import measure_names, recalibrate, recalibrate_series 27 28# --- version and author 29try: 30 __version__ = version(__name__) 31except PackageNotFoundError: 32 __version__ = "0.0.0" # Fallback for development mode 33__author__ = "Bryan Palmer" 34 35# --- establish the package contents 36__all__ = [ 37 "CacheError", 38 "FlowMetaDict", 39 "GetFileKwargs", 40 "HttpError", 41 "MatchCriteria", 42 "MatchItem", 43 "MatchType", 44 "ModalityType", 45 "__author__", 46 "__version__", 47 "code_list_for", 48 "code_lists", 49 "data_flows", 50 "data_structures", 51 "fetch", 52 "fetch_gdp", 53 "fetch_multi", 54 "fetch_pop", 55 "fetch_selection", 56 "fetch_state_pop", 57 "frame", 58 "make_wanted", 59 "match_item", 60 "measure_names", 61 "recalibrate", 62 "recalibrate_series", 63 "structure_from_flow_id", 64 "structure_ident", 65]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url() function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
TypedDict for acquire_url function arguments.
A problem retrieving data using HTTP.
class MatchType(Enum):
    """Enumeration for match types."""

    EXACT = 1  # pattern must equal the candidate text exactly
    PARTIAL = 2  # pattern is matched as a substring (presumably; see matching code)
    REGEX = 3  # pattern is treated as a regular expression
Enumeration for match types.
@cache
def code_list_for(struct_id: str, dim_name: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Return the code list attached to one dimension/attribute of a data structure.

    Args:
        struct_id (str): The data structure ID.
        dim_name (str): The dimension or attribute ID whose code list is wanted.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and their metadata.

    Raises:
        ValueError: If the structure is unknown, the dimension/attribute is
            absent, or it has no associated codelist.

    """
    structure = data_structures(struct_id, **kwargs)
    if not structure:
        raise ValueError(f"No structure found for structure ID '{struct_id}'")

    details = structure.get(dim_name)
    if details is None:
        raise ValueError(f"Dimension/Attribute '{dim_name}' not found in structure: '{struct_id}'")

    codelist_id = details.get(CODE_LIST_ID, "")
    if not codelist_id:
        raise ValueError(
            f"No codelist found for dimension/attribute '{dim_name}' in structure ID '{struct_id}'"
        )

    return code_lists(codelist_id, **kwargs)
Get the code list for a specific dimension or attribute in a data structure.
Args: struct_id (str): The data structure ID. dim_name (str): The dimension or attribute ID to retrieve the code list for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their metadata.
Raises: ValueError: If the dimension/attribute is not found in the structure.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    Guarantees for the inner dictionary:
        - The inner dictionary will always have a "name" key.
        - The inner dictionary may have a "parent" key if the code has a parent.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue  # a code without an id cannot be keyed
        elements: dict[str, str] = {}

        # - get the name (guaranteed key)
        name = code.find("com:Name", NAME_SPACES)
        if name is None or not name.text:
            # guarantee that we have a name key and value pair
            # BUG FIX: warning previously printed "{cl_id}has" (missing space)
            print(f"Warning: Code {code_id} in {cl_id} has no name, skipping.")
            continue  # skip if no name
        elements["name"] = name.text

        # - get the parent (optional key)
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        if parent_id:  # Only add if not empty
            elements["parent"] = parent_id

        codes[code_id] = elements

    return codes
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary keyed by dataflow ID, with
        metadata as key=value pairs — including the data-structure identifier
        used to retrieve the dimensions and attributes metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Guarantees:
        - every inner dictionary contains the keys "flow_name" and
          "data_structure_id"; dataflows missing either are ignored.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    found: FlowMetaDict = {}
    for node in tree.findall(".//str:Dataflow", NAME_SPACES):
        meta: dict[str, str] = dict(node.attrib)
        if "id" not in meta:
            continue  # cannot key a flow without an id
        flow_key = meta.pop("id")

        # the human-readable name (guaranteed key)
        name_node = node.find("com:Name", NAME_SPACES)
        meta[FLOW_NAME] = str(name_node.text if name_node is not None else "(missing name)")

        # the data-structure reference (guaranteed key, else skip)
        ref = node.find("str:Structure/Ref", NAME_SPACES)
        if ref is None:
            continue
        structure_id = ref.get("id", "")
        if not structure_id:
            continue
        meta[DATA_STRUCT_ID] = structure_id

        found[flow_key] = meta
    return found
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs. Importantly, it includes the DATASTRUCTURE identifier, which is used to retrieve the dimensions and attributes metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Guarantees: - data_flows(): the returned inner dictionary from data_flows() will always contain the keys "flow_name" and "data_structure_id" for each dataflow. Any XML without these keys from the ABS is ignored.
@cache
def data_structures(struct_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data structure for a specific structure ID from the ABS SDMX API.

    Args:
        struct_id (str): The ID of the data structure to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        The dimensions metadata includes a "position" for each dimension.
        The attributes metadata does not have "position" information.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{struct_id}", **kwargs)

    elements = {}
    # Dimensions and Attributes are parsed identically, except that only
    # Dimensions carry a "position" entry.
    for ident in ("Dimension", "Attribute"):
        for elem in tree.findall(f".//str:{ident}", NAME_SPACES):
            element_id = elem.get("id")
            if element_id is None:
                continue  # an element without an id cannot be keyed
            contents = {}
            if ident == "Dimension":
                contents[POSITION] = elem.get(POSITION, "")
            # If the element enumerates its values via a codelist reference,
            # fold the reference attributes into the metadata.
            if (lr := elem.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
                enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
            ) is not None:
                contents = contents | enumer.attrib
                # --- check we have a code list, and give it a better name
                code_list_id = contents.pop("id", "")
                if not code_list_id or contents.get("package") != "codelist":
                    # NOTE(review): a Ref that is not a usable codelist drops the
                    # whole element (not just the reference) — confirm intended.
                    continue
                contents[CODE_LIST_ID] = code_list_id
            elements[element_id] = contents
    return elements
Get the data structure for a specific structure ID from the ABS SDMX API.
Args: struct_id (str): The ID of the data structure to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: The dimensions metadata includes a "position" for each dimension. The attributes metadata does not have "position" information.
def fetch(
    flow_id: str,
    selection: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        selection (dict[str, str], optional): A dictionary of dimension=value pairs
            to select the data items. If None, the ABS fetch request will be for all
            data items, which can be slow.
        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
            to the data request. Supported parameters include:
            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
            If None, no parameters are applied.
        validate (bool, optional): If True, validate against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        ValueError: If invalid parameter values are provided.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch(): {flow_id=} {selection=} {parameters=} {validate=} {kwargs=}")

    # --- validate parameters
    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
    if parameters:
        detail_value = parameters.get("detail")
        if detail_value and detail_value not in valid_detail_values:
            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")

    # --- prepare to get the XML root from the ABS SDMX API
    # prefer fresh data every time
    # BUG FIX: the default was previously "prefer-url" (hyphen), which does not
    # match the documented ModalityType values "prefer_cache"/"prefer_url".
    kwargs["modality"] = kwargs.get("modality", "prefer_url")
    key = build_key(flow_id, selection, validate=validate)

    # --- build URL with optional query parameters (in a stable order)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        url_params = [
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        ]
        if url_params:
            url += "?" + "&".join(url_params)

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. selection (dict[str, str], optional): A dictionary of dimension=value pairs to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply to the data request. Supported parameters include: - 'startPeriod': Start period for data filtering (e.g., '2020-Q1') - 'endPeriod': End period for data filtering (e.g., '2023-Q4') - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata') If None, no parameters are applied. validate (bool, optional): If True, validate against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.
Notes:
If the dims argument is not valid you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            "o" original (default), "s" seasonally adjusted, "t" trend.
        price_measure (str): Price measure type:
            "cp" current prices (default), "cvm" chain volume measures.
        parameters (dict[str, str] | None): Additional parameters for the API
            request, such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # guard against values outside the supported maps
    if seasonality not in SEAS_MAP:
        error = f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}"
        raise ValueError(error)
    if price_measure not in PRICE_MAP:
        error = f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}"
        raise ValueError(error)

    # build a selection criteria and fetch the data from the ANA_AGG flow
    criteria = [
        (SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        (PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        ("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]
    return fetch_selection("ANA_AGG", criteria, parameters, validate=validate, **kwargs)
Fetch quarterly GDP data in $ from the ABS SDMX API.
Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata
Raises: ValueError: If invalid seasonality or price_measure values are provided
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with one row per desired data set (of one or more
            series). The mandatory 'flow_id' column identifies the flow; the
            remaining (optional) columns are the ABS dimensions relevant to it.
            The DataFrame index is not used in the fetching process.
        parameters: A dictionary of additional parameters to pass to the fetch function.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple containing two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains metadata about the fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError raised by the fetch function are caught and
        reported to standard output.

    Note:
        All fetched datasets must have compatible index types; a ValueError is
        raised otherwise (e.g. mixing quarterly and monthly data).

    """
    # --- report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- quick sanity checks before doing any network work
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).
def fetch_pop(
    source: Literal["erp", "na"] = "erp",
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.

    Args:
        source (str): Source of the population data:
            "erp" — ABS published Estimated Resident Population (default);
            "na" — implied population from the ABS National Accounts.
        parameters (dict[str, str] | None): Additional parameters for the API
            request, such as 'startPeriod'.
        projection (bool, optional): If True, and data is available for the most
            recent year, project forward to the current quarter based on growth
            over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_pop(): {source=} {validate=} {kwargs=}")

    # dispatch to the appropriate data source
    if source == "erp":
        data, meta = _erp_population("Australia", parameters, validate=validate, **kwargs)
    elif source == "na":
        data, meta = _na_population(parameters, validate=validate, **kwargs)
    else:
        raise ValueError(f"Invalid source '{source}'. Must be one of: ['erp', 'na']")

    # optionally project forward to the current quarter
    if projection:
        data = _make_projection(data)

    return data, meta
Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.
Args: source (str): Source of the population data: - "erp": ABS published Estimated Resident Population (default) - "na": Implied population from the ABS National Accounts parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, and data is available for the most recent year, make a projection forward to the current quarter, based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # turn the criteria into a one-row "wanted" frame, then fetch it
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def fetch_state_pop(
    state: str,
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch state-level ERP population data from the ABS SDMX API.

    Args:
        state (str): State/territory name or case-insensitive abbreviation
            (e.g., "NSW", "Vic", "qld", etc.). Use "" or "all" for the
            population estimates for all states.
        parameters (dict[str, str] | None): Additional parameters for the API
            request, such as 'startPeriod'.
        projection (bool, optional): If True, make a projection forward to the
            current quarter based on growth over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_state_pop(): {state=} {validate=} {kwargs=}")

    # "" or "all" (any case) means no state filter; otherwise resolve the name
    full_state_name: str = "" if state.lower() in ("", "all") else _state_name_from_abbrev(state)

    data, meta = _erp_population(full_state_name, parameters, validate=validate, **kwargs)

    if projection:
        data = _make_projection(data)

    return data, meta
Fetch state-level ERP population data from the ABS SDMX API.
Args: state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.). [Note: Use "" or "all" for the population estimates for all states.] parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, make a projection forward to the current quarter based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
def frame(f: FlowMetaDict) -> pd.DataFrame:
    """Convert a FlowMetaDict to a pandas DataFrame.

    Args:
        f (FlowMetaDict): The flow metadata dictionary to convert.

    Returns:
        pd.DataFrame: A DataFrame representation of the flow metadata, with the
        outer-dictionary keys as the row index.

    Note: This is a utility function to help visualize the flow metadata.

    """
    # outer keys become rows; inner keys become columns
    return pd.DataFrame.from_dict(f, orient="index")
Convert a FlowMetaDict to a pandas DataFrame.
Args: f (FlowMetaDict): The flow metadata dictionary to convert.
Returns: pd.DataFrame: A DataFrame representation of the flow metadata.
Note: This is a utility function to help visualize the flow metadata.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, suitable for
        passing directly to fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Builds a one-line DataFrame; that single row may still select multiple
          data series when passed to fetch_multi, and can be concatenated with
          other DataFrames to build a larger selection.
        - If two match elements refer to the same dimension, only the
          `intersection` of the matches will be returned.

    """
    structure = structure_from_flow_id(flow_id)
    selected = _process_match_criteria(criteria, structure)

    # tag the row with its flow and return everything as strings
    selected["flow_id"] = flow_id
    return pd.DataFrame([selected]).astype(str)
Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match element.

    Note:
        This function is of little value. It is much easier to create the tuple directly.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in select_items() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A tuple representing the match element.
Note: This function is of little value. It is much easier to create the tuple directly.
def measure_names(meta: pd.DataFrame) -> pd.Series:
    """Get the measure names for each row in the metadata DataFrame - (for y-axis labels).

    Args:
        meta (pd.DataFrame): The metadata DataFrame.

    Returns:
        pd.Series: A Series containing the measure names, indexed by the row labels.

    """
    redundant_suffix = " Number"  # the leading space matters: avoids names like 'Number Number'
    names: dict = {}
    for row_label, row in meta.iterrows():
        measure = str(row_label)  # worst-case fallback: the row label itself
        if "UNIT_MEASURE" in row:
            measure = str(row["UNIT_MEASURE"])  # a better base case
        if row.get("UNIT_MULT"):
            try:
                multiplier = int(row["UNIT_MULT"])
            except ValueError:
                multiplier = 0  # unparseable multiplier: leave the name as-is
            if multiplier in INDICIES and multiplier > 0:
                measure = f"{INDICIES[multiplier]} {measure}"  # best case
        names[row_label] = measure.removesuffix(redundant_suffix)
    return pd.Series(names, dtype=str)
Get the measure names for each row in the metadata DataFrame - (for y-axis labels).
Args: meta (pd.DataFrame): The metadata DataFrame.
Returns: pd.Series: A Series containing the measure names, indexed by the row labels.
def recalibrate(
    data: pd.DataFrame, units: pd.Series, *, as_a_whole: bool = False
) -> tuple[pd.DataFrame, pd.Series]:
    """Recalibrate the data so that its maximum value is between 1 and 1000.

    Args:
        data (pd.DataFrame): The data to recalibrate.
        units (pd.Series): The units of measure (as returned by measure_names()),
            indexed by the columns of `data`.
        as_a_whole (bool): If True, recalibrate the data as a whole, otherwise
            recalibrate each column separately.

    Returns:
        tuple[pd.DataFrame, pd.Series]: The recalibrated data and the recalibrated units.

    Raises:
        ValueError: If `units` is empty, if its length does not match the number of
            columns in `data`, if its index is not a subset of `data`'s columns, or
            if `as_a_whole` is requested with mixed units of measure.

    Note:
        When `as_a_whole` is False, `data` and `units` are modified in place
        (and also returned for convenience).

    Why recalibrate?
        So that the chart is easier to read and interpret, in units that are more familiar.

    """
    # --- data/argument validation
    if units.empty:
        raise ValueError("The units Series is empty.")
    if len(units) != len(data.columns):
        raise ValueError("The units Series must have the same length as the data DataFrame's columns.")
    if as_a_whole and not _is_all_the_same(units):
        raise ValueError("Cannot recalibrate as a whole when there are multiple units of measure.")
    if not all(x in data.columns for x in units.index):
        raise ValueError("The units Series must all be indexed by the data DataFrame's columns.")

    if as_a_whole:
        # All columns share one unit, so rescale the frame in a single pass.
        str_label: str = units.iloc[0]
        datax, str_label = _refactor(data, str_label)
        new_units = pd.Series([str_label] * len(data.columns), index=data.columns)
        return pd.DataFrame(datax), new_units

    # Column-by-column: each column gets its own scaling factor and unit label.
    for column in data.columns:
        str_label = units[column]
        series = data[column]
        seriesx, str_label = _refactor(series, str_label)
        data[column] = cast("pd.Series", seriesx)
        units[column] = str_label

    return data, units
Recalibrate the data so that its maximum value is between 1 and 1000.
Args: data (pd.DataFrame): The data to recalibrate. units (pd.Series): The units of measure (as returned by measure_names()). as_a_whole (bool): If True, recalibrate the data as a whole, otherwise recalibrate each column separately.
Returns: tuple[pd.DataFrame, pd.Series]: The recalibrated data and the recalibrated units.
Why recalibrate? So that the chart is easier to read and interpret, in units that are more familiar.
def recalibrate_series(series: pd.Series, label: str) -> tuple[pd.Series, str]:
    """Recalibrate a Series with a label.

    Args:
        series (pd.Series): The Series to recalibrate.
        label (str): The label for the Series.

    Returns:
        tuple[pd.Series, str]: The recalibrated Series and label.

    """
    # Delegate to the shared rescaling helper; narrow the return type for callers.
    adjusted, adjusted_label = _refactor(series, label)
    return cast("pd.Series", adjusted), adjusted_label
Recalibrate a Series with a label.
Args: series (pd.Series): The Series to recalibrate. label (str): The label for the Series.
Returns: tuple[pd.Series, str]: The recalibrated Series and label.
@cache
def structure_from_flow_id(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data structure directly from the flow identifier.

    Args:
        flow_id (str): The ID of the data flow to validate.
        **kwargs: Additional keyword arguments, ultimately passed to acquire_url().

    Returns:
        FlowMetaDict: Dictionary containing the flow's structure.

    Raises:
        ValueError: If the flow_id is not valid.
        ValueError: If the structure_id or structure cannot be found.

    """
    known_flows = data_flows(**kwargs)
    if flow_id not in known_flows:
        raise ValueError(f"Invalid flow_id: {flow_id}.")

    # Resolve flow -> structure identifier -> structure definition.
    structure_id = structure_ident(flow_id, **kwargs)
    found = data_structures(structure_id, **kwargs)
    if not found:
        raise ValueError(f"No structure found for structure ID: {structure_id}.")
    return found
Get the data structure directly from the flow identifier.
Args: flow_id (str): The ID of the data flow to validate. **kwargs: Additional keyword arguments, ultimately passed to acquire_url().
Returns: FlowMetaDict: Dictionary containing the flow's structure.
Raises: ValueError: If the flow_id is not valid. ValueError: If the structure_id or structure cannot be found.
@cache
def structure_ident(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> str:
    """Get the data structure ID for a specific dataflow.

    Args:
        flow_id (str): The ID of the dataflow to retrieve the structure ID for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        str: The data structure ID for the specified dataflow.

    Raises:
        ValueError: If the flow_id is not found or has no associated structure ID.

    """
    flows = data_flows(flow_id, **kwargs)
    if flow_id in flows and DATA_STRUCT_ID in flows[flow_id]:
        return flows[flow_id][DATA_STRUCT_ID]
    raise ValueError(f"No data structure found for flow '{flow_id}'")
Get the data structure ID for a specific dataflow.
Args: flow_id (str): The ID of the dataflow to retrieve the structure ID for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: str: The data structure ID for the specified dataflow.
Raises: ValueError: If the flow_id is not found or has no associated structure ID.