sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

# --- re-export the package's public API from its submodules
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_gdp import fetch_gdp
from .fetch_multi import fetch_multi
from .fetch_pop import fetch_pop, fetch_state_pop
from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
from .flow_metadata import FlowMetaDict, code_list_for_dim, code_lists, data_dimensions, data_flows, frame
from .measures import measure_names, recalibrate, recalibrate_series

# --- version and author
# The version is read from the installed package metadata; when running from
# a source checkout (not installed), fall back to a placeholder version.
try:
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "FlowMetaDict",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_list_for_dim",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
    "fetch_gdp",
    "fetch_multi",
    "fetch_pop",
    "fetch_selection",
    "fetch_state_pop",
    "frame",
    "make_wanted",
    "match_item",
    "measure_names",
    "recalibrate",
    "recalibrate_series",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
TypedDict for acquire_url function arguments.
A problem retrieving data using HTTP.
class MatchType(Enum):
    """Enumeration of the supported pattern-matching strategies."""

    EXACT = 1
    PARTIAL = 2
    REGEX = 3
Enumeration for match types.
@cache
def code_list_for_dim(flow_id: str, dim_name: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list for a specific dimension or attribute in a dataflow.

    Args:
        flow_id (str): The ID of the dataflow.
        dim_name (str): The dimension ID to retrieve the code list for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and their metadata.

    Raises:
        ValueError: If the dimension/attribute is not found in the dataflow,
            or if the dimension has no associated codelist.

    """
    dims = data_dimensions(flow_id, **kwargs)
    if dim_name not in dims:
        raise ValueError(f"Dimension '{dim_name}' not found in flow '{flow_id}'")

    # the dimension's "id" entry names the codelist to retrieve
    if not (cl_id := dims[dim_name].get("id", "")):
        raise ValueError(f"No codelist found for dimension/attribute '{dim_name}' in flow '{flow_id}'")

    return code_lists(cl_id, **kwargs)
Get the code list for a specific dimension or attribute in a dataflow.
Args: flow_id (str): The ID of the dataflow. dim_name (str): The dimension ID to retrieve the code list for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their metadata.
Raises: ValueError: If the dimension/attribute is not found in the dataflow.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    Guarantees for the inner dictionary:
        - The inner dictionary will always have a "name" key.
        - The inner dictionary may have a "parent" key if the code has a parent.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue
        elements: dict[str, str] = {}

        # - get the name
        name = code.find("com:Name", NAME_SPACES)
        if name is None or not name.text:
            # guarantee that we have a name key and value pair
            # BUG FIX: message previously read "{cl_id}has" (missing space)
            print(f"Warning: Code {code_id} in {cl_id} has no name, skipping.")
            continue  # skip if no name
        elements["name"] = name.text

        # - get the parent (optional; encoded as a str:Parent/Ref element)
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        if parent_id:  # Only add if not empty
            elements["parent"] = parent_id

        codes[code_id] = elements

    return codes
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions and attributes metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        The dimensions metadata includes a "position" for each dimension.
        The attributes metadata does not have "position" information.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    elements: FlowMetaDict = {}
    for ident in ["Dimension", "Attribute"]:
        for elem in tree.findall(f".//str:{ident}", NAME_SPACES):
            element_id = elem.get("id")
            if element_id is None:
                continue
            contents = {}
            if ident == "Dimension":
                # only dimensions carry a position; attributes do not
                contents["position"] = elem.get("position", "")
            # merge in the codelist reference attributes (e.g. the "id" of
            # the codelist), when an enumeration is declared for the element
            if (lr := elem.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
                enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
            ) is not None:
                contents = contents | enumer.attrib
            elements[element_id] = contents
    return elements
Get the data dimensions and attributes metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: The dimensions metadata includes a "position" for each dimension. The attributes metadata does not have "position" information.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the top-level metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
        and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        # BUG FIX: also guard against an empty/None text payload; previously a
        # present-but-empty Name element produced the literal string "None".
        df_name = name_elem.text if name_elem is not None and name_elem.text else "(missing name)"
        attributes["name"] = df_name
        d_flows[df_id] = attributes
    return d_flows
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
            to the data request. Supported parameters include:
            - 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
            - 'endPeriod': End period for data filtering (e.g., '2023-Q4')
            - 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
            If None, no parameters are applied.
        validate (bool, optional): If True, validate `dims` against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.
        ValueError: If invalid parameter values are provided.

    Notes:
        If the `dims` argument is not valid you should get a CacheError or HttpError.
        If the `flow_id` is not valid, you should get a ValueError.

    """
    # --- report the parameters used if requested
    verbose = kwargs.get("verbose", False)
    if verbose:
        print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}")

    # --- validate parameters
    valid_detail_values = {"full", "dataonly", "serieskeysonly", "nodata"}
    if parameters:
        detail_value = parameters.get("detail")
        if detail_value and detail_value not in valid_detail_values:
            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {valid_detail_values}")

    # --- prepare to get the XML root from the ABS SDMX API
    # prefer fresh data every time.
    # BUG FIX: the default was "prefer-url" (hyphen), which does not match the
    # ModalityType values documented as "prefer_cache" / "prefer_url".
    kwargs["modality"] = kwargs.get("modality", "prefer_url")
    key = build_key(flow_id, dims, validate=validate)

    # --- build URL with optional query parameters, in a stable order
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        url_params = [
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        ]
        if url_params:
            url += "?" + "&".join(url_params)

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args:
flow_id (str): The ID of the data flow from which to retrieve data items.
dims (dict[str, str], optional): A dictionary of dimensions to select the
data items. If None, the ABS fetch request will be for all data items,
which can be slow.
parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
to the data request. Supported parameters include:
- 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
- 'endPeriod': End period for data filtering (e.g., '2023-Q4')
- 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
If None, no parameters are applied.
validate (bool, optional): If True, validate dims against the flow's
required dimensions when generating the URL key. Defaults to False.
**kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.
Notes:
If the dims argument is not valid you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            - "o": Original data without seasonal adjustment (default)
            - "s": Seasonally adjusted data
            - "t": Trend data
        price_measure (str): Price measure type:
            - "cp": Current prices (default)
            - "cvm": Chain volume measures
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # guard clauses: reject unknown seasonality / price-measure codes
    if seasonality not in SEAS_MAP:
        error = f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}"
        raise ValueError(error)
    if price_measure not in PRICE_MAP:
        error = f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}"
        raise ValueError(error)

    # build the selection criteria and fetch the data
    criteria = [
        (SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        (PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        ("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]
    return fetch_selection("ANA_AGG", criteria, parameters, validate=validate, **kwargs)
Fetch quarterly GDP data in $ from the ABS SDMX API.
Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata
Raises: ValueError: If invalid seasonality or price_measure values are provided
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with one row per desired data set (of one or more series).
            Each row carries the identifiers needed to fetch that dataset: a
            mandatory 'flow_id' column plus optional columns for the ABS
            dimensions relevant to the flow. The index of `wanted` is ignored.
        parameters: A dictionary of additional parameters to pass to the fetch function.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple of two DataFrames: the fetched data, then metadata about the
        fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError raised by the fetch function are caught and
        reported to standard output. Datasets must have compatible index types;
        a ValueError is raised if (say) quarterly and monthly data are mixed.

    """
    # --- report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- quick sanity checks (empty input short-circuits before column check)
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- do the work
    return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).
def fetch_pop(
    source: Literal["erp", "na"] = "erp",
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.

    Args:
        source (str): Source of the population data:
            - "erp": ABS published Estimated Resident Population (default)
            - "na": Implied population from the ABS National Accounts
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        projection (bool, optional): If True, and data is available for the most recent year,
            make a projection forward to the current quarter, based on growth over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_pop(): {source=} {validate=} {kwargs=}")

    # dispatch to the appropriate fetcher for the requested source
    if source == "erp":
        data, meta = _erp_population("Australia", parameters, validate=validate, **kwargs)
    elif source == "na":
        data, meta = _na_population(parameters, validate=validate, **kwargs)
    else:
        raise ValueError(f"Invalid source '{source}'. Must be one of: ['erp', 'na']")

    # optionally project the series forward
    if projection:
        data = _make_projection(data)

    return data, meta
Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.
Args: source (str): Source of the population data: - "erp": ABS published Estimated Resident Population (default) - "na": Implied population from the ABS National Accounts parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, and data is available for the most recent year, make a projection forward to the current quarter, based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # optional diagnostic echo of the call arguments
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into a one-row "wanted" frame, then fetch it
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def fetch_state_pop(
    state: str,
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch state-level ERP population data from the ABS SDMX API.

    Args:
        state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.).
            [Note: Use "" or "all" for the population estimates for all states.]
        parameters (dict[str, str] | None): Additional parameters for the API request,
            such as 'startPeriod'.
        projection (bool, optional): If True, make a projection forward to the current quarter
            based on growth over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_state_pop(): {state=} {validate=} {kwargs=}")

    # "" or "all" (any case) selects every state; otherwise resolve the abbreviation
    full_state_name = "" if state.lower() in ("", "all") else _state_name_from_abbrev(state)

    data, meta = _erp_population(full_state_name, parameters, validate=validate, **kwargs)

    if projection:
        data = _make_projection(data)

    return data, meta
Fetch state-level ERP population data from the ABS SDMX API.
Args: state (str): State/territory name or case-insensitive abbreviation (e.g., "NSW", "Vic", "qld", etc.). [Note: Use "" or "all" for the population estimates for all states.] parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, make a projection forward to the current quarter based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
def frame(f: FlowMetaDict) -> pd.DataFrame:
    """Convert a FlowMetaDict to a pandas DataFrame.

    Args:
        f (FlowMetaDict): The flow metadata dictionary to convert.

    Returns:
        pd.DataFrame: One row per outer key, one column per inner key.

    Note: This is a utility function to help visualize the flow metadata.

    """
    return pd.DataFrame.from_dict(f, orient="index")
Convert a FlowMetaDict to a pandas DataFrame.
Args: f (FlowMetaDict): The flow metadata dictionary to convert.
Returns: pd.DataFrame: A DataFrame representation of the flow metadata.
Note: This is a utility function to help visualize the flow metadata.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
        into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Builds a one-line DataFrame. That one line may still select multiple
          data series when passed to fetch_multi, and it can be concatenated
          with other DataFrames to build a larger selection.
        - If two match elements refer to the same dimension, only the
          `intersection` of the matches will be returned.

    """
    dims = _validate_flow_and_dimensions(flow_id)
    selection = _process_match_criteria(criteria, flow_id, dims)

    # tag the row with its flow and return as a single-row, all-string frame
    selection["flow_id"] = flow_id
    return pd.DataFrame([selection]).astype(str)
Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the
intersection of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match element.

    Note:
        This function is of little value. It is easier to create the tuple directly.

    """
    # DOC FIX: the docstring previously claimed the default was MatchType.EXACT
    # and named the return type "MatchElement"; the default is PARTIAL and the
    # return type is MatchItem.
    return (pattern, dimension, match_type)
Create a new MatchItem for use in select_items() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchElement: A tuple representing the match element.
Note: This function is of little value. It is easier to create the tuple directly.
def measure_names(meta: pd.DataFrame) -> pd.Series:
    """Get the measure names for each row in the metadata DataFrame - (for y-axis labels).

    Args:
        meta (pd.DataFrame): The metadata DataFrame.

    Returns:
        pd.Series: A Series containing the measure names, indexed by the row labels.

    """
    series = pd.Series(dtype=str)
    duplicate_number: str = " Number"  # the space before 'Number' is important
    for label, row in meta.iterrows():
        name: str = str(label)  # worst case scenario: fall back to the row label
        if "UNIT_MEASURE" in row:
            name = str(row["UNIT_MEASURE"])  # a better base case
            if row.get("UNIT_MULT"):
                try:
                    index = int(row["UNIT_MULT"])
                    if index in INDICIES and index > 0:
                        # best case: prefix with the multiplier name from INDICIES
                        name = f"{INDICIES[index]} {name}"
                except ValueError:
                    pass  # non-numeric UNIT_MULT: keep the unprefixed name
        # COMMENT FIX: previously read 'Number Numer' (typo)
        name = name.removesuffix(duplicate_number)  # Just in case it is 'Number Number'
        series[label] = name
    return series
Get the measure names for each row in the metadata DataFrame - (for y-axis labels).
Args: meta (pd.DataFrame): The metadata DataFrame.
Returns: pd.Series: A Series containing the measure names, indexed by the row labels.
def recalibrate(
    data: pd.DataFrame, units: pd.Series, *, as_a_whole: bool = False
) -> tuple[pd.DataFrame, pd.Series]:
    """Recalibrate the data so that its maximum value is between 1 and 1000.

    Args:
        data (pd.DataFrame): The data to recalibrate.
        units (pd.Series): The units of measure (as returned by measure_names()).
        as_a_whole (bool): If True, recalibrate the data as a whole, otherwise
            recalibrate each column separately.

    Returns:
        tuple[pd.DataFrame, pd.Series]: The recalibrated data and recalibrated units.

    Raises:
        ValueError: If `units` is empty, its length does not match the data's
            columns, its index does not match the data's columns, or
            `as_a_whole` is requested with mixed units of measure.

    Why recalibrate?
        So that the chart is easier to read and interpret, in units that are more familiar.

    """
    # --- data/argument validation
    if units.empty:
        raise ValueError("The units Series is empty.")
    if len(units) != len(data.columns):
        raise ValueError("The units Series must have the same length as the data DataFrame's columns.")
    if as_a_whole and not _is_all_the_same(units):
        raise ValueError("Cannot recalibrate as a whole when there are multiple units of measure.")
    if not all(x in data.columns for x in units.index):
        raise ValueError("The units Series must all be indexed by the data DataFrame's columns.")

    if as_a_whole:
        str_label: str = units.iloc[0]
        datax, str_label = _refactor(data, str_label)
        new_units = pd.Series([str_label] * len(data.columns), index=data.columns)
        return pd.DataFrame(datax), new_units

    # BUG FIX: work on copies so the caller's DataFrame/Series are not
    # mutated in place (the per-column path previously wrote back into the
    # arguments passed in).
    data = data.copy()
    units = units.copy()
    for column in data.columns:
        str_label = units[column]
        series = data[column]
        seriesx, str_label = _refactor(series, str_label)
        data[column] = cast("pd.Series", seriesx)
        units[column] = str_label

    return data, units
Recalibrate the data so that its maximum value is between 1 and 1000.
Args: units (pd.Series): The units of measure (as returned by measure_names()). data (pd.DataFrame): The data to recalibrate. as_a_whole (bool): If True, recalibrate the data as a whole, otherwise recalibrate each column separately.
Returns: tuple[pd.Series, pd.DataFrame]: The recalibrated units and recalibrated data.
Why recalibrate? So that the chart is easier to read and interpret, in units that are more familiar.
def recalibrate_series(series: pd.Series, label: str) -> tuple[pd.Series, str]:
    """Recalibrate a Series with a label.

    Args:
        series (pd.Series): The Series to recalibrate.
        label (str): The label for the Series.

    Returns:
        tuple[pd.Series, str]: The recalibrated Series and label.

    """
    adjusted, new_label = _refactor(series, label)
    return cast("pd.Series", adjusted), new_label
Recalibrate a Series with a label.
Args: series (pd.Series): The Series to recalibrate. label (str): The label for the Series.
Returns: tuple[pd.Series, str]: The recalibrated Series and label.