sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

# Cache/HTTP plumbing shared by all fetchers.
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_gdp import fetch_gdp
from .fetch_multi import fetch_multi
from .fetch_pop import fetch_pop
from .fetch_selection import MatchCriteria, MatchItem, MatchType, fetch_selection, make_wanted, match_item
from .flow_metadata import code_lists, data_dimensions, data_flows
from .measures import measure_names, recalibrate, recalibrate_series

# --- version and author
try:
    # Read the version from the installed distribution's metadata.
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
    "fetch_gdp",
    "fetch_multi",
    "fetch_pop",
    "fetch_selection",
    "make_wanted",
    "match_item",
    "measure_names",
    "recalibrate",
    "recalibrate_series",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url".

    NOTE(review): fetch() defaults this key to the hyphenated "prefer-url";
    confirm which spelling ModalityType actually declares.
    """
TypedDict for acquire_url function arguments.
A problem retrieving data using HTTP.
class MatchType(Enum):
    """Enumeration for match types.

    Passed to match_item() to say how a pattern should be compared against
    dimension metadata (the comparison itself is implemented by the
    selection helpers, not here).
    """

    EXACT = 1
    PARTIAL = 2
    REGEX = 3
Enumeration for match types.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        You will get a CacheError if the codelist is not found on the ABS SDMX API.
        (This package tries the website first, then the cache.)

    Guarantees for the inner dictionary:
        - The inner dictionary will always have a "name" key.
        - The inner dictionary may have a "parent" key if the code has a parent.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue
        elements: dict[str, str] = {}

        # - get the name (mandatory: skip codes without one so the
        #   "name" key guarantee holds for every returned entry)
        name = code.find("com:Name", NAME_SPACES)
        if name is None or not name.text:
            # bug fix: message previously read "...{cl_id}has no name..."
            print(f"Warning: Code {code_id} in {cl_id} has no name, skipping.")
            continue  # skip if no name
        elements["name"] = name.text

        # - get the parent (optional)
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            # NOTE(review): "Ref" is searched without the "str:" prefix here,
            # unlike the other lookups -- confirm the element is un-namespaced
            # in the ABS response.
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        if parent_id:  # Only add if not empty
            elements["parent"] = parent_id

        codes[code_id] = elements

    return codes
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: You will get a CacheError if the codelist is not found on the ABS SDMX API. (This package tries the website first, then the cache.)
Guarantees for the inner dictionary: - The inner dictionary will always have a "name" key. - The inner dictionary may have a "parent" key if the code has a parent.
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions and attributes metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    Note:
        The dimensions metadata includes a "position" for each dimension.
        The attributes metadata does not have "position" information.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    metadata: FlowMetaDict = {}
    for kind in ("Dimension", "Attribute"):
        for node in tree.findall(f".//str:{kind}", NAME_SPACES):
            node_id = node.get("id")
            if node_id is None:
                continue
            details: dict[str, str] = {}
            if kind == "Dimension":
                details["position"] = node.get("position", "")
            # fold in the enumeration reference attributes, when present
            local_rep = node.find("str:LocalRepresentation", NAME_SPACES)
            if local_rep is not None:
                enum_ref = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
                if enum_ref is not None:
                    details = details | enum_ref.attrib
            metadata[node_id] = details
    return metadata
Get the data dimensions and attributes metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
Note: The dimensions metadata includes a "position" for each dimension. The attributes metadata does not have "position" information.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary mapping each dataflow ID to
        its metadata in key=value pairs (a "name" key is always included).

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    found: FlowMetaDict = {}
    for node in tree.findall(".//str:Dataflow", NAME_SPACES):
        details: dict[str, str] = node.attrib.copy()
        if "id" not in details:
            continue
        key = details.pop("id")
        name_node = node.find("com:Name", NAME_SPACES)
        # str(...) because pylance complains about .text possibly being None
        details["name"] = str(name_node.text) if name_node is not None else "(missing name)"
        found[key] = details
    return found
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): Dimension selections for the request.
            If None, all data items for the flow are requested (can be slow).
        parameters (dict[str, str], optional): SDMX query parameters. Only
            'startPeriod' (e.g. '2020-Q1'), 'endPeriod' (e.g. '2023-Q4') and
            'detail' ('full', 'dataonly', 'serieskeysonly', 'nodata') are
            forwarded; other keys are silently ignored.
        validate (bool, optional): If True, validate `dims` against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The fetched data and its metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response, or if an
            invalid 'detail' parameter value is provided.

    Notes:
        An invalid `dims` argument typically surfaces as a CacheError or
        HttpError; an invalid `flow_id` surfaces as a ValueError.

    """
    # --- report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch(): {flow_id=} {dims=} {parameters=} {validate=} {kwargs=}")

    # --- reject unsupported 'detail' values up front
    allowed_detail = {"full", "dataonly", "serieskeysonly", "nodata"}
    if parameters:
        detail_value = parameters.get("detail")
        if detail_value and detail_value not in allowed_detail:
            raise ValueError(f"Invalid detail value '{detail_value}'. Must be one of: {allowed_detail}")

    # --- prefer fresh data unless the caller chose a modality
    # NOTE(review): hyphenated "prefer-url" here differs from the underscored
    # values documented on GetFileKwargs.modality -- confirm against ModalityType.
    kwargs.setdefault("modality", "prefer-url")

    # --- assemble the request URL: flow key plus optional query string
    key = build_key(flow_id, dims, validate=validate)
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    if parameters:
        query = [
            f"{name}={parameters[name]}"
            for name in ("startPeriod", "endPeriod", "detail")
            if name in parameters
        ]
        if query:
            url = f"{url}?{'&'.join(query)}"

    xml_root = acquire_xml(url, **kwargs)
    return _extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args:
flow_id (str): The ID of the data flow from which to retrieve data items.
dims (dict[str, str], optional): A dictionary of dimensions to select the
data items. If None, the ABS fetch request will be for all data items,
which can be slow.
parameters (dict[str, str], optional): A dictionary of SDMX parameters to apply
to the data request. Supported parameters include:
- 'startPeriod': Start period for data filtering (e.g., '2020-Q1')
- 'endPeriod': End period for data filtering (e.g., '2023-Q4')
- 'detail': Level of detail ('full', 'dataonly', 'serieskeysonly', 'nodata')
If None, no parameters are applied.
validate (bool, optional): If True, validate dims against the flow's
required dimensions when generating the URL key. Defaults to False.
**kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response. ValueError: If invalid parameter values are provided.
Notes:
If the dims argument is not valid you should get a CacheError or HttpError.
If the flow_id is not valid, you should get a ValueError.
def fetch_gdp(
    seasonality: Literal["o", "s", "t"] = "o",
    price_measure: Literal["cp", "cvm"] = "cp",
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch quarterly GDP data in $ from the ABS SDMX API.

    Args:
        seasonality (str): Type of seasonal adjustment to apply:
            "o" original (default), "s" seasonally adjusted, "t" trend.
        price_measure (str): Price measure type:
            "cp" current prices (default), "cvm" chain volume measures.
        parameters (dict[str, str] | None): Additional parameters for the API
            request, such as 'startPeriod'.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata

    Raises:
        ValueError: If invalid seasonality or price_measure values are provided

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_gdp(): {seasonality=}, {price_measure=} {validate=} {kwargs=}")

    # guard clauses: reject unknown codes before any network activity
    if seasonality not in SEAS_MAP:
        raise ValueError(f"Invalid '{seasonality=}'. Must be one of: {list(SEAS_MAP.keys())}")
    if price_measure not in PRICE_MAP:
        raise ValueError(f"Invalid '{price_measure=}'. Must be one of: {list(PRICE_MAP.keys())}")

    # exact-match criteria pinning adjustment, price basis and the GDP item
    criteria = [
        match_item(SEAS_MAP[seasonality], "TSEST", Mt.EXACT),
        match_item(PRICE_MAP[price_measure], "MEASURE", Mt.EXACT),
        match_item("Gross domestic product", "DATA_ITEM", Mt.EXACT),
    ]

    return fetch_selection("ANA_AGG", criteria, parameters, validate=validate, **kwargs)
Fetch quarterly GDP data in $ from the ABS SDMX API.
Args: seasonality (str): Type of seasonal adjustment to apply: - "o": Original data without seasonal adjustment (default) - "s": Seasonally adjusted data - "t": Trend data price_measure (str): Price measure type: - "cp": Current prices (default) - "cvm": Chain volume measures parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the GDP data and metadata
Raises: ValueError: If invalid seasonality or price_measure values are provided
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

    Args:
        wanted: A DataFrame with one row per desired data set (of one or more
            series). The mandatory 'flow_id' column names the flow; the
            remaining (optional) columns are the ABS dimensions relevant to
            that flow. The DataFrame index is not used in the fetching process.
        parameters: A dictionary of additional parameters to pass to the fetch function.
        validate: If True, the function will validate dimensions and values against
            the ABS SDMX API codelists. Defaults to False.
        **kwargs: Additional keyword arguments passed to the underlying data fetching function.

    Returns:
        A tuple of two DataFrames: the fetched data, then metadata about the
        fetched datasets.

    Raises:
        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.

    Note:
        CacheError and HttpError raised by the fetch function are caught and
        reported to standard output. Datasets must also have compatible index
        types; mixing (say) quarterly and monthly data raises a ValueError.

    """
    # --- report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")

    # --- guard clauses: nothing requested / malformed request
    if wanted.empty:
        print("wanted DataFrame is empty, returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    if "flow_id" not in wanted.columns:
        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")

    # --- delegate the real work
    return _extract(wanted, parameters, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. parameters: A dictionary of additional parameters to pass to the fetch function. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).
def fetch_pop(
    source: Literal["erp", "na"] = "erp",
    parameters: dict[str, str] | None = None,
    *,
    projection: bool = False,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.

    Args:
        source (str): Source of the population data:
            "erp" for the ABS published Estimated Resident Population (default),
            "na" for the implied population from the ABS National Accounts.
        parameters (dict[str, str] | None): Additional parameters for the API
            request, such as 'startPeriod'.
        projection (bool, optional): If True, and data is available for the most
            recent year, project forward to the current quarter based on growth
            over the last 4 quarters.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional arguments passed to the fetch_selection() function

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata

    """
    # report the parameters used if requested
    if kwargs.get("verbose", False):
        print(f"fetch_pop(): {source=} {validate=} {kwargs=}")

    # dispatch to the requested population source
    if source == "erp":
        data, meta = _erp_population(parameters, validate=validate, **kwargs)
    elif source == "na":
        data, meta = _na_population(parameters, validate=validate, **kwargs)
    else:
        raise ValueError(f"Invalid source '{source}'. Must be one of: ['erp', 'na']")

    # optionally extend the series to the current quarter
    if projection:
        data = _make_projection(data)

    return data, meta
Fetch Estimated Resident Population (ERP) data from the ABS SDMX API.
Args: source (str): Source of the population data: - "erp": ABS published Estimated Resident Population (default) - "na": Implied population from the ABS National Accounts parameters (dict[str, str] | None): Additional parameters for the API request, such as 'startPeriod'. projection (bool, optional): If True, and data is available for the most recent year, make a projection forward to the current quarter, based on growth over the last 4 quarters. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional arguments passed to the fetch_selection() function
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the population data and metadata
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into a one-row "wanted" frame, then fetch it
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A one-row DataFrame describing the selected items, ready
        to be passed to fetch_multi() (or concatenated with other such frames
        to build a larger selection).

    Raises:
        ValueError: If the flow_id is not valid or if no items match the criteria.

    Notes:
        - Although the frame has a single row, it may select multiple data series.
        - If two match elements refer to the same dimension, only the
          intersection of the matches will be returned.

    """
    # resolve the flow's dimensions, then reduce the criteria to selections
    dimensions = _validate_flow_and_dimensions(flow_id)
    selections = _process_match_criteria(criteria, flow_id, dimensions)

    # tag with the flow and hand back a single-row, all-string frame
    selections["flow_id"] = flow_id
    return pd.DataFrame([selections]).astype(str)
Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in select_items() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in select_items() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A tuple representing the match element.
def measure_names(meta: pd.DataFrame) -> pd.Series:
    """Get the measure names for each row in the metadata DataFrame - (for y-axis labels).

    Args:
        meta (pd.DataFrame): The metadata DataFrame.

    Returns:
        pd.Series: A Series containing the measure names, indexed by the row labels.

    """
    duplicate_number: str = " Number"
    # Accumulate in a dict and build the Series once at the end: growing a
    # Series label-by-label copies it on each insertion (O(n^2) overall).
    names: dict = {}
    for label, row in meta.iterrows():
        name: str = str(label)  # worst case scenario
        if "UNIT_MEASURE" in row:
            name = str(row["UNIT_MEASURE"])
            if row.get("UNIT_MULT"):
                try:
                    multiplier = int(row["UNIT_MULT"])
                    # prefix with the multiplier's word form (e.g. "Millions")
                    if multiplier in INDICIES and multiplier > 0:
                        name = f"{INDICIES[multiplier]} {name}"
                except ValueError:
                    # non-numeric UNIT_MULT: leave the name unprefixed
                    pass
            # drop a redundant trailing " Number" (e.g. "Millions Number" -> "Millions")
            name = name.removesuffix(duplicate_number)
        names[label] = name
    return pd.Series(names, dtype=str)
Get the measure names for each row in the metadata DataFrame - (for y-axis labels).
Args: meta (pd.DataFrame): The metadata DataFrame.
Returns: pd.Series: A Series containing the measure names, indexed by the row labels.
def recalibrate(
    data: pd.DataFrame, units: pd.Series, *, as_a_whole: bool = False
) -> tuple[pd.DataFrame, pd.Series]:
    """Recalibrate the data so that its maximum value is between 1 and 1000.

    Args:
        data (pd.DataFrame): The data to recalibrate.
        units (pd.Series): The units of measure (as returned by measure_names()),
            indexed by the columns of `data`.
        as_a_whole (bool): If True, recalibrate the data as a whole, otherwise
            recalibrate each column separately.

    Returns:
        tuple[pd.DataFrame, pd.Series]: The recalibrated data and the
        recalibrated units (in that order).

    Raises:
        ValueError: If `units` is empty, its length differs from the number of
            data columns, its index is not drawn from the data columns, or
            `as_a_whole` is requested with mixed units of measure.

    Note:
        In the per-column path the caller's `data` and `units` objects are
        modified in place (columns and unit labels are overwritten); in the
        `as_a_whole` path the index of `units` is rewritten in place.

    Why recalibrate?
        So that the chart is easier to read and interpret, in units that are more familiar.

    """
    # --- data/argument validation
    if units.empty:
        raise ValueError("The units Series is empty.")
    if len(units) != len(data.columns):
        raise ValueError("The units Series must have the same length as the data DataFrame's columns.")
    if as_a_whole and not _is_all_the_same(units):
        raise ValueError("Cannot recalibrate as a whole when there are multiple units of measure.")
    if not all(x in data.columns for x in units.index):
        raise ValueError("The units Series must all be indexed by the data DataFrame's columns.")

    if as_a_whole:
        # single shared unit: rescale the whole frame with one factor
        label = units.iloc[0]
        datax, label = refactor(data, label)
        units.index = pd.Index([label] * len(units))
        return pd.DataFrame(datax), units

    # otherwise rescale each column independently
    for column in data.columns:
        label = units[column]
        series = data[column]
        seriesx, label = refactor(series, label)
        data[column] = cast("pd.Series", seriesx)
        units[column] = label

    return data, units
Recalibrate the data so that its maximum value is between 1 and 1000.
Args: units (pd.Series): The units of measure (as returned by measure_names()). data (pd.DataFrame): The data to recalibrate. as_a_whole (bool): If True, recalibrate the data as a whole, otherwise recalibrate each column separately.
Returns: tuple[pd.DataFrame, pd.Series]: The recalibrated data and the recalibrated units.
Why recalibrate? So that the chart is easier to read and interpret, in units that are more familiar.
def recalibrate_series(series: pd.Series, label: str) -> tuple[pd.Series, str]:
    """Recalibrate a Series with a label.

    Args:
        series (pd.Series): The Series to recalibrate.
        label (str): The label for the Series.

    Returns:
        tuple[pd.Series, str]: The recalibrated Series and its (possibly updated) label.

    """
    # thin wrapper over refactor(); the cast quiets the type checker
    adjusted, new_label = refactor(series, label)
    return cast("pd.Series", adjusted), new_label
Recalibrate a Series with a label.
Args: series (pd.Series): The Series to recalibrate. label (str): The label for the Series.
Returns: tuple[pd.Series, str]: The recalibrated Series and label.