sdmxabs.fetch_selection
Select one or more data series from the ABS Catalogue based on search criteria.
1"""Select one or more data series from the ABS Catalogue based on search criteria.""" 2 3import re 4from collections.abc import Sequence 5from enum import Enum 6from typing import Unpack 7 8import pandas as pd 9 10from sdmxabs.download_cache import GetFileKwargs 11from sdmxabs.fetch_multi import fetch_multi 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows 13 14 15# --- some types specific to this module 16class MatchType(Enum): 17 """Enumeration for match types.""" 18 19 EXACT = 1 20 PARTIAL = 2 21 REGEX = 3 22 23 24MatchItem = tuple[str, str, MatchType] # pattern, dimension, MatchType 25MatchCriteria = Sequence[MatchItem] # Sequence of tuples containing (pattern, dimension, MatchType) 26 27 28# --- private functions 29def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None: 30 """Package the codes into the return dictionary for a given dimension. 31 32 If the dimension already exists in the return_dict, we will intersect the newly 33 identified codes with the existing codes. If the intersection is a null set, the 34 dimension will be removed from the return_dict (ie. the global match). 35 36 Note: multiple matched codes are separated by a '+' sign in the return_dict. 
37 38 """ 39 if dimension in return_dict: 40 previous = set(return_dict[dimension].split("+")) 41 codes = list(previous.intersection(set(codes))) 42 if not codes: 43 del return_dict[dimension] # no intersecting matches, remove dimension 44 if codes: 45 return_dict[dimension] = "+".join(sorted(set(codes))) 46 47 48def _get_codes( 49 code_list_dict: FlowMetaDict, 50 pattern: str, 51 match_type: MatchType = MatchType.PARTIAL, 52) -> list[str]: 53 """Obtain all codes matching the pattern.""" 54 codes = [] 55 for code, code_list in code_list_dict.items(): 56 name = code_list.get("name", "") 57 if not name: 58 # should not happen, but if it does, raise an error 59 raise ValueError(f"Code '{code}' has no name in codelist") 60 match match_type: 61 case MatchType.EXACT: 62 if name == pattern: 63 codes.append(code) 64 case MatchType.PARTIAL: 65 # Case-insensitive partial match 66 if pattern.lower() in name.lower(): 67 codes.append(code) 68 case MatchType.REGEX: 69 try: 70 if re.search(pattern, name): 71 codes.append(code) 72 except re.error as e: 73 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e 74 return codes 75 76 77def _validate_flow_and_dimensions(flow_id: str) -> FlowMetaDict: 78 """Validate flow_id and return dimensions. 79 80 Args: 81 flow_id (str): The ID of the data flow to validate. 82 83 Returns: 84 FlowMetaDict: Dictionary containing the flow's dimensions. 85 86 Raises: 87 ValueError: If the flow_id is not valid or has no dimensions. 88 89 """ 90 if flow_id not in data_flows(): 91 raise ValueError(f"Invalid flow_id: {flow_id}.") 92 dimensions = data_dimensions(flow_id) 93 if not dimensions: 94 raise ValueError(f"No dimensions found for flow_id: {flow_id}.") 95 return dimensions 96 97 98def _validate_dimension(dimension: str, flow_id: str, dimensions: FlowMetaDict) -> str: 99 """Validate dimension and return codelist name if valid. 100 101 Args: 102 dimension (str): The dimension name to validate. 
103 flow_id (str): The flow ID for error messages. 104 dimensions (FlowMetaDict): Dictionary containing the flow's dimensions. 105 106 Returns: 107 str: The codelist name if valid. 108 109 Raises: 110 ValueError: If dimension is not found or doesn't have a codelist. 111 112 """ 113 if dimension not in dimensions: 114 raise ValueError(f"Dimension '{dimension}' not found for flow '{flow_id}'") 115 116 dim_dict = dimensions[dimension] 117 if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict: 118 raise ValueError(f"Dimension '{dimension}' does not have a codelist for flow '{flow_id}'") 119 120 return dim_dict.get("id", "") 121 122 123def _process_match_criteria( 124 criteria: MatchCriteria, flow_id: str, dimensions: FlowMetaDict 125) -> dict[str, str]: 126 """Process match criteria and build the result dictionary. 127 128 Args: 129 criteria (MatchCriteria): The match criteria to process. 130 flow_id (str): The flow ID for error messages. 131 dimensions (FlowMetaDict): Dictionary containing the flow's dimensions. 132 133 Returns: 134 dict[str, str]: Dictionary of dimension codes. 135 136 """ 137 result_dict: dict[str, str] = {} 138 139 for pattern, dimension, match_type in criteria: 140 code_list_name = _validate_dimension(dimension, flow_id, dimensions) 141 codes = _get_codes(code_lists(code_list_name), pattern, match_type) 142 if codes: 143 _package_codes(codes, dimension, result_dict) 144 145 return result_dict 146 147 148# --- public function 149def match_item( 150 pattern: str, 151 dimension: str, 152 match_type: MatchType = MatchType.PARTIAL, 153) -> MatchItem: 154 """Create a new MatchItem for use in select_items() and fetch_selection(). 155 156 Args: 157 pattern (str): The pattern to match. 158 dimension (str): The dimension to match against. 159 match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT. 160 161 Returns: 162 MatchElement: A tuple representing the match element. 
163 164 Note: 165 This function is of little value. It is easier to create the tuple directly. 166 167 """ 168 return (pattern, dimension, match_type) 169 170 171def make_wanted( 172 flow_id: str, 173 criteria: MatchCriteria, 174) -> pd.DataFrame: 175 """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata. 176 177 Args: 178 flow_id (str): The ID of the data flow to select items from. 179 criteria (MatchCriteria): A sequence of tuples containing the pattern, 180 dimension name, and match-type (exact, partial, or regex). 181 182 Returns: 183 pd.DataFrame: A DataFrame containing the selected items, which can be dropped 184 into the call of the function fetch_multi(). 185 186 Raises: 187 ValueError: If the flow_id is not valid or if no items match the criteria. 188 189 Notes: 190 - Should build a one line DataFrame. This Frame may select multiple data series, 191 when passed to fetch_multi. It also can be concatenated with other DataFrames 192 to build a larger selection. 193 - If two match elements refer to the same dimension, only the `intersection` of the 194 matches will be returned. 195 196 """ 197 dimensions = _validate_flow_and_dimensions(flow_id) 198 result_dict = _process_match_criteria(criteria, flow_id, dimensions) 199 200 # Add flow_id and return as DataFrame 201 result_dict["flow_id"] = flow_id 202 return pd.DataFrame([result_dict]).astype(str) 203 204 205def fetch_selection( 206 flow_id: str, 207 criteria: MatchCriteria, 208 parameters: dict[str, str] | None = None, 209 *, 210 validate: bool = False, 211 **kwargs: Unpack[GetFileKwargs], 212) -> tuple[pd.DataFrame, pd.DataFrame]: 213 """Fetch data based on a selection criteria for items. 214 215 Args: 216 flow_id (str): The ID of the data flow to fetch. 217 criteria (MatchCriteria): A sequence of match criteria to filter the data. 218 parameters (dict[str, str] | None, optional): Additional parameters for the fetch. 
219 validate (bool, optional): If True, validate the selection against the flow's 220 required dimensions when generating the URL key. Defaults to False. 221 **kwargs: Additional keyword arguments for the fetch_multi function. 222 223 Returns: 224 tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata. 225 226 """ 227 verbose = kwargs.get("verbose", False) 228 if verbose: 229 print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}") 230 231 selection = make_wanted(flow_id, criteria) 232 return fetch_multi(selection, parameters, validate=validate, **kwargs) 233 234 235if __name__ == "__main__": 236 237 def test_module() -> None: 238 """Test the match_item function.""" 239 # --- test match_item() 240 item = match_item("Australia", "REGION", MatchType.EXACT) 241 if item != ("Australia", "REGION", MatchType.EXACT): 242 print(f"Test failed: {item}") 243 else: 244 print("Test passed, match_item() works as expected.") 245 246 # --- specify a selection from the Wage Price Index (WPI) data flow 247 mat_criteria = [] 248 mat_criteria.append(match_item("Australia", "REGION", MatchType.EXACT)) 249 mat_criteria.append( 250 match_item( 251 "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT 252 ) 253 ) 254 mat_criteria.append( 255 match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL) 256 ) 257 mat_criteria.append(match_item("Seas|Trend", "TSEST", MatchType.REGEX)) 258 mat_criteria.append(match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT)) 259 mat_criteria.append(match_item("Private and Public", "SECTOR", MatchType.EXACT)) 260 261 # --- test the selection 262 expected_count = 2 # expecting two data series 263 parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"} 264 data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False) 265 if len(data.columns) == expected_count and 
meta.shape[0] == expected_count: 266 print("Test passed: Data and metadata have expected dimensions.") 267 else: 268 print(f"Test FAILED: Data columns {len(data.columns)}, Metadata rows {meta.shape[0]}") 269 expected_seasonal = {"Trend", "Seasonally Adjusted"} 270 if set(meta.TSEST.to_list()) == expected_seasonal: 271 print("Test passed: TSEST has expected values.") 272 else: 273 print(f"Test FAILED: TSEST values {meta.TSEST.to_list()}") 274 expected_shape = (4, 2) # 4 quarters of data, over two series 275 if data.shape == expected_shape: 276 print("Test passed: Fetched data has expected shape.") 277 else: 278 print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.") 279 280 test_module()
class MatchType(Enum):
    """The ways a search pattern can be compared with a code's name."""

    EXACT = 1  # the name must equal the pattern
    PARTIAL = 2  # the pattern is a case-insensitive substring of the name
    REGEX = 3  # the pattern is a regular expression searched for in the name
Enumeration for match types.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple.

    Note:
        This function is of little value. It is easier to create the tuple directly.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in make_wanted() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match item.
Note: This function is of little value. It is easier to create the tuple directly.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A one-row DataFrame of strings holding the matched codes per
            dimension plus a `flow_id` column, which can be dropped into the call
            of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a criterion
            names a dimension that is unknown or has no codelist, if a code in a
            codelist has no name, or if a regex pattern is invalid.

    Notes:
        - Builds a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned.
        - A criterion that matches no codes does not raise; that criterion is ignored,
          leaving its dimension unconstrained by it.

    """
    dimensions = _validate_flow_and_dimensions(flow_id)
    selection = _process_match_criteria(criteria, flow_id, dimensions)

    # fetch_multi() needs to know which flow the selected codes belong to
    selection["flow_id"] = flow_id
    return pd.DataFrame([selection]).astype(str)
Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
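Raises: ValueError: If the flow_id is not valid or has no dimensions, if a criterion names a dimension that is unknown or has no codelist, if a code in a codelist has no name, or if a regex pattern is invalid. A criterion that matches no codes does not raise; it is ignored.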
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select series from a data flow by matching criteria, then fetch them.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): The match criteria used to pick the series.
        parameters (dict[str, str] | None, optional): Extra parameters handed on to
            fetch_multi().
        validate (bool, optional): Passed through to fetch_multi(); if True, the
            selection is validated against the flow's required dimensions when
            generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function. A truthy
            `verbose` entry also makes this function print its arguments.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The (data, metadata) pair returned by
            fetch_multi().

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # translate the criteria into the one-row `wanted` frame that fetch_multi() consumes
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.