sdmxabs.fetch_selection
Select one or more data series from the ABS Catalogue based on search criteria.
1"""Select one or more data series from the ABS Catalogue based on search criteria.""" 2 3import re 4from collections.abc import Sequence 5from enum import Enum 6from typing import Unpack 7 8import pandas as pd 9 10from sdmxabs.download_cache import GetFileKwargs 11from sdmxabs.fetch_multi import fetch_multi 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows 13 14 15# --- some types specific to this module 16class MatchType(Enum): 17 """Enumeration for match types.""" 18 19 EXACT = 1 20 PARTIAL = 2 21 REGEX = 3 22 23 24MatchItem = tuple[str, str, MatchType] # pattern, dimension, MatchType 25MatchCriteria = Sequence[MatchItem] # Sequence of tuples containing (pattern, dimension, MatchType) 26 27 28# --- private functions 29def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None: 30 """Package the codes into the return dictionary for a given dimension. 31 32 If the dimension already exists in the return_dict, we will intersect the newly 33 identified codes with the existing codes. If the intersection is a null set, the 34 dimension will be removed from the return_dict (ie. the global match). 35 36 Note: multiple matched codes are separated by a '+' sign in the return_dict. 
37 38 """ 39 if dimension in return_dict: 40 previous = set(return_dict[dimension].split("+")) 41 codes = list(previous.intersection(set(codes))) 42 if not codes: 43 del return_dict[dimension] # no intersecting matches, remove dimension 44 if codes: 45 return_dict[dimension] = "+".join(sorted(set(codes))) 46 47 48def _get_codes( 49 code_list_dict: FlowMetaDict, 50 pattern: str, 51 match_type: MatchType = MatchType.PARTIAL, 52) -> list[str]: 53 """Obtain all codes matching the pattern.""" 54 codes = [] 55 for code, code_list in code_list_dict.items(): 56 name = code_list.get("name", "") 57 if not name: 58 # should not happen, but if it does, raise an error 59 raise ValueError(f"Code '{code}' has no name in codelist") 60 match match_type: 61 case MatchType.EXACT: 62 if name == pattern: 63 codes.append(code) 64 case MatchType.PARTIAL: 65 # Case-insensitive partial match 66 if pattern.lower() in name.lower(): 67 codes.append(code) 68 case MatchType.REGEX: 69 try: 70 if re.search(pattern, name): 71 codes.append(code) 72 except re.error as e: 73 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e 74 return codes 75 76 77def _validate_flow_and_dimensions(flow_id: str) -> FlowMetaDict: 78 """Validate flow_id and return dimensions. 79 80 Args: 81 flow_id (str): The ID of the data flow to validate. 82 83 Returns: 84 FlowMetaDict: Dictionary containing the flow's dimensions. 85 86 Raises: 87 ValueError: If the flow_id is not valid or has no dimensions. 88 89 """ 90 if flow_id not in data_flows(): 91 raise ValueError(f"Invalid flow_id: {flow_id}.") 92 dimensions = data_dimensions(flow_id) 93 if not dimensions: 94 raise ValueError(f"No dimensions found for flow_id: {flow_id}.") 95 return dimensions 96 97 98def _validate_dimension(dimension: str, flow_id: str, dimensions: FlowMetaDict) -> str: 99 """Validate dimension and return codelist name if valid. 100 101 Args: 102 dimension (str): The dimension name to validate. 
103 flow_id (str): The flow ID for error messages. 104 dimensions (FlowMetaDict): Dictionary containing the flow's dimensions. 105 106 Returns: 107 str: The codelist name if valid. 108 109 Raises: 110 ValueError: If dimension is not found or doesn't have a codelist. 111 112 """ 113 if dimension not in dimensions: 114 raise ValueError(f"Dimension '{dimension}' not found for flow '{flow_id}'") 115 116 dim_dict = dimensions[dimension] 117 if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict: 118 raise ValueError(f"Dimension '{dimension}' does not have a codelist for flow '{flow_id}'") 119 120 return dim_dict.get("id", "") 121 122 123def _process_match_criteria( 124 criteria: MatchCriteria, flow_id: str, dimensions: FlowMetaDict 125) -> dict[str, str]: 126 """Process match criteria and build the result dictionary. 127 128 Args: 129 criteria (MatchCriteria): The match criteria to process. 130 flow_id (str): The flow ID for error messages. 131 dimensions (FlowMetaDict): Dictionary containing the flow's dimensions. 132 133 Returns: 134 dict[str, str]: Dictionary of dimension codes. 135 136 """ 137 result_dict: dict[str, str] = {} 138 139 for pattern, dimension, match_type in criteria: 140 code_list_name = _validate_dimension(dimension, flow_id, dimensions) 141 codes = _get_codes(code_lists(code_list_name), pattern, match_type) 142 if codes: 143 _package_codes(codes, dimension, result_dict) 144 145 return result_dict 146 147 148# --- public function 149def match_item( 150 pattern: str, 151 dimension: str, 152 match_type: MatchType = MatchType.PARTIAL, 153) -> MatchItem: 154 """Create a new MatchItem for use in select_items() and fetch_selection(). 155 156 Args: 157 pattern (str): The pattern to match. 158 dimension (str): The dimension to match against. 159 match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT. 160 161 Returns: 162 MatchElement: A tuple representing the match element. 
163 164 """ 165 return (pattern, dimension, match_type) 166 167 168def make_wanted( 169 flow_id: str, 170 criteria: MatchCriteria, 171) -> pd.DataFrame: 172 """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata. 173 174 Args: 175 flow_id (str): The ID of the data flow to select items from. 176 criteria (MatchCriteria): A sequence of tuples containing the pattern, 177 dimension name, and match-type (exact, partial, or regex). 178 179 Returns: 180 pd.DataFrame: A DataFrame containing the selected items, which can be dropped 181 into the call of the function fetch_multi(). 182 183 Raises: 184 ValueError: If the flow_id is not valid or if no items match the criteria. 185 186 Notes: 187 - Should build a one line DataFrame. This Frame may select multiple data series, 188 when passed to fetch_multi. It also can be concatenated with other DataFrames 189 to build a larger selection. 190 - If two match elements refer to the same dimension, only the `intersection` of the 191 matches will be returned. 192 193 """ 194 dimensions = _validate_flow_and_dimensions(flow_id) 195 result_dict = _process_match_criteria(criteria, flow_id, dimensions) 196 197 # Add flow_id and return as DataFrame 198 result_dict["flow_id"] = flow_id 199 return pd.DataFrame([result_dict]).astype(str) 200 201 202def fetch_selection( 203 flow_id: str, 204 criteria: MatchCriteria, 205 parameters: dict[str, str] | None = None, 206 *, 207 validate: bool = False, 208 **kwargs: Unpack[GetFileKwargs], 209) -> tuple[pd.DataFrame, pd.DataFrame]: 210 """Fetch data based on a selection criteria for items. 211 212 Args: 213 flow_id (str): The ID of the data flow to fetch. 214 criteria (MatchCriteria): A sequence of match criteria to filter the data. 215 parameters (dict[str, str] | None, optional): Additional parameters for the fetch. 216 validate (bool, optional): If True, validate the selection against the flow's 217 required dimensions when generating the URL key. Defaults to False. 
218 **kwargs: Additional keyword arguments for the fetch_multi function. 219 220 Returns: 221 tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata. 222 223 """ 224 verbose = kwargs.get("verbose", False) 225 if verbose: 226 print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}") 227 228 selection = make_wanted(flow_id, criteria) 229 return fetch_multi(selection, parameters, validate=validate, **kwargs) 230 231 232if __name__ == "__main__": 233 234 def test_module() -> None: 235 """Test the match_item function.""" 236 # --- test match_item() 237 item = match_item("Australia", "REGION", MatchType.EXACT) 238 if item != ("Australia", "REGION", MatchType.EXACT): 239 print(f"Test failed: {item}") 240 else: 241 print("Test passed, match_item() works as expected.") 242 243 # --- specify a selection from the Wage Price Index (WPI) data flow 244 mat_criteria = [] 245 mat_criteria.append(match_item("Australia", "REGION", MatchType.EXACT)) 246 mat_criteria.append( 247 match_item( 248 "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT 249 ) 250 ) 251 mat_criteria.append( 252 match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL) 253 ) 254 mat_criteria.append(match_item("Seas|Trend", "TSEST", MatchType.REGEX)) 255 mat_criteria.append(match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT)) 256 mat_criteria.append(match_item("Private and Public", "SECTOR", MatchType.EXACT)) 257 258 # --- test the selection 259 expected_count = 2 # expecting two data series 260 parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"} 261 data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False) 262 if len(data.columns) == expected_count and meta.shape[0] == expected_count: 263 print("Test passed: Data and metadata have expected dimensions.") 264 else: 265 print(f"Test FAILED: Data columns 
{len(data.columns)}, Metadata rows {meta.shape[0]}") 266 expected_seasonal = {"Trend", "Seasonally Adjusted"} 267 if set(meta.TSEST.to_list()) == expected_seasonal: 268 print("Test passed: TSEST has expected values.") 269 else: 270 print(f"Test FAILED: TSEST values {meta.TSEST.to_list()}") 271 expected_shape = (4, 2) # 4 quarters of data, over two series 272 if data.shape == expected_shape: 273 print("Test passed: Fetched data has expected shape.") 274 else: 275 print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.") 276 277 test_module()
class MatchType(Enum):
    """Enumeration for match types.

    Selects how a search pattern is compared with the names in a codelist.
    """

    EXACT = 1  # code name must equal the pattern (case-sensitive)
    PARTIAL = 2  # pattern must occur within the code name (case-insensitive)
    REGEX = 3  # pattern is a regular expression searched for in the code name
Enumeration for match types.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in make_wanted() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match item.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A one-row DataFrame of strings, with a column for each
            matched dimension plus a "flow_id" column, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a
            criterion names a dimension that is unknown or has no codelist,
            or if a regex pattern is invalid.

    Notes:
        - Builds a one line DataFrame. This frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned. If that intersection is empty, the dimension is
          left unconstrained.
        - A criterion that matches no codes is ignored; no error is raised.

    """
    dimensions = _validate_flow_and_dimensions(flow_id)
    result_dict = _process_match_criteria(criteria, flow_id, dimensions)

    # Add flow_id and return as DataFrame
    result_dict["flow_id"] = flow_id
    return pd.DataFrame([result_dict]).astype(str)
Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or has no dimensions, if a criterion names a dimension that is unknown or has no codelist, or if a regex pattern is invalid.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data for the items that match a set of selection criteria.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    Raises:
        ValueError: If make_wanted() rejects the flow_id or the criteria.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # Translate the criteria into a one-row selection, then fetch it
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, parameters, validate=validate, **kwargs)
Fetch data for the items that match a set of selection criteria.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.