sdmxabs.fetch_selection
Select one or more data series from the ABS Catalogue based on search criteria.
1"""Select one or more data series from the ABS Catalogue based on search criteria.""" 2 3import re 4from collections.abc import Sequence 5from enum import Enum 6from typing import Unpack 7 8import pandas as pd 9 10from sdmxabs.download_cache import GetFileKwargs 11from sdmxabs.fetch_multi import fetch_multi 12from sdmxabs.flow_metadata import ( 13 CODE_LIST_ID, 14 FlowMetaDict, 15 code_lists, 16 structure_from_flow_id, 17) 18 19 20# --- some types specific to this module 21class MatchType(Enum): 22 """Enumeration for match types.""" 23 24 EXACT = 1 25 PARTIAL = 2 26 REGEX = 3 27 28 29MatchItem = tuple[str, str, MatchType] # pattern, dimension, MatchType 30MatchCriteria = Sequence[MatchItem] # Sequence of tuples containing (pattern, dimension, MatchType) 31 32 33# --- private functions 34def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None: 35 """Package the codes into the return dictionary for a given dimension. 36 37 If the dimension already exists in the return_dict, we will intersect the newly 38 identified codes with the existing codes. If the intersection is a null set, the 39 dimension will be removed from the return_dict (ie. the global match). 40 41 Note: multiple matched codes are separated by a '+' sign in the return_dict. 
42 43 """ 44 if dimension in return_dict: 45 previous = set(return_dict[dimension].split("+")) 46 codes = list(previous.intersection(set(codes))) 47 if not codes: 48 del return_dict[dimension] # no intersecting matches, remove dimension 49 if codes: 50 return_dict[dimension] = "+".join(sorted(set(codes))) 51 52 53def _get_codes( 54 code_list_dict: FlowMetaDict, 55 pattern: str, 56 match_type: MatchType = MatchType.PARTIAL, 57) -> list[str]: 58 """Obtain all codes matching the pattern.""" 59 codes = [] 60 for code, code_list in code_list_dict.items(): 61 name = code_list.get("name", "") 62 if not name: 63 # should not happen, but if it does, raise an error 64 raise ValueError(f"Code '{code}' has no name in codelist") 65 match match_type: 66 case MatchType.EXACT: 67 if name == pattern: 68 codes.append(code) 69 case MatchType.PARTIAL: 70 # Case-insensitive partial match 71 if pattern.lower() in name.lower(): 72 codes.append(code) 73 case MatchType.REGEX: 74 try: 75 if re.search(pattern, name): 76 codes.append(code) 77 except re.error as e: 78 raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e 79 return codes 80 81 82def _process_match_criteria(criteria: MatchCriteria, structure: FlowMetaDict) -> dict[str, str]: 83 """Process match criteria and build the result dictionary. 84 85 Args: 86 criteria (MatchCriteria): The match criteria to process. 87 structure (FlowMetaDict): Dictionary containing the data structure. 88 89 Returns: 90 dict[str, str]: Dictionary of dimension codes. 
91 92 """ 93 result_dict: dict[str, str] = {} 94 95 for pattern, dim_name, match_type in criteria: 96 if dim_name not in structure: 97 raise ValueError(f"Dimension '{dim_name}' not found in structure.") 98 dim_dict = structure[dim_name] 99 if not pattern: 100 raise ValueError(f"Pattern for dimension '{dim_name}' cannot be empty.") 101 if "package" not in dim_dict or dim_dict["package"] != "codelist" or CODE_LIST_ID not in dim_dict: 102 raise ValueError(f"Dimension '{dim_name}' does not have a codelist.") 103 code_list_name = dim_dict.get(CODE_LIST_ID, "") 104 codes = _get_codes(code_lists(code_list_name), pattern, match_type) 105 if codes: 106 _package_codes(codes, dim_name, result_dict) 107 108 return result_dict 109 110 111# --- public function 112def match_item( 113 pattern: str, 114 dimension: str, 115 match_type: MatchType = MatchType.PARTIAL, 116) -> MatchItem: 117 """Create a new MatchItem for use in select_items() and fetch_selection(). 118 119 Args: 120 pattern (str): The pattern to match. 121 dimension (str): The dimension to match against. 122 match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT. 123 124 Returns: 125 MatchElement: A tuple representing the match element. 126 127 Note: 128 This function is of little value. It is much easier to create the tuple directly. 129 130 """ 131 return (pattern, dimension, match_type) 132 133 134def make_wanted( 135 flow_id: str, 136 criteria: MatchCriteria, 137) -> pd.DataFrame: 138 """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata. 139 140 Args: 141 flow_id (str): The ID of the data flow to select items from. 142 criteria (MatchCriteria): A sequence of tuples containing the pattern, 143 dimension name, and match-type (exact, partial, or regex). 144 145 Returns: 146 pd.DataFrame: A DataFrame containing the selected items, which can be dropped 147 into the call of the function fetch_multi(). 
148 149 Raises: 150 ValueError: If the flow_id is not valid or if no items match the criteria. 151 152 Notes: 153 - Should build a one line DataFrame. This Frame may select multiple data series, 154 when passed to fetch_multi. It also can be concatenated with other DataFrames 155 to build a larger selection. 156 - If two match elements refer to the same dimension, only the `intersection` of the 157 matches will be returned. 158 159 """ 160 structure = structure_from_flow_id(flow_id) 161 result_dict = _process_match_criteria(criteria, structure) 162 163 # Add flow_id and return as DataFrame 164 result_dict["flow_id"] = flow_id 165 return pd.DataFrame([result_dict]).astype(str) 166 167 168def fetch_selection( 169 flow_id: str, 170 criteria: MatchCriteria, 171 parameters: dict[str, str] | None = None, 172 *, 173 validate: bool = False, 174 **kwargs: Unpack[GetFileKwargs], 175) -> tuple[pd.DataFrame, pd.DataFrame]: 176 """Fetch data based on a selection criteria for items. 177 178 Args: 179 flow_id (str): The ID of the data flow to fetch. 180 criteria (MatchCriteria): A sequence of match criteria to filter the data. 181 parameters (dict[str, str] | None, optional): Additional parameters for the fetch. 182 validate (bool, optional): If True, validate the selection against the flow's 183 required dimensions when generating the URL key. Defaults to False. 184 **kwargs: Additional keyword arguments for the fetch_multi function. 185 186 Returns: 187 tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata. 
188 189 """ 190 verbose = kwargs.get("verbose", False) 191 if verbose: 192 print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}") 193 194 selection = make_wanted(flow_id, criteria) 195 return fetch_multi(selection, parameters, validate=validate, **kwargs) 196 197 198if __name__ == "__main__": 199 200 def test_module() -> None: 201 """Test the match_item function.""" 202 # --- test match_item() 203 item = match_item("Australia", "REGION", MatchType.EXACT) 204 if item != ("Australia", "REGION", MatchType.EXACT): 205 print(f"Test failed: {item}") 206 else: 207 print("Test passed, match_item() works as expected.") 208 209 # --- specify a selection from the Wage Price Index (WPI) data flow 210 mat_criteria = [] 211 mat_criteria.append(match_item("Australia", "REGION", MatchType.EXACT)) 212 mat_criteria.append( 213 match_item( 214 "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT 215 ) 216 ) 217 mat_criteria.append( 218 match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL) 219 ) 220 mat_criteria.append(match_item("Seas|Trend", "TSEST", MatchType.REGEX)) 221 mat_criteria.append(match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT)) 222 mat_criteria.append(match_item("Private and Public", "SECTOR", MatchType.EXACT)) 223 224 # --- test the selection 225 expected_count = 2 # expecting two data series 226 parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"} 227 data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False) 228 if len(data.columns) == expected_count and meta.shape[0] == expected_count: 229 print("Test passed: Data and metadata have expected dimensions.") 230 else: 231 print(f"Test FAILED: Data columns {len(data.columns)}, Metadata rows {meta.shape[0]}") 232 expected_seasonal = {"Trend", "Seasonally Adjusted"} 233 print(meta) 234 if set(meta.TSEST.to_list()) == expected_seasonal: 235 
print("Test passed: TSEST has expected values.") 236 else: 237 print(f"Test FAILED: TSEST values {meta.TSEST.to_list()}") 238 expected_shape = (4, 2) # 4 quarters of data, over two series 239 if data.shape == expected_shape: 240 print("Test passed: Fetched data has expected shape.") 241 else: 242 print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.") 243 244 test_module()
class MatchType(Enum):
    """Enumeration of the ways a pattern can be compared with a code's name."""

    EXACT = 1  # name must equal the pattern
    PARTIAL = 2  # pattern must occur somewhere in the name
    REGEX = 3  # pattern is treated as a regular expression
Enumeration for match types.
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple.

    Note:
        This function is of little value. It is much easier to create the tuple directly.

    """
    return (pattern, dimension, match_type)
Create a new MatchItem for use in make_wanted() and fetch_selection().
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match item.
Note: This function is of little value. It is much easier to create the tuple directly.
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If a criterion names a dimension that is not in the flow's
            structure or that has no codelist, if a pattern is empty, or if a
            regex pattern is invalid. A criterion that matches no codes does
            not raise; it is simply ignored.

    Notes:
        - Builds a one-row DataFrame. This frame may select multiple data series
          when passed to fetch_multi. It can also be concatenated with other
          DataFrames to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection`
          of the matches will be returned.

    """
    structure = structure_from_flow_id(flow_id)
    result_dict = _process_match_criteria(criteria, structure)

    # Add flow_id and return as DataFrame; every column is cast to str
    result_dict["flow_id"] = flow_id
    return pd.DataFrame([result_dict]).astype(str)
Build a `wanted` DataFrame for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
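Raises: ValueError: If a criterion names a dimension that is not in the flow's structure or that has no codelist, if a pattern is empty, or if a regex pattern is invalid. An invalid flow_id may also raise, depending on structure_from_flow_id(). Criteria that match no codes are ignored rather than raising an error.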
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments for the fetch_multi function. If
            "verbose" is truthy, the call arguments are printed first.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # Echo the call arguments when the caller asked for verbose output.
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # Translate the criteria into a one-row `wanted` frame, then delegate the fetch.
    selection = make_wanted(flow_id, criteria)
    return fetch_multi(selection, parameters, validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.