sdmxabs.select_items
Select items from the ABS Catalogue based on search criteria.
1"""Select items from the ABS Catalogue based on search criteria.""" 2 3import re 4from collections.abc import Sequence 5from enum import Enum 6from typing import Unpack 7 8import pandas as pd 9 10from sdmxabs.download_cache import GetFileKwargs 11from sdmxabs.fetch_multi import fetch_multi 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows 13 14 15# --- some types specific to this module 16class MatchType(Enum): 17 """Enumeration for match types.""" 18 19 EXACT = 1 20 PARTIAL = 2 21 REGEX = 3 22 23 24MatchItem = tuple[str, str, MatchType] # pattern, dimension, MatchType 25MatchCriteria = Sequence[MatchItem] # Sequence of tuples containing (pattern, dimension, MatchType) 26 27 28def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None: 29 """Package the codes into the return dictionary for a given dimension. 30 31 If the dimension already exists in the return_dict, we will intersect the newly 32 identified codes with the existing codes. If the intersection is a null set, the 33 dimension will be removed from the return_dict (ie. the global match). 34 35 Note: multiple matched codes are separated by a '+' sign in the return_dict. 
36 37 """ 38 if dimension in return_dict: 39 previous = return_dict[dimension].split("+") 40 codes = list(set(previous).intersection(set(codes))) 41 if not codes: 42 del return_dict[dimension] # no matches, remove dimension 43 if codes: 44 return_dict[dimension] = "+".join(list(set(codes))) 45 46 47# --- private functions 48def get_codes( 49 code_list_dict: FlowMetaDict, 50 pattern: str, 51 match_type: MatchType = MatchType.PARTIAL, 52) -> list[str]: 53 """Obtain all codes matching the pattern.""" 54 codes = [] 55 for code, code_list in code_list_dict.items(): 56 name = code_list.get("name", "") 57 match match_type: 58 case MatchType.EXACT: 59 if name == pattern: 60 codes.append(code) 61 case MatchType.PARTIAL: 62 # Case-insensitive partial match 63 if pattern.lower() in name.lower(): 64 codes.append(code) 65 case MatchType.REGEX: 66 if re.match(pattern, name): 67 codes.append(code) 68 return codes 69 70 71# --- public functions 72def match_criterion( 73 pattern: str, 74 dimension: str, 75 match_type: MatchType = MatchType.PARTIAL, 76) -> MatchItem: 77 """Create a new match criterion for use in selection. 78 79 Args: 80 pattern (str): The pattern to match. 81 dimension (str): The dimension to match against. 82 match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT. 83 84 Returns: 85 MatchElement: A tuple representing the match element. 86 87 """ 88 return (pattern, dimension, match_type) 89 90 91def select_items( 92 flow_id: str, 93 criteria: MatchCriteria, 94) -> pd.DataFrame: 95 """Build the `wanted` Dataframe for use by fetch_multi() by matching flow metadata. 96 97 Args: 98 flow_id (str): The ID of the data flow to select items from. 99 criteria (MatchElements): A sequence of tuples containing the element name, 100 the value to match, and the match type (exact, partial, or regex). 
101 102 Returns: 103 pd.DataFrame: A DataFrame containing the selected items, which can be dropped 104 into the call of the function fetch_multi(). 105 106 Raises: 107 ValueError: If the flow_id is not valid or if no items match the criteria. 108 109 Notes: 110 - Should build a one line DataFrame. This Frame may select multiple data series, 111 when passed to fetch_multi. It also can be concatenated with other DataFrames 112 to build a larger selection. 113 - If two match elements refer to the same dimension, only the `intersection` of the 114 matches will be returned. 115 116 """ 117 # --- some sanity checks 118 if flow_id not in data_flows(): 119 raise ValueError(f"Invalid flow_id: {flow_id}.") 120 dimensions = data_dimensions(flow_id) 121 if not dimensions: 122 raise ValueError(f"No dimensions found for flow_id: {flow_id}.") 123 124 # --- lets build the codelist dictionary 125 return_dict: dict[str, str] = {} 126 for pattern, dimension, match_type in criteria: 127 if dimension not in dimensions: 128 print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)") 129 continue 130 dim_dict = dimensions[dimension] 131 if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict: 132 print(f"Dimension '{dimension}' does not have a codelist; (skipping)") 133 continue 134 code_list_name = dim_dict.get("id") 135 codes = get_codes(code_lists(code_list_name), pattern, match_type) 136 package_codes(codes, dimension, return_dict) 137 138 # --- return as a (one row) `wanted` DataFrame 139 return_dict["flow_id"] = flow_id 140 return pd.DataFrame([return_dict]).astype(str) 141 142 143def fetch_selection( 144 flow_id: str, 145 criteria: MatchCriteria, 146 *, 147 validate: bool = False, 148 **kwargs: Unpack[GetFileKwargs], 149) -> tuple[pd.DataFrame, pd.DataFrame]: 150 """Fetch data based on a selection criteria for items. 151 152 Args: 153 flow_id (str): The ID of the data flow to fetch. 
154 criteria (MatchCriteria): A sequence of match criteria to filter the data. 155 validate (bool, optional): If True, validate the selection against the flow's 156 required dimensions. Defaults to False. 157 **kwargs: Additional keyword arguments for the fetch_multi function. 158 159 Returns: 160 tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata. 161 162 """ 163 selection = select_items(flow_id, criteria) 164 return fetch_multi(selection, validate=validate, **kwargs) 165 166 167# --- quick and dirty testing 168if __name__ == "__main__": 169 # --- specify a selection from the Wage Price Index (WPI) data flow 170 mat_criteria = [] 171 mat_criteria.append(match_criterion("Australia", "REGION", MatchType.EXACT)) 172 mat_criteria.append( 173 match_criterion( 174 "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT 175 ) 176 ) 177 mat_criteria.append( 178 match_criterion("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL) 179 ) 180 mat_criteria.append(match_criterion("Seas|Trend", "TSEST", MatchType.REGEX)) 181 mat_criteria.append(match_criterion("13-Industry aggregate", "INDUSTRY", MatchType.EXACT)) 182 mat_criteria.append(match_criterion("Private and Public", "SECTOR", MatchType.EXACT)) 183 184 # --- test the selection 185 print(select_items("WPI", mat_criteria)) 186 data, meta = fetch_selection("WPI", mat_criteria) 187 print(f"Number of data series: {len(meta)}") # should be 2 188 print(meta.T) # should have the Trend and Seasonally Adjusted series
class MatchType(Enum):
    """Ways in which a search pattern can be compared with a code's name."""

    # The name must equal the pattern.
    EXACT = 1
    # The pattern must occur within the name, ignoring case.
    PARTIAL = 2
    # The pattern is a regular expression applied to the name.
    REGEX = 3
Enumeration for match types.
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
    """Package the codes into the return dictionary for a given dimension.

    If the dimension already exists in the return_dict, the newly identified codes
    are intersected with the existing codes. If that intersection is empty, the
    dimension is removed from the return_dict (i.e. the global match).

    Note: multiple matched codes are separated by a '+' sign in the return_dict.
    Codes are de-duplicated and written in sorted order so that the resulting
    string is reproducible from run to run (set iteration order is not).

    Args:
        codes (list[str]): The codes matched for this dimension.
        dimension (str): The dimension name used as the key in return_dict.
        return_dict (dict[str, str]): The selection being built; modified in place.

    """
    wanted = set(codes)
    if dimension in return_dict:
        # Narrow the earlier selection for this dimension to the common codes.
        wanted &= set(return_dict[dimension].split("+"))
        if not wanted:
            del return_dict[dimension]  # no matches, remove dimension
            return
    if wanted:
        return_dict[dimension] = "+".join(sorted(wanted))
Package the codes into the return dictionary for a given dimension.
If the dimension already exists in the return_dict, the newly identified codes are intersected with the existing codes. If the intersection is empty, the dimension is removed from the return_dict (i.e. the global match).
Note: multiple matched codes are separated by a '+' sign in the return_dict.
def get_codes(
    code_list_dict: FlowMetaDict,
    pattern: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> list[str]:
    """Return every code whose name satisfies the pattern.

    Args:
        code_list_dict (FlowMetaDict): Mapping of code to that code's metadata; the
            "name" entry of the metadata (empty string when absent) is what is tested.
        pattern (str): The text or regular expression to look for.
        match_type (MatchType, optional): How the pattern is compared with each
            name. Defaults to MatchType.PARTIAL.

    Returns:
        list[str]: The matching codes, in the iteration order of code_list_dict.

    """

    def is_match(name: str) -> bool:
        """Apply the requested comparison to a single code name."""
        if match_type == MatchType.EXACT:
            return name == pattern
        if match_type == MatchType.PARTIAL:
            # Case-insensitive partial match
            return pattern.lower() in name.lower()
        if match_type == MatchType.REGEX:
            # re.match() only matches at the start of the name
            return re.match(pattern, name) is not None
        return False  # unrecognised match type: nothing is selected

    return [code for code, entry in code_list_dict.items() if is_match(entry.get("name", ""))]
Obtain all codes matching the pattern.
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle a pattern, a dimension and a match type into one selection criterion.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The (pattern, dimension, match_type) tuple.

    """
    criterion: MatchItem = (pattern, dimension, match_type)
    return criterion
Create a new match criterion for use in selection.
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match criterion.
def select_items(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Turn match criteria into the one-row `wanted` DataFrame used by fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension, match type)
            tuples, where the match type is exact, partial, or regex.

    Returns:
        pd.DataFrame: A single-row DataFrame of strings with a "flow_id" column and
            one column for each dimension that ended up with matched codes.

    Raises:
        ValueError: If the flow_id is not valid or if no dimensions are found for it.

    Notes:
        - The single row may select multiple data series when passed to fetch_multi.
          It can also be concatenated with other DataFrames to build a larger
          selection.
        - If two criteria refer to the same dimension, only the `intersection` of
          their matches is kept.
        - A criterion whose dimension is unknown for the flow, or has no codelist,
          is reported with print() and skipped.

    """
    # --- refuse to continue with an unknown flow or a flow without dimensions
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    flow_dimensions = data_dimensions(flow_id)
    if not flow_dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- accumulate the matched codes, one criterion at a time
    selected: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in flow_dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue

        dim_meta = flow_dimensions[dimension]
        has_codelist = (
            "package" in dim_meta and dim_meta["package"] == "codelist" and "id" in dim_meta
        )
        if not has_codelist:
            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
            continue

        matched = get_codes(code_lists(dim_meta.get("id")), pattern, match_type)
        package_codes(matched, dimension, selected)

    # --- the flow identifier travels in the same row as the dimension codes
    selected["flow_id"] = flow_id
    wanted = pd.DataFrame([selected])
    return wanted.astype(str)
Build the wanted Dataframe for use by fetch_multi() by matching flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no dimensions are found for the flow_id.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items with select_items() and fetch them with fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Forwarded unchanged to fetch_multi(); when True
            the selection is validated against the flow's required dimensions.
            Defaults to False.
        **kwargs: Additional keyword arguments, forwarded unchanged to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # Build the one-row selection and hand it straight to the fetcher.
    return fetch_multi(select_items(flow_id, criteria), validate=validate, **kwargs)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.