sdmxabs.select_items
Select items from the ABS Catalogue based on search criteria.
1"""Select items from the ABS Catalogue based on search criteria.""" 2 3import re 4from collections.abc import Sequence 5from enum import Enum 6 7import pandas as pd 8 9from sdmxabs.fetch_multi import fetch_multi 10from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows 11 12 13# --- some types specific to this module 14class MatchType(Enum): 15 """Enumeration for match types.""" 16 17 EXACT = 1 18 PARTIAL = 2 19 REGEX = 3 20 21 22MatchItem = tuple[str, str, MatchType] 23MatchCriteria = Sequence[MatchItem] 24 25 26# --- private functions 27def get_codes( 28 code_list_dict: FlowMetaDict, 29 pattern: str, 30 match_type: MatchType = MatchType.PARTIAL, 31) -> list[str]: 32 """Obtain all codes matching the pattern.""" 33 codes = [] 34 for code, code_list in code_list_dict.items(): 35 name = code_list.get("name", "") 36 match match_type: 37 case MatchType.EXACT: 38 if name == pattern: 39 codes.append(code) 40 case MatchType.PARTIAL: 41 if pattern in name: 42 codes.append(code) 43 case MatchType.REGEX: 44 if re.match(pattern, name): 45 codes.append(code) 46 return codes 47 48 49def get_code_list_dict(dimension: str, dim_dict: dict[str, str]) -> FlowMetaDict: 50 """Get the codelist dictionary for a given dimension.""" 51 if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict: 52 print(f"Dimension '{dimension}' does not have a codelist; (skipping)") 53 return {} 54 code_list_name = dim_dict.get("id") 55 return code_lists(code_list_name) 56 57 58# --- public functions 59def match_criterion( 60 pattern: str, 61 dimension: str, 62 match_type: MatchType = MatchType.PARTIAL, 63) -> MatchItem: 64 """Create a new match criterion for use in selection. 65 66 Args: 67 pattern (str): The pattern to match. 68 dimension (str): The dimension to match against. 69 match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT. 
70 71 Returns: 72 MatchElement: A tuple representing the match element. 73 74 """ 75 return (pattern, dimension, match_type) 76 77 78def select_items( 79 flow_id: str, 80 criteria: MatchCriteria, 81) -> pd.DataFrame: 82 """Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata. 83 84 Args: 85 flow_id (str): The ID of the data flow to select items from. 86 criteria (MatchElements): A sequence of tuples containing the element name, 87 the value to match, and the match type (exact, partial, or regex). 88 89 Returns: 90 pd.DataFrame: A DataFrame containing the selected items, which can be dropped 91 into the call of the function fetch_multi(). 92 93 Raises: 94 ValueError: If the flow_id is not valid or if no items match the criteria. 95 96 Notes: 97 - Should build a one line DataFrame. This Frame may select multiple data series, 98 when passed to fetch_multi. It also can be concatenated with other DataFrames 99 to build a larger selection. 100 - If two match elements refer to the same dimension, only the `intersection` of the 101 matches will be returned. 
102 103 """ 104 # --- some sanity checks 105 if flow_id not in data_flows(): 106 raise ValueError(f"Invalid flow_id: {flow_id}.") 107 dimensions = data_dimensions(flow_id) 108 if not dimensions: 109 raise ValueError(f"No dimensions found for flow_id: {flow_id}.") 110 111 # --- lets build the codelist dictionary 112 return_dict: dict[str, str] = {} 113 for pattern, dimension, match_type in criteria: 114 if dimension not in dimensions: 115 print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)") 116 continue 117 dim_dict = dimensions[dimension] 118 code_list_dict = get_code_list_dict(dimension, dim_dict) 119 if not code_list_dict: 120 continue 121 122 codes = get_codes(code_list_dict, pattern, match_type) 123 124 # --- combine (as an intersection) with previous matches for this dimension 125 if dimension in return_dict: 126 previous = return_dict[dimension].split("+") 127 codes = list(set(previous).intersection(set(codes))) 128 if not codes: 129 del return_dict[dimension] # no matches, remove dimension 130 if codes: 131 return_dict[dimension] = "+".join(list(set(codes))) 132 133 # --- return a DataFrame 134 return_dict["flow_id"] = flow_id 135 return pd.DataFrame([return_dict]) 136 137 138def fetch_selection( 139 flow_id: str, 140 criteria: MatchCriteria, 141) -> tuple[pd.DataFrame, pd.DataFrame]: 142 """Fetch data based on a selection criteria for items. 143 144 Args: 145 flow_id (str): The ID of the data flow to fetch. 146 criteria (MatchCriteria): A sequence of match criteria to filter the data. 147 148 Returns: 149 tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata. 
150 151 """ 152 # --- select items based on the criteria 153 selection = select_items(flow_id, criteria) 154 155 # --- fetch the data using the selected items 156 return fetch_multi(selection) 157 158 159# --- quick and dirty testing 160if __name__ == "__main__": 161 # --- specify a selection from the Wage Price Index (WPI) data flow 162 mat_criteria = [] 163 mat_criteria.append(match_criterion("Australia", "REGION", MatchType.EXACT)) 164 mat_criteria.append( 165 match_criterion( 166 "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT 167 ) 168 ) 169 mat_criteria.append( 170 match_criterion("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL) 171 ) 172 mat_criteria.append(match_criterion("Seas|Trend", "TSEST", MatchType.REGEX)) 173 mat_criteria.append(match_criterion("13-Industry aggregate", "INDUSTRY", MatchType.EXACT)) 174 mat_criteria.append(match_criterion("Private and Public", "SECTOR", MatchType.EXACT)) 175 176 # --- test the selection 177 print(select_items("WPI", mat_criteria)) 178 data, meta = fetch_selection("WPI", mat_criteria) 179 print(f"Number of data series: {len(meta)}") # should be 2 180 print(meta.T) # should have the Trend and Seasonally Adjusted series
class MatchType(Enum):
    """The ways a search pattern can be compared with a code's name."""

    # the name must equal the pattern
    EXACT = 1
    # the pattern must occur somewhere in the name
    PARTIAL = 2
    # the pattern is a regular expression applied with re.match()
    REGEX = 3
Enumeration for match types.
def get_codes(
    code_list_dict: FlowMetaDict,
    pattern: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> list[str]:
    """Return every code whose "name" entry matches the pattern.

    Args:
        code_list_dict (FlowMetaDict): Maps each code to its metadata; the "name"
            entry (empty string if absent) is the text that is matched.
        pattern (str): The text or regular expression to look for.
        match_type (MatchType, optional): How to compare the pattern with the name.
            Defaults to MatchType.PARTIAL.

    Returns:
        list[str]: The matching codes, in the iteration order of code_list_dict.

    """

    def is_match(name: str) -> bool:
        """Apply the requested comparison to a single name."""
        if match_type == MatchType.EXACT:
            return name == pattern
        if match_type == MatchType.PARTIAL:
            return pattern in name
        if match_type == MatchType.REGEX:
            # re.match() only matches at the start of the name
            return re.match(pattern, name) is not None
        return False  # any other match type selects nothing

    return [code for code, entry in code_list_dict.items() if is_match(entry.get("name", ""))]
Obtain all codes matching the pattern.
def get_code_list_dict(dimension: str, dim_dict: dict[str, str]) -> FlowMetaDict:
    """Look up the codelist that backs a dimension.

    Args:
        dimension (str): The dimension name (only used in the printed notice).
        dim_dict (dict[str, str]): The metadata for the dimension.

    Returns:
        FlowMetaDict: The result of code_lists() for the codelist named by the "id"
            entry, or an empty dictionary (after printing a notice) when the
            "package" entry is not "codelist" or there is no "id" entry.

    """
    has_codelist = dim_dict.get("package") == "codelist" and "id" in dim_dict
    if has_codelist:
        return code_lists(dim_dict.get("id"))

    print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
    return {}
Get the codelist dictionary for a given dimension.
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle one search criterion into the tuple form used for selection.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A (pattern, dimension, match_type) tuple.

    """
    criterion: MatchItem = pattern, dimension, match_type
    return criterion
Create a new match criterion for use in selection.
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match criterion.
def select_items(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the 'wanted' DataFrame for use by fetch_multi() by matching data flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension, match_type)
            tuples, as produced by match_criterion().

    Returns:
        pd.DataFrame: A one-row DataFrame with a column per matched dimension
            (codes joined with "+") plus a "flow_id" column, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid, if no dimensions are found for it,
            or if the criteria for some dimension match no codes.

    Notes:
        - Builds a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the `intersection` of the
          matches will be returned.
        - Criteria naming a dimension that is not in the flow, or for which
          get_code_list_dict() returns nothing, are skipped.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- match each criterion, intersecting the matches per dimension
    matched: dict[str, list[str]] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        code_list_dict = get_code_list_dict(dimension, dimensions[dimension])
        if not code_list_dict:
            continue

        # dict.fromkeys() removes duplicates while keeping first-seen order,
        # so the joined code string is the same on every run
        codes = list(dict.fromkeys(get_codes(code_list_dict, pattern, match_type)))
        if dimension in matched:
            keep = set(codes)
            codes = [code for code in matched[dimension] if code in keep]
        # an empty list is recorded on purpose: dropping the dimension would
        # widen the selection to every code instead of narrowing it to none
        matched[dimension] = codes

    unmatched = [dimension for dimension, codes in matched.items() if not codes]
    if unmatched:
        where = ", ".join(unmatched)
        raise ValueError(f"No items match the criteria for flow_id: {flow_id} ({where}).")

    # --- return a DataFrame
    return_dict: dict[str, str] = {dimension: "+".join(codes) for dimension, codes in matched.items()}
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict])
Build the 'wanted' DataFrame for use by fetch_multi() by matching data flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples, each containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the `intersection` of the matches will be returned.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow and fetch them in one step.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # build the selection with select_items() and hand it straight to fetch_multi()
    return fetch_multi(select_items(flow_id, criteria))
Fetch data based on selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.