readabs.read_abs_by_desc
Get specific ABS data series by searching for the ABS data item descriptions.
This module provides functionality to search and retrieve ABS data series by their descriptions rather than series IDs.
"""Get specific ABS data series by searching for the ABS data item descriptions.

This module provides functionality to search and retrieve ABS data series
by their descriptions rather than series IDs.
"""

import inspect
from typing import Any

# Analytic imports
import pandas as pd

# local imports
from readabs.abs_meta_data import metacol as mc
from readabs.read_abs_cat import read_abs_cat
from readabs.search_abs_meta import find_abs_id


# --- private functions
def _work_to_do(wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None) -> bool:
    """Check if there is any work to do.

    Prints a short message and returns False when `wanted` is None or empty;
    returns True otherwise.
    """
    if wanted is None or len(wanted) == 0:
        print("No data requested.")
        return False
    return True


def _wlist_to_wdict(wanted: list[str]) -> dict[str, str]:
    """Convert a list of strings to a dictionary of strings:strings.

    Note: the keys and values are the same.
    Note: any duplicate elements in the list will be lost.
    """
    return {k: k for k in wanted}


def _get_search_terms(input_dict: dict[str, Any], output_dict: dict[str, str]) -> dict[str, str]:
    """Build a selector dictionary from the input dictionary.

    For every public attribute name of `mc` that is also a key of
    `input_dict`, an entry is added to `output_dict` (which is updated in
    place and also returned).
    """
    search_names = {abbr: term for abbr, term in inspect.getmembers(mc) if not abbr.startswith("_")}
    for mc_abbr, meta_column in search_names.items():
        if mc_abbr in input_dict:
            # the selector dictionary is back-to-front
            # ie. {value_sought: column_name}
            output_dict[input_dict[mc_abbr]] = meta_column
    return output_dict


def _get_args(keys: list[str], input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
    """Build a retrieval dictionary from the input dictionary.

    Copies each of `keys` that is present in `input_dict` into `output_dict`
    (overwriting any existing entry), and returns `output_dict`.
    """
    for key in keys:
        if key in input_dict:
            output_dict[key] = input_dict[key]
    return output_dict


def _get_search_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
    """Extract the search arguments from the input dictionary."""
    keys = ["validate_unique", "exact_match", "regex", "verbose"]
    return _get_args(keys, input_dict, output_dict)


def _get_retrieval_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
    """Extract the retrieval arguments from the input dictionary."""
    keys = [
        "ignore_errors",
        "get_zip",
        "get_excel_if_no_zip",
        "get_excel",
        "cache_only",
        "single_excel_only",
        "single_zip_only",
        "verbose",
    ]
    return _get_args(keys, input_dict, output_dict)


def _get_item_from_str(
    item: str,
    data_dict: dict[str, pd.DataFrame],
    data_meta: pd.DataFrame,
    item_selector: dict[str, str],
    search_args: dict[str, Any],
) -> tuple[pd.Series, pd.DataFrame]:
    """Get a data series from the data dictionary and metadata.

    Give the series its series-id as a name.

    Raises
    ------
    ValueError
        If `data_dict` is empty or `data_meta` is empty.
    """
    if not data_dict or data_meta.empty:
        raise ValueError(
            "If the wanted data is a string, a populated abs_dict and abs_meta must be provided."
        )
    item_selector[item] = mc.did  # back_to_front
    table, series_id, units = find_abs_id(data_meta, item_selector, **search_args)

    series = data_dict[table][series_id]
    series.name = series_id
    series_meta = data_meta.loc[
        (data_meta[mc.table] == table) & (data_meta[mc.id] == series_id) & (data_meta[mc.unit] == units)
    ]
    return series, series_meta


def _get_item_from_dict(
    item_dict: dict[str, Any],
    data_dict: dict[str, pd.DataFrame],
    data_meta: pd.DataFrame,
    item_selector: dict[str, str],
    search_args: dict[str, Any],
    **kwargs: Any,
) -> tuple[pd.Series, pd.DataFrame]:
    """Get a data series described by a per-item dictionary of options.

    The inner dictionary must contain a "did" key (the description to search
    for). Its other keys may add selector terms, search arguments and
    retrieval arguments. If `data_dict` is empty or `data_meta` is empty, the
    data is first retrieved with read_abs_cat() using the "cat" key of the
    inner dictionary; retrieval arguments found in `kwargs` are overridden by
    those found in the inner dictionary.

    Raises
    ------
    ValueError
        If the inner dictionary has no "did" key, or if data must be
        retrieved and the inner dictionary has no "cat" key.
    """
    # preparation
    if "did" not in item_dict:
        raise ValueError("Each inner dictionary must contain a 'did' key.")
    # Work on a shallow copy: popping "did" from the caller's own dictionary
    # would make a second call with the same `wanted` fail.
    item_dict = dict(item_dict)
    item = item_dict.pop("did")
    item_selector = _get_search_terms(item_dict, item_selector)
    item_search_args = _get_search_args(item_dict, search_args)

    if not data_dict or data_meta.empty:
        # data retrieval required
        if "cat" not in item_dict:
            raise ValueError(
                "Each inner dictionary must contain a 'cat' key, "
                "if an abs_dict is not provided/empty or the "
                "abs_meta is not provided/empty."
            )
        ret_args = _get_retrieval_args(kwargs, {})
        ret_args = _get_retrieval_args(item_dict, ret_args)
        data_dict, data_meta = read_abs_cat(cat=item_dict["cat"], **ret_args)

    # series extraction based on search terms
    series, series_meta = _get_item_from_str(
        item=item,
        data_dict=data_dict,
        data_meta=data_meta,
        item_selector=item_selector,
        search_args=item_search_args,
    )
    return series, series_meta


# --- public functions
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
          read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
          read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
          The following arguments, if present, will also be used (ie.
          passed to read_abs_cat()): ["ignore_errors", "get_zip",
          "get_excel_if_no_zip", "get_excel", "cache_only",
          "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
          will be used to construct the selector: "cat", "did",
          "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
          "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
          and search_abs_meta() functions: ["validate_unique", "exact_match",
          "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
      include sufficient keys from the metacol dataclass to get the data.
      Typically, the "cat" key, the "table" key, and the "stype" key would
      be required. The did key would be taken from the wanted list or
      dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
      must contain a "did" key. The other keys that can be used for the
      data retrieval are the same as the metacol dataclass fields, namely:
      "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
      "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
      used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
      type dict[str, dict[str, Any]] and (2) the inner dictionary must
      contain a "cat" key so the data can be retrieved. Other keys that
      can be used for the data retrieval are the same as for read_abs_cat(),
      namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
      "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, keyed by the keys of `wanted`
      (when `wanted` is a list, each description is its own key).
      The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in `wanted` is neither a string nor a dictionary.
    ValueError
        If required data or keys ("did", "cat") are missing.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item starts from fresh copies of the call-level settings
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once, rather than growing a DataFrame inside the loop
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta


# --- testing ---
if __name__ == "__main__":
    # --- test 1: get a list of dids
    def test1() -> None:
        """Test case: get a list of dids."""
        cat = "5206.0"
        table = "5206001_Key_Aggregates"
        data_dict, data_meta = read_abs_cat(cat=cat, single_excel_only=table, verbose=False)
        stype = "Seasonally Adjusted"
        get_these = data_meta.loc[
            (data_meta[mc.table] == table)
            & (data_meta[mc.stype] == stype)
            & data_meta[mc.unit].str.contains("Million")
            & data_meta[mc.did].str.contains("Chain volume measures")
        ][mc.did].to_list()
        print(f"get_these: {get_these}")

        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
            abs_dict=data_dict,
            abs_meta=data_meta,
            # exact_match=True, verbose=True,
            table=table,
            stype=stype,
        )
        print(selected, selected_meta)

    test1()

    # --- test 2: get a dictionary of dids
    def test2() -> None:
        """Test case: get a dictionary of dids."""
        gdp_table = "5206001_Key_Aggregates"
        uer_table = "6202001"
        sa = "Seasonally Adjusted"
        get_these = {
            # two series, each from two different ABS Catalogue Numbers
            "GDP": {
                "cat": "5206.0",
                "table": gdp_table,
                "stype": sa,
                "did": "Gross domestic product: Chain volume measures ;",
                "single_excel_only": gdp_table,
            },
            "Unemployment Rate": {
                "cat": "6202.0",
                "table": uer_table,
                "stype": sa,
                "did": "Unemployment rate ;  Persons ;",
                "single_excel_only": uer_table,
            },
        }
        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
        )

        print(selected_meta)
        print(selected)

    test2()
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
          read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
          read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
          The following arguments, if present, will also be used (ie.
          passed to read_abs_cat()): ["ignore_errors", "get_zip",
          "get_excel_if_no_zip", "get_excel", "cache_only",
          "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
          will be used to construct the selector: "cat", "did",
          "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
          "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
          and search_abs_meta() functions: ["validate_unique", "exact_match",
          "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
      include sufficient keys from the metacol dataclass to get the data.
      Typically, the "cat" key, the "table" key, and the "stype" key would
      be required. The did key would be taken from the wanted list or
      dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
      must contain a "did" key. The other keys that can be used for the
      data retrieval are the same as the metacol dataclass fields, namely:
      "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
      "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
      used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
      type dict[str, dict[str, Any]] and (2) the inner dictionary must
      contain a "cat" key so the data can be retrieved. Other keys that
      can be used for the data retrieval are the same as for read_abs_cat(),
      namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
      "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, keyed by the keys of `wanted`
      (when `wanted` is a list, each description is its own key).
      The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in `wanted` is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item starts from fresh copies of the call-level settings
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                # pass a shallow copy so the caller's inner dictionary is
                # never modified downstream and `wanted` can be reused
                item_dict=dict(value),
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once, rather than growing a DataFrame inside the loop
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta
Get specific ABS data series by searching the ABS meta data.
Parameters
wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].
Notes
- if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary. If wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
- if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].
Returns
Returns a tuple of two items:
- A dictionary of pandas Series objects, where the keys are the keys of wanted (when wanted is a list, each series description is its own key). The series.name attribute will be the ABS series-id.
- A pandas DataFrame containing the metadata for the series.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "5206.0" # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)