readabs.read_abs_by_desc

Get specific ABS data series by searching for the ABS data item descriptions.

This module provides functionality to search and retrieve ABS data series by their descriptions rather than series IDs.

  1"""Get specific ABS data series by searching for the ABS data item descriptions.
  2
  3This module provides functionality to search and retrieve ABS data series
  4by their descriptions rather than series IDs.
  5"""
  6
  7import inspect
  8from typing import Any
  9
 10# Analytic imports
 11import pandas as pd
 12
 13# local imports
 14from readabs.abs_meta_data import metacol as mc
 15from readabs.read_abs_cat import read_abs_cat
 16from readabs.search_abs_meta import find_abs_id
 17
 18
 19# --- private functions
 20def _work_to_do(wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None) -> bool:
 21    """Check if there is any work to do."""
 22    if wanted is None or len(wanted) == 0:
 23        print("No data requested.")
 24        return False
 25    return True
 26
 27
 28def _wlist_to_wdict(wanted: list[str]) -> dict[str, str]:
 29    """Convert a list of strings to a dictionary of strings:strings.
 30
 31    Note: the keys and values are the same.
 32    Note: any duplicate elements in the list will be lost.
 33    """
 34    return {k: k for k in wanted}
 35
 36
 37def _get_search_terms(input_dict: dict[str, Any], output_dict: dict[str, str]) -> dict[str, str]:
 38    """Build a selector dictionary from the input dictionary."""
 39    search_names = {abbr: term for abbr, term in inspect.getmembers(mc) if not abbr.startswith("_")}
 40    for mc_abbr, meta_column in search_names.items():
 41        if mc_abbr in input_dict:
 42            # the selector dictionary is back-to_front
 43            # ie. {value_sought: column_name}
 44            output_dict[input_dict[mc_abbr]] = meta_column
 45    return output_dict
 46
 47
 48def _get_args(keys: list[str], input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 49    """Build a retrieval dictionary from the input dictionary."""
 50    for key in keys:
 51        if key in input_dict:
 52            output_dict[key] = input_dict[key]
 53    return output_dict
 54
 55
 56def _get_search_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 57    """Extract the search arguments from the input dictionary."""
 58    keys = ["validate_unique", "exact_match", "regex", "verbose"]
 59    return _get_args(keys, input_dict, output_dict)
 60
 61
 62def _get_retrieval_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 63    """Extract the retrieval arguments from the input dictionary."""
 64    keys = [
 65        "ignore_errors",
 66        "get_zip",
 67        "get_excel_if_no_zip",
 68        "get_excel",
 69        "cache_only",
 70        "single_excel_only",
 71        "single_zip_only",
 72        "verbose",
 73    ]
 74    return _get_args(keys, input_dict, output_dict)
 75
 76
 77def _get_item_from_str(
 78    item: str,
 79    data_dict: dict[str, pd.DataFrame],
 80    data_meta: pd.DataFrame,
 81    item_selector: dict[str, str],
 82    search_args: dict[str, Any],
 83) -> tuple[pd.Series, pd.DataFrame]:
 84    """Get a data series from the data dictionary and metadata.
 85
 86    Give the series its series-id as a name.
 87    """
 88    if not data_dict or data_meta.empty:
 89        raise ValueError(
 90            "If the wanted data is a string, a populated abs_dict " + "and abs_meta must be provided."
 91        )
 92    item_selector[item] = mc.did  # back_to_front
 93    table, series_id, units = find_abs_id(data_meta, item_selector, **search_args)
 94
 95    series = data_dict[table][series_id]
 96    series.name = series_id
 97    series_meta = data_meta.loc[
 98        (data_meta[mc.table] == table) & (data_meta[mc.id] == series_id) & (data_meta[mc.unit] == units)
 99    ]
100    return series, series_meta
101
102
def _get_item_from_dict(
    item_dict: dict[str, Any],
    data_dict: dict[str, pd.DataFrame],
    data_meta: pd.DataFrame,
    item_selector: dict[str, str],
    search_args: dict[str, Any],
    **kwargs: Any,
) -> tuple[pd.Series, pd.DataFrame]:
    """Get one series described by a dictionary of keyword arguments.

    Parameters
    ----------
    item_dict : dict
        Must contain a "did" key (the data item description). It may also
        contain metacol names to narrow the search, search options, and
        retrieval options. It must contain "cat" when no data is supplied.
        The caller's dictionary is not modified.
    data_dict : dict of str:DataFrame
        Previously retrieved ABS data. If empty, the data is retrieved.
    data_meta : DataFrame
        Metadata for ``data_dict``. If empty, the data is retrieved.
    item_selector : dict
        Selector entries to start from; extended in place.
    search_args : dict
        Search options to start from; extended in place.
    **kwargs : Any
        Retrieval options used as defaults when data must be retrieved.
        Options in ``item_dict`` take precedence over these.

    Returns
    -------
    A tuple of the selected series and its metadata rows.

    Raises
    ------
    ValueError
        If "did" is missing, or if data must be retrieved and "cat" is
        missing.
    """
    # preparation
    if "did" not in item_dict:
        raise ValueError("Each inner dictionary must contain a 'did' key.")
    # Work on a shallow copy: popping "did" from the caller's dictionary
    # would make a second call with the same input fail the check above.
    item_dict = dict(item_dict)
    item = item_dict.pop("did")
    item_selector = _get_search_terms(item_dict, item_selector)
    item_search_args = _get_search_args(item_dict, search_args)

    if not data_dict or data_meta.empty:
        # data retrieval required
        if "cat" not in item_dict:
            raise ValueError(
                "Each inner dictionary must contain a 'cat' key, "
                "if an abs_dict is not provided/empty or the "
                "abs_meta is not provided/empty."
            )
        # kwargs supply defaults; the inner dictionary overrides them
        ret_args = _get_retrieval_args(kwargs, {})
        ret_args = _get_retrieval_args(item_dict, ret_args)
        data_dict, data_meta = read_abs_cat(cat=item_dict["cat"], **ret_args)

    # series extraction based on search terms
    return _get_item_from_str(
        item=item,
        data_dict=data_dict,
        data_meta=data_meta,
        item_selector=item_selector,
        search_args=item_search_args,
    )
139
140
141# --- public functions
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if
            present, will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the
            find_abs_id() and search_abs_meta() functions:
            ["validate_unique", "exact_match", "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects. The keys are the keys of
      "wanted" (the descriptions themselves when "wanted" is a list). The
      series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.
    If nothing was requested, an empty dictionary and an empty DataFrame
    are returned.

    Raises
    ------
    TypeError
        If a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item starts from its own copy, as the helpers extend these in place
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string " + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # Concatenate once: this avoids re-copying the accumulated frame on every
    # iteration, and avoids concatenating onto an empty DataFrame.
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta
264
265
266# --- testing ---
if __name__ == "__main__":

    def test1() -> None:
        """Test case: select a list of descriptions from already-loaded data."""
        cat = "5206.0"
        table = "5206001_Key_Aggregates"
        data_dict, data_meta = read_abs_cat(cat=cat, single_excel_only=table, verbose=False)
        stype = "Seasonally Adjusted"

        # build the list of descriptions from the metadata itself
        in_table = data_meta[mc.table] == table
        is_stype = data_meta[mc.stype] == stype
        in_millions = data_meta[mc.unit].str.contains("Million")
        is_chain_volume = data_meta[mc.did].str.contains("Chain volume measures")
        matching = data_meta.loc[in_table & is_stype & in_millions & is_chain_volume]
        get_these = matching[mc.did].to_list()
        print(f"get_these: {get_these}")

        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
            abs_dict=data_dict,
            abs_meta=data_meta,
            table=table,
            stype=stype,
        )
        print(selected, selected_meta)

    def test2() -> None:
        """Test case: select named series, each retrieved from its own catalogue."""
        gdp_table = "5206001_Key_Aggregates"
        uer_table = "6202001"
        sa = "Seasonally Adjusted"

        # two series, each from a different ABS Catalogue Number
        gdp_request = {
            "cat": "5206.0",
            "table": gdp_table,
            "stype": sa,
            "did": "Gross domestic product: Chain volume measures ;",
            "single_excel_only": gdp_table,
        }
        uer_request = {
            "cat": "6202.0",
            "table": uer_table,
            "stype": sa,
            "did": "Unemployment rate ;  Persons ;",
            "single_excel_only": uer_table,
        }
        get_these = {"GDP": gdp_request, "Unemployment Rate": uer_request}
        selected, selected_meta = read_abs_by_desc(wanted=get_these)

        print(selected_meta)
        print(selected)

    # --- run the test cases in order
    test1()
    test2()
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]], **kwargs: Any) -> tuple[dict[str, pandas.Series], pandas.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if
            present, will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the
            find_abs_id() and search_abs_meta() functions:
            ["validate_unique", "exact_match", "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects. The keys are the keys of
      "wanted" (the descriptions themselves when "wanted" is a list). The
      series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.
    If nothing was requested, an empty dictionary and an empty DataFrame
    are returned.

    Raises
    ------
    TypeError
        If a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item starts from its own copy, as the helpers extend these in place
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string " + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # Concatenate once: this avoids re-copying the accumulated frame on every
    # iteration, and avoids concatenating onto an empty DataFrame.
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary.
  • if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)