readabs.search_abs_meta

Search a DataFrame of ABS meta data using search terms.

Using a dictionary of search terms, identify the row or rows that match all of the search terms.

  1"""Search a DataFrame of ABS meta data using search terms.
  2
  3Using a dictionary of search terms, identify the row or rows that match
  4all of the search terms.
  5"""
  6
  7from typing import Any
  8
  9from pandas import DataFrame, Index
 10
 11# local imports
 12from readabs.abs_meta_data import metacol as mc
 13from readabs.read_abs_cat import read_abs_cat
 14
 15
 16def search_abs_meta(
 17    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 18    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 19    *,
 20    exact_match: bool = False,
 21    regex: bool = False,
 22    validate_unique: bool = False,  # useful safety-net if you expect only one match
 23    **kwargs: Any,  # verbose flag
 24) -> DataFrame:
 25    """Extract from the ABS meta data those rows that match the search_terms.
 26
 27    Iteratively search the meta data one search_term at a time.
 28
 29    Parameters
 30    ----------
 31    meta : DataFrame
 32        A pandas DataFrame of metadata from the ABS
 33        (via read_abs_cat() or read_abs_series()).
 34    search_terms : dict[str, str]
 35        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 36        Note: the search terms must be unique, as a dictionary cannot hold the
 37        same search term to be applied to different columns.
 38    exact_match : bool = False
 39        Whether to match using == (exact) or .str.contains() (inexact).
 40    regex : bool = False
 41        Whether to use regular expressions in the search.
 42    validate_unique : bool = False
 43        Raise a ValueError if the search result is not unique.
 44    **kwargs : Any
 45        Additional keyword arguments. The only keyword argument
 46        that is used is verbose.
 47    verbose : bool = False
 48        Print additional information while searching; which can
 49        be useful when diagnosing problems with search terms.
 50
 51    Returns
 52    -------
 53    DataFrame
 54        Returns a pandas DataFrame of matching rows (subseted from meta).
 55        Note, The index for the returned meta data will always comprise ABS
 56        series_ids. Duplicate indexes will be removed from the meta data
 57        (ie. where the same ABS series appears in more than one table, this
 58        function will only report the first match).
 59
 60    Metacol
 61    -------
 62    Because the meta data is a DataFrame, the columns can be referenced by either
 63    their full textual name, or by the short name defined in the metacol object.
 64    For example, if metacol is imported as mc, to refer to the
 65    `Data Item Description` column, the user can refer to it as mc.did.
 66
 67    Example
 68    -------
 69    ```python
 70    from readabs import metacol as mc  # alias for the ABS meta data column names
 71    from readabs import read_abs_cat, search_abs_meta
 72    cat_num = "6202.0"  # The ABS labour force survey
 73    data, meta = read_abs_cat(cat_num)
 74    search_terms = {
 75        "Unemployment rate": mc.did,  # the data item description
 76        "Persons": mc.did,
 77        "Seasonally Adjusted": mc.stype,
 78        "Percent": mc.unit,
 79        "6202001": mc.table,
 80    }
 81    rows = search_abs_meta(meta, search_terms, verbose=True)
 82    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 83    ```
 84
 85    """
 86    # get the verbose-flag from kwargs
 87    verbose = kwargs.get("verbose", False)
 88
 89    # establish the starting point
 90    meta_select = meta.copy()  # preserve the original meta data
 91    if verbose:
 92        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 93        print(f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data.")
 94
 95    # iteratively search
 96    for phrase, column in search_terms.items():
 97        if verbose:
 98            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
 99
100        pick_me = (
101            (meta_select[column] == phrase)
102            if (exact_match or column == mc.table)
103            else meta_select[column].str.contains(phrase, regex=regex)
104        )
105        meta_select = meta_select[pick_me]
106        if verbose:
107            print(f"In find_rows() have found {len(meta_select)}")
108
109    # search complete - check results - and return
110    meta_select.index = Index(meta_select[mc.id])
111    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
112
113    if verbose:
114        print(f"Final selection is {len(meta_select)} rows.")
115
116    elif len(meta_select) == 0:
117        print("Nothing selected?")
118
119    if validate_unique and len(meta_select) != 1:
120        raise ValueError("The selected meta data should only contain one row.")
121
122    return meta_select
123
124
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate a single ABS series in the metadata and report where it lives.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments. validate_unique is consumed here;
        every other keyword argument is forwarded to search_abs_meta().
    validate_unique : bool = True
        Raise a ValueError if the search result is not a single
        unique match. Note: the default is True for safety.

    Returns
    -------
    tuple[str, str, str]
        A tuple of the table, series_id and units taken from the first
        row of the search result.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```

    """
    # uniqueness checking is on unless the caller explicitly turns it off
    must_be_unique = kwargs.pop("validate_unique", True)

    matches = search_abs_meta(
        meta,
        search_terms,
        validate_unique=must_be_unique,
        **kwargs,
    )

    # report on the first (normally the only) matching row
    row = matches.iloc[0]
    return row[mc.table], row[mc.id], row[mc.unit]
188
189
if __name__ == "__main__":

    def test_search_abs_meta() -> None:
        """Smoke-test search_abs_meta() against the labour force catalogue."""
        catalogue = "6202.0"  # The ABS labour force survey
        _data, meta = read_abs_cat(catalogue)
        terms = {
            "Unemployment rate": mc.did,  # the data item description
            "Persons": mc.did,
            "Seasonally Adjusted": mc.stype,
            "Percent": mc.unit,
            "6202001": mc.table,
        }
        matched = search_abs_meta(meta, terms, verbose=True)
        print(matched)  # should have three rows : FT/PT/All Unemployment rates

    def test_find_abs_id() -> None:
        """Smoke-test find_abs_id() against the labour force catalogue."""
        catalogue = "6202.0"  # The ABS labour force survey
        _data, meta = read_abs_cat(catalogue)
        terms = {
            "Employed total ;  Persons ;": mc.did,
            "Seasonally Adjusted": mc.stype,
            "6202001": mc.table,
        }
        table, series_id, units = find_abs_id(meta, terms)
        print(f"Table: {table} Series ID: {series_id} Units: {units}")

    # run the smoke tests in the same order as before
    test_search_abs_meta()
    test_find_abs_id()
def search_abs_meta( meta: pandas.DataFrame, search_terms: dict[str, str], *, exact_match: bool = False, regex: bool = False, validate_unique: bool = False, **kwargs: Any) -> pandas.DataFrame:
 17def search_abs_meta(
 18    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 19    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 20    *,
 21    exact_match: bool = False,
 22    regex: bool = False,
 23    validate_unique: bool = False,  # useful safety-net if you expect only one match
 24    **kwargs: Any,  # verbose flag
 25) -> DataFrame:
 26    """Extract from the ABS meta data those rows that match the search_terms.
 27
 28    Iteratively search the meta data one search_term at a time.
 29
 30    Parameters
 31    ----------
 32    meta : DataFrame
 33        A pandas DataFrame of metadata from the ABS
 34        (via read_abs_cat() or read_abs_series()).
 35    search_terms : dict[str, str]
 36        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 37        Note: the search terms must be unique, as a dictionary cannot hold the
 38        same search term to be applied to different columns.
 39    exact_match : bool = False
 40        Whether to match using == (exact) or .str.contains() (inexact).
 41    regex : bool = False
 42        Whether to use regular expressions in the search.
 43    validate_unique : bool = False
 44        Raise a ValueError if the search result is not unique.
 45    **kwargs : Any
 46        Additional keyword arguments. The only keyword argument
 47        that is used is verbose.
 48    verbose : bool = False
 49        Print additional information while searching; which can
 50        be useful when diagnosing problems with search terms.
 51
 52    Returns
 53    -------
 54    DataFrame
 55        Returns a pandas DataFrame of matching rows (subseted from meta).
 56        Note, The index for the returned meta data will always comprise ABS
 57        series_ids. Duplicate indexes will be removed from the meta data
 58        (ie. where the same ABS series appears in more than one table, this
 59        function will only report the first match).
 60
 61    Metacol
 62    -------
 63    Because the meta data is a DataFrame, the columns can be referenced by either
 64    their full textual name, or by the short name defined in the metacol object.
 65    For example, if metacol is imported as mc, to refer to the
 66    `Data Item Description` column, the user can refer to it as mc.did.
 67
 68    Example
 69    -------
 70    ```python
 71    from readabs import metacol as mc  # alias for the ABS meta data column names
 72    from readabs import read_abs_cat, search_abs_meta
 73    cat_num = "6202.0"  # The ABS labour force survey
 74    data, meta = read_abs_cat(cat_num)
 75    search_terms = {
 76        "Unemployment rate": mc.did,  # the data item description
 77        "Persons": mc.did,
 78        "Seasonally Adjusted": mc.stype,
 79        "Percent": mc.unit,
 80        "6202001": mc.table,
 81    }
 82    rows = search_abs_meta(meta, search_terms, verbose=True)
 83    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 84    ```
 85
 86    """
 87    # get the verbose-flag from kwargs
 88    verbose = kwargs.get("verbose", False)
 89
 90    # establish the starting point
 91    meta_select = meta.copy()  # preserve the original meta data
 92    if verbose:
 93        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 94        print(f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data.")
 95
 96    # iteratively search
 97    for phrase, column in search_terms.items():
 98        if verbose:
 99            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
100
101        pick_me = (
102            (meta_select[column] == phrase)
103            if (exact_match or column == mc.table)
104            else meta_select[column].str.contains(phrase, regex=regex)
105        )
106        meta_select = meta_select[pick_me]
107        if verbose:
108            print(f"In find_rows() have found {len(meta_select)}")
109
110    # search complete - check results - and return
111    meta_select.index = Index(meta_select[mc.id])
112    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
113
114    if verbose:
115        print(f"Final selection is {len(meta_select)} rows.")
116
117    elif len(meta_select) == 0:
118        print("Nothing selected?")
119
120    if validate_unique and len(meta_select) != 1:
121        raise ValueError("The selected meta data should only contain one row.")
122
123    return meta_select

Extract from the ABS meta data those rows that match the search_terms.

Iteratively search the meta data one search_term at a time.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching, which can be useful when diagnosing problems with search terms.

Returns

DataFrame Returns a pandas DataFrame of matching rows (subsetted from meta). Note: the index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (ie. where the same ABS series appears in more than one table, this function will only report the first match).

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Unemployment rate": mc.did,  # the data item description
    "Persons": mc.did,
    "Seasonally Adjusted": mc.stype,
    "Percent": mc.unit,
    "6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows)  # should have three rows : FT/PT/All Unemployment rates
def find_abs_id( meta: pandas.DataFrame, search_terms: dict[str, str], **kwargs: Any) -> tuple[str, str, str]:
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate a single ABS series in the metadata and report where it lives.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments. validate_unique is consumed here;
        every other keyword argument is forwarded to search_abs_meta().
    validate_unique : bool = True
        Raise a ValueError if the search result is not a single
        unique match. Note: the default is True for safety.

    Returns
    -------
    tuple[str, str, str]
        A tuple of the table, series_id and units taken from the first
        row of the search result.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```

    """
    # uniqueness checking is on unless the caller explicitly turns it off
    must_be_unique = kwargs.pop("validate_unique", True)

    matches = search_abs_meta(
        meta,
        search_terms,
        validate_unique=must_be_unique,
        **kwargs,
    )

    # report on the first (normally the only) matching row
    row = matches.iloc[0]
    return row[mc.table], row[mc.id], row[mc.unit]

Find a unique ABS series identifier in the ABS metadata.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.

Returns

tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Employed total ;  Persons ;": mc.did,
    "Seasonally Adjusted": mc.stype,
    "6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)