sdmxabs.select_items

Select items from the ABS Catalogue based on search criteria.

  1"""Select items from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6from typing import Unpack
  7
  8import pandas as pd
  9
 10from sdmxabs.download_cache import GetFileKwargs
 11from sdmxabs.fetch_multi import fetch_multi
 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows
 13
 14
 15# --- some types specific to this module
 16class MatchType(Enum):
 17    """Enumeration for match types."""
 18
 19    EXACT = 1
 20    PARTIAL = 2
 21    REGEX = 3
 22
 23
 24MatchItem = tuple[str, str, MatchType]  # pattern, dimension, MatchType
 25MatchCriteria = Sequence[MatchItem]  # Sequence of tuples containing (pattern, dimension, MatchType)
 26
 27
 28def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
 29    """Package the codes into the return dictionary for a given dimension.
 30
 31    If the dimension already exists in the return_dict, we will intersect the newly
 32    identified  codes with the existing codes. If the intersection is a null set, the
 33    dimension will be removed from the return_dict (ie. the global match).
 34
 35    Note: multiple matched codes are separated by a '+' sign in the return_dict.
 36
 37    """
 38    if dimension in return_dict:
 39        previous = return_dict[dimension].split("+")
 40        codes = list(set(previous).intersection(set(codes)))
 41        if not codes:
 42            del return_dict[dimension]  # no matches, remove dimension
 43    if codes:
 44        return_dict[dimension] = "+".join(list(set(codes)))
 45
 46
 47# --- private functions
 48def get_codes(
 49    code_list_dict: FlowMetaDict,
 50    pattern: str,
 51    match_type: MatchType = MatchType.PARTIAL,
 52) -> list[str]:
 53    """Obtain all codes matching the pattern."""
 54    codes = []
 55    for code, code_list in code_list_dict.items():
 56        name = code_list.get("name", "")
 57        match match_type:
 58            case MatchType.EXACT:
 59                if name == pattern:
 60                    codes.append(code)
 61            case MatchType.PARTIAL:
 62                # Case-insensitive partial match
 63                if pattern.lower() in name.lower():
 64                    codes.append(code)
 65            case MatchType.REGEX:
 66                if re.match(pattern, name):
 67                    codes.append(code)
 68    return codes
 69
 70
 71# --- public functions
 72def match_criterion(
 73    pattern: str,
 74    dimension: str,
 75    match_type: MatchType = MatchType.PARTIAL,
 76) -> MatchItem:
 77    """Create a new match criterion for use in selection.
 78
 79    Args:
 80        pattern (str): The pattern to match.
 81        dimension (str): The dimension to match against.
 82        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.EXACT.
 83
 84    Returns:
 85        MatchElement: A tuple representing the match element.
 86
 87    """
 88    return (pattern, dimension, match_type)
 89
 90
 91def select_items(
 92    flow_id: str,
 93    criteria: MatchCriteria,
 94) -> pd.DataFrame:
 95    """Build the `wanted` Dataframe for use by fetch_multi() by matching flow metadata.
 96
 97    Args:
 98        flow_id (str): The ID of the data flow to select items from.
 99        criteria (MatchElements): A sequence of tuples containing the element name,
100            the value to match, and the match type (exact, partial, or regex).
101
102    Returns:
103        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
104            into the call of the function fetch_multi().
105
106    Raises:
107        ValueError: If the flow_id is not valid or if no items match the criteria.
108
109    Notes:
110    -   Should build a one line DataFrame. This Frame may select multiple data series,
111        when passed to fetch_multi. It also can be concatenated with other DataFrames
112        to build a larger selection.
113    -   If two match elements refer to the same dimension, only the `intersection` of the
114        matches will be returned.
115
116    """
117    # --- some sanity checks
118    if flow_id not in data_flows():
119        raise ValueError(f"Invalid flow_id: {flow_id}.")
120    dimensions = data_dimensions(flow_id)
121    if not dimensions:
122        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
123
124    # --- lets build the codelist dictionary
125    return_dict: dict[str, str] = {}
126    for pattern, dimension, match_type in criteria:
127        if dimension not in dimensions:
128            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
129            continue
130        dim_dict = dimensions[dimension]
131        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
132            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
133            continue
134        code_list_name = dim_dict.get("id")
135        codes = get_codes(code_lists(code_list_name), pattern, match_type)
136        package_codes(codes, dimension, return_dict)
137
138    # --- return as a (one row) `wanted` DataFrame
139    return_dict["flow_id"] = flow_id
140    return pd.DataFrame([return_dict]).astype(str)
141
142
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow and fetch the matching data in one step.

    The criteria are first turned into a `wanted` DataFrame by select_items(),
    which is then handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Passed through to fetch_multi(). If True, validate
            the selection against the flow's required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments passed through to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The fetched data and its metadata,
            exactly as returned by fetch_multi().

    """
    return fetch_multi(select_items(flow_id, criteria), validate=validate, **kwargs)
165
166
167# --- quick and dirty testing
if __name__ == "__main__":
    # --- describe a selection from the Wage Price Index (WPI) data flow,
    #     one criterion per dimension
    mat_criteria = [
        match_criterion("Australia", "REGION", MatchType.EXACT),
        match_criterion(
            "Percentage change from corresponding quarter of previous year",
            "MEASURE",
            MatchType.EXACT,
        ),
        match_criterion("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
        match_criterion("Seas|Trend", "TSEST", MatchType.REGEX),
        match_criterion("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
        match_criterion("Private and Public", "SECTOR", MatchType.EXACT),
    ]

    # --- show the `wanted` frame, then fetch the data it selects
    print(select_items("WPI", mat_criteria))
    data, meta = fetch_selection("WPI", mat_criteria)
    print(f"Number of data series: {len(meta)}")  # should be 2
    print(meta.T)  # should have the Trend and Seasonally Adjusted series
class MatchType(enum.Enum):
class MatchType(Enum):
    """Ways in which a search pattern can be compared with a code's name.

    The member chosen is interpreted by get_codes().
    """

    EXACT = 1  # the name must equal the pattern (case-sensitive)
    PARTIAL = 2  # the pattern is a case-insensitive substring of the name
    REGEX = 3  # the pattern is a regular expression applied with re.match()

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
def package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
    """Record the matched codes for a dimension in the return dictionary.

    The dictionary is modified in place; nothing is returned.

    -   If the dimension is not yet in return_dict, the de-duplicated codes are stored.
    -   If the dimension is already present, only the codes common to the stored
        entry and the new codes are kept (the intersection).
    -   If no codes remain, the dimension is removed from (or never added to)
        return_dict, i.e. that dimension is left unconstrained (the global match).

    Args:
        codes (list[str]): The codes matched for this dimension.
        dimension (str): The dimension the codes belong to.
        return_dict (dict[str, str]): Maps each dimension to its matched codes.

    Note: multiple matched codes are separated by a '+' sign in the return_dict.
    They are written in sorted order, because set iteration order for strings is
    not stable between interpreter runs and the same selection should always
    produce the same string.

    """
    selected = set(codes)
    if dimension in return_dict:
        selected &= set(return_dict[dimension].split("+"))
    if selected:
        return_dict[dimension] = "+".join(sorted(selected))
    else:
        # nothing (left) to select on: drop the dimension if it was there
        return_dict.pop(dimension, None)

Package the codes into the return dictionary for a given dimension.

If the dimension already exists in the return_dict, we will intersect the newly identified codes with the existing codes. If the intersection is empty, the dimension will be removed from the return_dict, leaving that dimension unconstrained (i.e. the global match).

Note: multiple matched codes are separated by a '+' sign in the return_dict.

def get_codes( code_list_dict: dict[str, dict[str, str]], pattern: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> list[str]:
49def get_codes(
50    code_list_dict: FlowMetaDict,
51    pattern: str,
52    match_type: MatchType = MatchType.PARTIAL,
53) -> list[str]:
54    """Obtain all codes matching the pattern."""
55    codes = []
56    for code, code_list in code_list_dict.items():
57        name = code_list.get("name", "")
58        match match_type:
59            case MatchType.EXACT:
60                if name == pattern:
61                    codes.append(code)
62            case MatchType.PARTIAL:
63                # Case-insensitive partial match
64                if pattern.lower() in name.lower():
65                    codes.append(code)
66            case MatchType.REGEX:
67                if re.match(pattern, name):
68                    codes.append(code)
69    return codes

Obtain all codes matching the pattern.

def match_criterion( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new match criterion for use in selection.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The (pattern, dimension, match_type) tuple, ready to be
            collected into a MatchCriteria sequence.

    """
    return (pattern, dimension, match_type)

Create a new match criterion for use in selection.

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A (pattern, dimension, match_type) tuple representing the match criterion.

def select_items( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
 92def select_items(
 93    flow_id: str,
 94    criteria: MatchCriteria,
 95) -> pd.DataFrame:
 96    """Build the `wanted` Dataframe for use by fetch_multi() by matching flow metadata.
 97
 98    Args:
 99        flow_id (str): The ID of the data flow to select items from.
100        criteria (MatchElements): A sequence of tuples containing the element name,
101            the value to match, and the match type (exact, partial, or regex).
102
103    Returns:
104        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
105            into the call of the function fetch_multi().
106
107    Raises:
108        ValueError: If the flow_id is not valid or if no items match the criteria.
109
110    Notes:
111    -   Should build a one line DataFrame. This Frame may select multiple data series,
112        when passed to fetch_multi. It also can be concatenated with other DataFrames
113        to build a larger selection.
114    -   If two match elements refer to the same dimension, only the `intersection` of the
115        matches will be returned.
116
117    """
118    # --- some sanity checks
119    if flow_id not in data_flows():
120        raise ValueError(f"Invalid flow_id: {flow_id}.")
121    dimensions = data_dimensions(flow_id)
122    if not dimensions:
123        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
124
125    # --- lets build the codelist dictionary
126    return_dict: dict[str, str] = {}
127    for pattern, dimension, match_type in criteria:
128        if dimension not in dimensions:
129            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
130            continue
131        dim_dict = dimensions[dimension]
132        if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
133            print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
134            continue
135        code_list_name = dim_dict.get("id")
136        codes = get_codes(code_lists(code_list_name), pattern, match_type)
137        package_codes(codes, dimension, return_dict)
138
139    # --- return as a (one row) `wanted` DataFrame
140    return_dict["flow_id"] = flow_id
141    return pd.DataFrame([return_dict]).astype(str)

Build the wanted Dataframe for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples, each containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no dimensions are found for the flow.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow and fetch the matching data in one step.

    The criteria are first turned into a `wanted` DataFrame by select_items(),
    which is then handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        validate (bool, optional): Passed through to fetch_multi(). If True, validate
            the selection against the flow's required dimensions. Defaults to False.
        **kwargs: Additional keyword arguments passed through to fetch_multi().

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The fetched data and its metadata,
            exactly as returned by fetch_multi().

    """
    return fetch_multi(select_items(flow_id, criteria), validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. validate (bool, optional): If True, validate the selection against the flow's required dimensions. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.