sdmxabs.select_items

Select items from the ABS Catalogue based on search criteria.

  1"""Select items from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6
  7import pandas as pd
  8
  9from sdmxabs.fetch_multi import fetch_multi
 10from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows
 11
 12
 13# --- some types specific to this module
class MatchType(Enum):
    """Enumeration of the ways a pattern can be compared with a code's name.

    The comparison itself is carried out by get_codes().
    """

    EXACT = 1  # the name must equal the pattern
    PARTIAL = 2  # the pattern must occur somewhere within the name
    REGEX = 3  # re.match(pattern, name) must succeed (anchored at the start of the name)
 20
 21
# A single criterion: (pattern, dimension, match_type), as built by match_criterion().
MatchItem = tuple[str, str, MatchType]
# An ordered collection of criteria, as taken by select_items() and fetch_selection().
MatchCriteria = Sequence[MatchItem]
 24
 25
 26# --- private functions
 27def get_codes(
 28    code_list_dict: FlowMetaDict,
 29    pattern: str,
 30    match_type: MatchType = MatchType.PARTIAL,
 31) -> list[str]:
 32    """Obtain all codes matching the pattern."""
 33    codes = []
 34    for code, code_list in code_list_dict.items():
 35        name = code_list.get("name", "")
 36        match match_type:
 37            case MatchType.EXACT:
 38                if name == pattern:
 39                    codes.append(code)
 40            case MatchType.PARTIAL:
 41                if pattern in name:
 42                    codes.append(code)
 43            case MatchType.REGEX:
 44                if re.match(pattern, name):
 45                    codes.append(code)
 46    return codes
 47
 48
 49def get_code_list_dict(dimension: str, dim_dict: dict[str, str]) -> FlowMetaDict:
 50    """Get the codelist dictionary for a given dimension."""
 51    if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
 52        print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
 53        return {}
 54    code_list_name = dim_dict.get("id")
 55    return code_lists(code_list_name)
 56
 57
 58# --- public functions
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new match criterion for use in selection.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)
 76
 77
 78def select_items(
 79    flow_id: str,
 80    criteria: MatchCriteria,
 81) -> pd.DataFrame:
 82    """Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.
 83
 84    Args:
 85        flow_id (str): The ID of the data flow to select items from.
 86        criteria (MatchElements): A sequence of tuples containing the element name,
 87            the value to match, and the match type (exact, partial, or regex).
 88
 89    Returns:
 90        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
 91            into the call of the function fetch_multi().
 92
 93    Raises:
 94        ValueError: If the flow_id is not valid or if no items match the criteria.
 95
 96    Notes:
 97    -   Should build a one line DataFrame. This Frame may select multiple data series,
 98        when passed to fetch_multi. It also can be concatenated with other DataFrames
 99        to build a larger selection.
100    -   If two match elements refer to the same dimension, only the `intersection` of the
101        matches will be returned.
102
103    """
104    # --- some sanity checks
105    if flow_id not in data_flows():
106        raise ValueError(f"Invalid flow_id: {flow_id}.")
107    dimensions = data_dimensions(flow_id)
108    if not dimensions:
109        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
110
111    # --- lets build the codelist dictionary
112    return_dict: dict[str, str] = {}
113    for pattern, dimension, match_type in criteria:
114        if dimension not in dimensions:
115            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
116            continue
117        dim_dict = dimensions[dimension]
118        code_list_dict = get_code_list_dict(dimension, dim_dict)
119        if not code_list_dict:
120            continue
121
122        codes = get_codes(code_list_dict, pattern, match_type)
123
124        # --- combine (as an intersection) with previous matches for this dimension
125        if dimension in return_dict:
126            previous = return_dict[dimension].split("+")
127            codes = list(set(previous).intersection(set(codes)))
128            if not codes:
129                del return_dict[dimension]  # no matches, remove dimension
130        if codes:
131            return_dict[dimension] = "+".join(list(set(codes)))
132
133    # --- return a DataFrame
134    return_dict["flow_id"] = flow_id
135    return pd.DataFrame([return_dict])
136
137
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow and fetch them in a single step.

    The selection is built by select_items() and handed straight to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): The match criteria used to build the selection.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The result of fetch_multi() for the
            selection: the fetched data and its metadata.

    """
    return fetch_multi(select_items(flow_id, criteria))
157
158
159# --- quick and dirty testing
if __name__ == "__main__":
    # --- a selection from the Wage Price Index (WPI) data flow, one criterion per dimension
    mat_criteria = [
        match_criterion("Australia", "REGION", MatchType.EXACT),
        match_criterion(
            "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT
        ),
        match_criterion("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
        match_criterion("Seas|Trend", "TSEST", MatchType.REGEX),
        match_criterion("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
        match_criterion("Private and Public", "SECTOR", MatchType.EXACT),
    ]

    # --- show the selection frame, then fetch with it
    print(select_items("WPI", mat_criteria))
    data, meta = fetch_selection("WPI", mat_criteria)
    print(f"Number of data series: {len(meta)}")  # should be 2
    print(meta.T)  # should have the Trend and Seasonally Adjusted series
class MatchType(enum.Enum):
class MatchType(Enum):
    """Enumeration of the ways a pattern can be compared with a code's name.

    The comparison itself is carried out by get_codes().
    """

    EXACT = 1  # the name must equal the pattern
    PARTIAL = 2  # the pattern must occur somewhere within the name
    REGEX = 3  # re.match(pattern, name) must succeed (anchored at the start of the name)

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def get_codes( code_list_dict: dict[str, dict[str, str]], pattern: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> list[str]:
def get_codes(
    code_list_dict: FlowMetaDict,
    pattern: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> list[str]:
    """Return the codes whose "name" entry satisfies the pattern.

    Args:
        code_list_dict (FlowMetaDict): Mapping of code to that code's metadata; the
            metadata's "name" entry (empty string if absent) is what gets tested.
        pattern (str): Text, or regular expression, to compare against each name.
        match_type (MatchType, optional): How to compare. Defaults to MatchType.PARTIAL.

    Returns:
        list[str]: Matching codes, in the iteration order of code_list_dict. Empty if
            nothing matches or match_type is not a recognised MatchType member.

    """

    def is_match(name: str) -> bool:
        """Apply the requested comparison to one name."""
        if match_type == MatchType.EXACT:
            return name == pattern
        if match_type == MatchType.PARTIAL:
            return pattern in name
        if match_type == MatchType.REGEX:
            # re.match only matches at the beginning of the name
            return re.match(pattern, name) is not None
        return False

    return [code for code, details in code_list_dict.items() if is_match(details.get("name", ""))]

Obtain all codes matching the pattern.

def get_code_list_dict(dimension: str, dim_dict: dict[str, str]) -> dict[str, dict[str, str]]:
def get_code_list_dict(dimension: str, dim_dict: dict[str, str]) -> FlowMetaDict:
    """Look up the codelist attached to a dimension.

    Args:
        dimension (str): Name of the dimension; used only in the skip message.
        dim_dict (dict[str, str]): Metadata for the dimension. A codelist is
            recognised when "package" is "codelist" and an "id" entry is present.

    Returns:
        FlowMetaDict: The result of code_lists() for the dimension's "id", or an
            empty dictionary (after printing a message) when there is no codelist.

    """
    has_codelist = dim_dict.get("package") == "codelist" and "id" in dim_dict
    if has_codelist:
        return code_lists(dim_dict["id"])

    print(f"Dimension '{dimension}' does not have a codelist; (skipping)")
    return {}

Get the codelist dictionary for a given dimension.

def match_criterion( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new match criterion for use in selection.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)

Create a new match criterion for use in selection.

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple (pattern, dimension, match_type) representing the match criterion.

def select_items( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
 79def select_items(
 80    flow_id: str,
 81    criteria: MatchCriteria,
 82) -> pd.DataFrame:
 83    """Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.
 84
 85    Args:
 86        flow_id (str): The ID of the data flow to select items from.
 87        criteria (MatchElements): A sequence of tuples containing the element name,
 88            the value to match, and the match type (exact, partial, or regex).
 89
 90    Returns:
 91        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
 92            into the call of the function fetch_multi().
 93
 94    Raises:
 95        ValueError: If the flow_id is not valid or if no items match the criteria.
 96
 97    Notes:
 98    -   Should build a one line DataFrame. This Frame may select multiple data series,
 99        when passed to fetch_multi. It also can be concatenated with other DataFrames
100        to build a larger selection.
101    -   If two match elements refer to the same dimension, only the `intersection` of the
102        matches will be returned.
103
104    """
105    # --- some sanity checks
106    if flow_id not in data_flows():
107        raise ValueError(f"Invalid flow_id: {flow_id}.")
108    dimensions = data_dimensions(flow_id)
109    if not dimensions:
110        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
111
112    # --- lets build the codelist dictionary
113    return_dict: dict[str, str] = {}
114    for pattern, dimension, match_type in criteria:
115        if dimension not in dimensions:
116            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
117            continue
118        dim_dict = dimensions[dimension]
119        code_list_dict = get_code_list_dict(dimension, dim_dict)
120        if not code_list_dict:
121            continue
122
123        codes = get_codes(code_list_dict, pattern, match_type)
124
125        # --- combine (as an intersection) with previous matches for this dimension
126        if dimension in return_dict:
127            previous = return_dict[dimension].split("+")
128            codes = list(set(previous).intersection(set(codes)))
129            if not codes:
130                del return_dict[dimension]  # no matches, remove dimension
131        if codes:
132            return_dict[dimension] = "+".join(list(set(codes)))
133
134    # --- return a DataFrame
135    return_dict["flow_id"] = flow_id
136    return pd.DataFrame([return_dict])

Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or if no dimensions are found for it.

Notes:

  • Builds a one-row DataFrame. This frame may select multiple data series when passed to fetch_multi. It can also be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow and fetch them in a single step.

    The selection is built by select_items() and handed straight to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): The match criteria used to build the selection.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: The result of fetch_multi() for the
            selection: the fetched data and its metadata.

    """
    return fetch_multi(select_items(flow_id, criteria))

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.