sdmxabs.fetch_selection

Select one or more data series from the ABS Catalogue based on search criteria.

  1"""Select one or more data series from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6from typing import Unpack
  7
  8import pandas as pd
  9
 10from sdmxabs.download_cache import GetFileKwargs
 11from sdmxabs.fetch_multi import fetch_multi
 12from sdmxabs.flow_metadata import (
 13    CODE_LIST_ID,
 14    FlowMetaDict,
 15    code_lists,
 16    structure_from_flow_id,
 17)
 18
 19
 20# --- some types specific to this module
 class MatchType(Enum):
     """Strategy used to compare a search pattern with the name of a code."""

     EXACT = 1  # the name must equal the pattern (case-sensitive)
     PARTIAL = 2  # the pattern must occur somewhere in the name, ignoring case
     REGEX = 3  # the pattern is a regular expression searched for in the name


 # One search criterion: (pattern, dimension name, how to compare).
 MatchItem = tuple[str, str, MatchType]
 # Any ordered collection of criteria, e.g. a list or tuple of MatchItem.
 MatchCriteria = Sequence[MatchItem]
 31
 32
 33# --- private functions
 def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
     """Record the matched codes for a dimension in return_dict (modified in place).

     The first match for a dimension is stored as found. When the dimension
     already has codes recorded, only the codes common to the earlier and the
     new match are kept. Should the two have nothing in common, the entry for
     the dimension is deleted from return_dict altogether.

     Note: codes are stored de-duplicated, sorted and joined with a '+' sign.

     """
     matched = set(codes)
     if dimension in return_dict:
         # narrow the new matches down to those that were also found earlier
         matched &= set(return_dict[dimension].split("+"))
         if not matched:
             del return_dict[dimension]  # nothing in common, drop the dimension
             return
     if matched:
         return_dict[dimension] = "+".join(sorted(matched))
 51
 52
 def _get_codes(
     code_list_dict: FlowMetaDict,
     pattern: str,
     match_type: MatchType = MatchType.PARTIAL,
 ) -> list[str]:
     """Return the codes in a code list whose name matches the pattern.

     Args:
         code_list_dict (FlowMetaDict): Maps each code to its attributes; every
             entry must provide a non-empty "name".
         pattern (str): The text, or regular expression, to look for.
         match_type (MatchType, optional): How the pattern is compared with each
             name. EXACT is a case-sensitive equality test, PARTIAL is a
             case-insensitive substring test and REGEX uses a regular
             expression search. Defaults to MatchType.PARTIAL.

     Returns:
         list[str]: The matching codes, in the iteration order of
             code_list_dict. Empty when nothing matches.

     Raises:
         ValueError: If the pattern is not a valid regular expression (REGEX
             only; reported even when the code list is empty), or if a code
             has no name.

     """
     # Build the name test once, so that per-pattern work (compiling the regex,
     # lower-casing the pattern) is not repeated for every code in the list.
     if match_type == MatchType.EXACT:

         def is_match(name: str) -> bool:
             return name == pattern

     elif match_type == MatchType.PARTIAL:
         needle = pattern.lower()

         def is_match(name: str) -> bool:
             return needle in name.lower()

     elif match_type == MatchType.REGEX:
         try:
             compiled = re.compile(pattern)
         except re.error as e:
             raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e

         def is_match(name: str) -> bool:
             return compiled.search(name) is not None

     else:

         def is_match(name: str) -> bool:
             # an unrecognised match type selects nothing
             return False

     codes = []
     for code, code_list in code_list_dict.items():
         name = code_list.get("name", "")
         if not name:
             # should not happen, but if it does, raise an error
             raise ValueError(f"Code '{code}' has no name in codelist")
         if is_match(name):
             codes.append(code)
     return codes
 80
 81
 def _process_match_criteria(criteria: MatchCriteria, structure: FlowMetaDict) -> dict[str, str]:
     """Translate match criteria into the codes selected for each dimension.

     Args:
         criteria (MatchCriteria): The (pattern, dimension, match type) items to apply.
         structure (FlowMetaDict): The data structure of the flow, keyed by dimension name.

     Returns:
         dict[str, str]: Maps each matched dimension to its '+'-joined codes. A
             criterion that matches no code contributes nothing to the result.

     Raises:
         ValueError: If a dimension is not in the structure, a pattern is empty,
             or a dimension is not backed by a codelist.

     """
     selected: dict[str, str] = {}

     for pattern, dim_name, match_type in criteria:
         # --- validate the criterion before doing any look-ups
         if dim_name not in structure:
             raise ValueError(f"Dimension '{dim_name}' not found in structure.")
         if not pattern:
             raise ValueError(f"Pattern for dimension '{dim_name}' cannot be empty.")
         dim_meta = structure[dim_name]
         has_codelist = dim_meta.get("package") == "codelist" and CODE_LIST_ID in dim_meta
         if not has_codelist:
             raise ValueError(f"Dimension '{dim_name}' does not have a codelist.")

         # --- find the matching codes and merge them into the selection
         matches = _get_codes(code_lists(dim_meta[CODE_LIST_ID]), pattern, match_type)
         if matches:
             _package_codes(matches, dim_name, selected)

     return selected
109
110
111# --- public function
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle a pattern, a dimension and a match type into a MatchItem.

    The result can be placed in the criteria passed to make_wanted() and
    fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    Note:
        This is only a convenience; writing the tuple out directly is equivalent.

    """
    return pattern, dimension, match_type
132
133
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A single-row DataFrame of strings with one column per
            matched dimension (its codes joined by '+') plus a `flow_id` column.

    Raises:
        ValueError: Propagated from processing the criteria, for example when a
            dimension is unknown, a pattern is empty, or a dimension has no codelist.

    Notes:
    -   The one row may select several data series when passed to fetch_multi().
        It can also be concatenated with other such DataFrames to build a
        larger selection.
    -   If two criteria refer to the same dimension, only the `intersection` of
        their matches is kept.
    -   A criterion that matches no codes adds no column for its dimension.

    """
    flow_structure = structure_from_flow_id(flow_id)
    codes_by_dimension = _process_match_criteria(criteria, flow_structure)

    # the flow_id travels in the same row as the dimension selections
    row = {**codes_by_dimension, "flow_id": flow_id}
    return pd.DataFrame([row]).astype(str)
166
167
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch the data series of a flow that satisfy the given match criteria.

    The criteria are first turned into a selection with make_wanted(); that
    selection is then handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the
            fetch, passed through to fetch_multi().
        validate (bool, optional): Passed through to fetch_multi(). Defaults to False.
        **kwargs: Additional keyword arguments for fetch_multi(). If `verbose` is
            truthy, the call arguments are printed before fetching.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
196
197
if __name__ == "__main__":

    def test_module() -> None:
        """Smoke-test match_item() and fetch_selection() using the WPI data flow.

        Each check reports its outcome with print(); a failed check does not raise.
        """
        # --- match_item() should hand back its arguments as a plain tuple
        wanted_item = ("Australia", "REGION", MatchType.EXACT)
        item = match_item("Australia", "REGION", MatchType.EXACT)
        if item == wanted_item:
            print("Test passed, match_item() works as expected.")
        else:
            print(f"Test failed: {item}")

        # --- describe a selection from the Wage Price Index (WPI) data flow
        mat_criteria = [
            match_item("Australia", "REGION", MatchType.EXACT),
            match_item(
                "Percentage change from corresponding quarter of previous year", "MEASURE", MatchType.EXACT
            ),
            match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
            match_item("Seas|Trend", "TSEST", MatchType.REGEX),
            match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
            match_item("Private and Public", "SECTOR", MatchType.EXACT),
        ]

        # --- fetch the selection: two data series are expected
        expected_count = 2
        parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"}
        data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False)

        # --- one data column and one metadata row per series
        n_columns, n_rows = len(data.columns), meta.shape[0]
        if n_columns != expected_count or n_rows != expected_count:
            print(f"Test FAILED: Data columns {n_columns}, Metadata rows {n_rows}")
        else:
            print("Test passed: Data and metadata have expected dimensions.")

        # --- the regex should have picked up both adjustment types
        print(meta)
        tsest_values = meta.TSEST.to_list()
        if set(tsest_values) == {"Trend", "Seasonally Adjusted"}:
            print("Test passed: TSEST has expected values.")
        else:
            print(f"Test FAILED: TSEST values {tsest_values}")

        # --- 4 quarters of data, over two series
        expected_shape = (4, 2)
        if data.shape == expected_shape:
            print("Test passed: Fetched data has expected shape.")
        else:
            print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.")

    test_module()
class MatchType(enum.Enum):
class MatchType(Enum):
    """Strategy used to compare a search pattern with the name of a code."""

    EXACT = 1  # the name must equal the pattern (case-sensitive)
    PARTIAL = 2  # the pattern must occur somewhere in the name, ignoring case
    REGEX = 3  # the pattern is a regular expression searched for in the name

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle a pattern, a dimension and a match type into a MatchItem.

    The result can be placed in the criteria passed to make_wanted() and
    fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    Note:
        This is only a convenience; writing the tuple out directly is equivalent.

    """
    return pattern, dimension, match_type

Create a new MatchItem for use in make_wanted() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple (pattern, dimension, match_type) representing the match item.

Note: This function is of little value. It is much easier to create the tuple directly.

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A single-row DataFrame of strings with one column per
            matched dimension (its codes joined by '+') plus a `flow_id` column.

    Raises:
        ValueError: Propagated from processing the criteria, for example when a
            dimension is unknown, a pattern is empty, or a dimension has no codelist.

    Notes:
    -   The one row may select several data series when passed to fetch_multi().
        It can also be concatenated with other such DataFrames to build a
        larger selection.
    -   If two criteria refer to the same dimension, only the `intersection` of
        their matches is kept.
    -   A criterion that matches no codes adds no column for its dimension.

    """
    flow_structure = structure_from_flow_id(flow_id)
    codes_by_dimension = _process_match_criteria(criteria, flow_structure)

    # the flow_id travels in the same row as the dimension selections
    row = {**codes_by_dimension, "flow_id": flow_id}
    return pd.DataFrame([row]).astype(str)

Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

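Raises: ValueError: If the flow_id is not valid, or if a criterion names an unknown dimension, has an empty pattern, targets a dimension without a codelist, or contains an invalid regex. A criterion that matches no codes is skipped rather than raising an error.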

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch the data series of a flow that satisfy the given match criteria.

    The criteria are first turned into a selection with make_wanted(); that
    selection is then handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the
            fetch, passed through to fetch_multi().
        validate (bool, optional): Passed through to fetch_multi(). Defaults to False.
        **kwargs: Additional keyword arguments for fetch_multi(). If `verbose` is
            truthy, the call arguments are printed before fetching.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.