sdmxabs.fetch_selection

Select one or more data series from the ABS Catalogue based on search criteria.

  1"""Select one or more data series from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6from typing import Unpack
  7
  8import pandas as pd
  9
 10from sdmxabs.download_cache import GetFileKwargs
 11from sdmxabs.fetch_multi import fetch_multi
 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows
 13
 14
 15# --- some types specific to this module
 16class MatchType(Enum):
 17    """Enumeration for match types."""
 18
 19    EXACT = 1
 20    PARTIAL = 2
 21    REGEX = 3
 22
 23
 24MatchItem = tuple[str, str, MatchType]  # pattern, dimension, MatchType
 25MatchCriteria = Sequence[MatchItem]  # Sequence of tuples containing (pattern, dimension, MatchType)
 26
 27
 28# --- private functions
 29def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
 30    """Package the codes into the return dictionary for a given dimension.
 31
 32    If the dimension already exists in the return_dict, we will intersect the newly
 33    identified  codes with the existing codes. If the intersection is a null set, the
 34    dimension will be removed from the return_dict (ie. the global match).
 35
 36    Note: multiple matched codes are separated by a '+' sign in the return_dict.
 37
 38    """
 39    if dimension in return_dict:
 40        previous = set(return_dict[dimension].split("+"))
 41        codes = list(previous.intersection(set(codes)))
 42        if not codes:
 43            del return_dict[dimension]  # no intersecting matches, remove dimension
 44    if codes:
 45        return_dict[dimension] = "+".join(sorted(set(codes)))
 46
 47
 48def _get_codes(
 49    code_list_dict: FlowMetaDict,
 50    pattern: str,
 51    match_type: MatchType = MatchType.PARTIAL,
 52) -> list[str]:
 53    """Obtain all codes matching the pattern."""
 54    codes = []
 55    for code, code_list in code_list_dict.items():
 56        name = code_list.get("name", "")
 57        if not name:
 58            # should not happen, but if it does, raise an error
 59            raise ValueError(f"Code '{code}' has no name in codelist")
 60        match match_type:
 61            case MatchType.EXACT:
 62                if name == pattern:
 63                    codes.append(code)
 64            case MatchType.PARTIAL:
 65                # Case-insensitive partial match
 66                if pattern.lower() in name.lower():
 67                    codes.append(code)
 68            case MatchType.REGEX:
 69                try:
 70                    if re.search(pattern, name):
 71                        codes.append(code)
 72                except re.error as e:
 73                    raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
 74    return codes
 75
 76
 77def _validate_flow_and_dimensions(flow_id: str) -> FlowMetaDict:
 78    """Validate flow_id and return dimensions.
 79
 80    Args:
 81        flow_id (str): The ID of the data flow to validate.
 82
 83    Returns:
 84        FlowMetaDict: Dictionary containing the flow's dimensions.
 85
 86    Raises:
 87        ValueError: If the flow_id is not valid or has no dimensions.
 88
 89    """
 90    if flow_id not in data_flows():
 91        raise ValueError(f"Invalid flow_id: {flow_id}.")
 92    dimensions = data_dimensions(flow_id)
 93    if not dimensions:
 94        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
 95    return dimensions
 96
 97
 98def _validate_dimension(dimension: str, flow_id: str, dimensions: FlowMetaDict) -> str:
 99    """Validate dimension and return codelist name if valid.
100
101    Args:
102        dimension (str): The dimension name to validate.
103        flow_id (str): The flow ID for error messages.
104        dimensions (FlowMetaDict): Dictionary containing the flow's dimensions.
105
106    Returns:
107        str: The codelist name if valid.
108
109    Raises:
110        ValueError: If dimension is not found or doesn't have a codelist.
111
112    """
113    if dimension not in dimensions:
114        raise ValueError(f"Dimension '{dimension}' not found for flow '{flow_id}'")
115
116    dim_dict = dimensions[dimension]
117    if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
118        raise ValueError(f"Dimension '{dimension}' does not have a codelist for flow '{flow_id}'")
119
120    return dim_dict.get("id", "")
121
122
def _process_match_criteria(
    criteria: MatchCriteria, flow_id: str, dimensions: FlowMetaDict
) -> dict[str, str]:
    """Turn the match criteria into a mapping of dimension name to selected codes.

    Args:
        criteria (MatchCriteria): The (pattern, dimension, match type) items to apply.
        flow_id (str): The flow ID (only used in error messages).
        dimensions (FlowMetaDict): The flow's dimensions, keyed by dimension name.

    Returns:
        dict[str, str]: For each constrained dimension, its '+' separated codes.

    """
    selected: dict[str, str] = {}

    for criterion in criteria:
        pattern, dimension, match_type = criterion
        code_list_name = _validate_dimension(dimension, flow_id, dimensions)
        matches = _get_codes(code_lists(code_list_name), pattern, match_type)
        if not matches:
            # a criterion that matches no codes adds no constraint
            continue
        _package_codes(matches, dimension, selected)

    return selected
146
147
148# --- public function
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)
166
167
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a criterion
            names a dimension that is not in the flow or has no codelist, or if a
            regex pattern is invalid.

    Notes:
    -   Should build a one line DataFrame. This Frame may select multiple data series,
        when passed to fetch_multi. It also can be concatenated with other DataFrames
        to build a larger selection.
    -   If two match elements refer to the same dimension, only the `intersection` of the
        matches will be returned. If that intersection is empty, the dimension is
        left unconstrained.
    -   A match element that matches no codes is ignored; it does not raise an error.

    """
    dimensions = _validate_flow_and_dimensions(flow_id)
    result_dict = _process_match_criteria(criteria, flow_id, dimensions)

    # Add flow_id and return as DataFrame
    result_dict["flow_id"] = flow_id
    return pd.DataFrame([result_dict]).astype(str)
200
201
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow using match criteria, then fetch them.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments, passed through to fetch_multi().
            If "verbose" is truthy, the call arguments are printed first.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # build the selection frame, then hand it to fetch_multi()
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
230
231
if __name__ == "__main__":

    def test_module() -> None:
        """Exercise match_item() and fetch_selection() using the WPI data flow."""
        # --- test match_item()
        item = match_item("Australia", "REGION", MatchType.EXACT)
        if item == ("Australia", "REGION", MatchType.EXACT):
            print("Test passed, match_item() works as expected.")
        else:
            print(f"Test failed: {item}")

        # --- specify a selection from the Wage Price Index (WPI) data flow
        mat_criteria = [
            match_item("Australia", "REGION", MatchType.EXACT),
            match_item(
                "Percentage change from corresponding quarter of previous year",
                "MEASURE",
                MatchType.EXACT,
            ),
            match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
            match_item("Seas|Trend", "TSEST", MatchType.REGEX),
            match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
            match_item("Private and Public", "SECTOR", MatchType.EXACT),
        ]

        # --- test the selection
        expected_count = 2  # expecting two data series
        parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"}
        data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False)

        # one data column and one metadata row per series
        counts_ok = len(data.columns) == expected_count and meta.shape[0] == expected_count
        if counts_ok:
            print("Test passed: Data and metadata have expected dimensions.")
        else:
            print(f"Test FAILED: Data columns {len(data.columns)}, Metadata rows {meta.shape[0]}")

        # the regex criterion should have selected exactly these two TSEST values
        expected_seasonal = {"Trend", "Seasonally Adjusted"}
        if set(meta.TSEST.to_list()) == expected_seasonal:
            print("Test passed: TSEST has expected values.")
        else:
            print(f"Test FAILED: TSEST values {meta.TSEST.to_list()}")

        expected_shape = (4, 2)  # 4 quarters of data, over two series
        if data.shape == expected_shape:
            print("Test passed: Fetched data has expected shape.")
        else:
            print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.")

    test_module()
class MatchType(enum.Enum):
class MatchType(Enum):
    """Enumeration of the ways a pattern can be matched against code names."""

    # The code name must equal the pattern.
    EXACT = 1
    # The pattern must appear somewhere in the code name (case-insensitive).
    PARTIAL = 2
    # The pattern is a regular expression searched for within the code name.
    REGEX = 3

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new MatchItem for use in make_wanted() and fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    """
    return (pattern, dimension, match_type)

Create a new MatchItem for use in make_wanted() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple of (pattern, dimension, match_type).

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` Dataframe for use by fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern,
            dimension name, and match-type (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
            into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a criterion
            names a dimension that is not in the flow or has no codelist, or if a
            regex pattern is invalid.

    Notes:
    -   Should build a one line DataFrame. This Frame may select multiple data series,
        when passed to fetch_multi. It also can be concatenated with other DataFrames
        to build a larger selection.
    -   If two match elements refer to the same dimension, only the `intersection` of the
        matches will be returned. If that intersection is empty, the dimension is
        left unconstrained.
    -   A match element that matches no codes is ignored; it does not raise an error.

    """
    dimensions = _validate_flow_and_dimensions(flow_id)
    result_dict = _process_match_criteria(criteria, flow_id, dimensions)

    # Add flow_id and return as DataFrame
    result_dict["flow_id"] = flow_id
    return pd.DataFrame([result_dict]).astype(str)

Build a wanted Dataframe for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

Raises: ValueError: If the flow_id is not valid or has no dimensions, if a criterion names a dimension that is not in the flow or has no codelist, or if a regex pattern is invalid.

Notes:

  • Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select items from a data flow using match criteria, then fetch them.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for the fetch.
        validate (bool, optional): If True, validate the selection against the flow's
            required dimensions when generating the URL key. Defaults to False.
        **kwargs: Additional keyword arguments, passed through to fetch_multi().
            If "verbose" is truthy, the call arguments are printed first.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    # build the selection frame, then hand it to fetch_multi()
    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.