sdmxabs.fetch_selection

Select one or more data series from the ABS Catalogue based on search criteria.

  1"""Select one or more data series from the ABS Catalogue based on search criteria."""
  2
  3import re
  4from collections.abc import Sequence
  5from enum import Enum
  6from typing import Unpack
  7
  8import pandas as pd
  9
 10from sdmxabs.download_cache import GetFileKwargs
 11from sdmxabs.fetch_multi import fetch_multi
 12from sdmxabs.flow_metadata import FlowMetaDict, code_lists, data_dimensions, data_flows
 13
 14
 15# --- some types specific to this module
 16class MatchType(Enum):
 17    """Enumeration for match types."""
 18
 19    EXACT = 1
 20    PARTIAL = 2
 21    REGEX = 3
 22
 23
 24MatchItem = tuple[str, str, MatchType]  # pattern, dimension, MatchType
 25MatchCriteria = Sequence[MatchItem]  # Sequence of tuples containing (pattern, dimension, MatchType)
 26
 27
 28# --- private functions
 29def _package_codes(codes: list[str], dimension: str, return_dict: dict[str, str]) -> None:
 30    """Package the codes into the return dictionary for a given dimension.
 31
 32    If the dimension already exists in the return_dict, we will intersect the newly
 33    identified  codes with the existing codes. If the intersection is a null set, the
 34    dimension will be removed from the return_dict (ie. the global match).
 35
 36    Note: multiple matched codes are separated by a '+' sign in the return_dict.
 37
 38    """
 39    if dimension in return_dict:
 40        previous = set(return_dict[dimension].split("+"))
 41        codes = list(previous.intersection(set(codes)))
 42        if not codes:
 43            del return_dict[dimension]  # no intersecting matches, remove dimension
 44    if codes:
 45        return_dict[dimension] = "+".join(sorted(set(codes)))
 46
 47
 48def _get_codes(
 49    code_list_dict: FlowMetaDict,
 50    pattern: str,
 51    match_type: MatchType = MatchType.PARTIAL,
 52) -> list[str]:
 53    """Obtain all codes matching the pattern."""
 54    codes = []
 55    for code, code_list in code_list_dict.items():
 56        name = code_list.get("name", "")
 57        if not name:
 58            # should not happen, but if it does, raise an error
 59            raise ValueError(f"Code '{code}' has no name in codelist")
 60        match match_type:
 61            case MatchType.EXACT:
 62                if name == pattern:
 63                    codes.append(code)
 64            case MatchType.PARTIAL:
 65                # Case-insensitive partial match
 66                if pattern.lower() in name.lower():
 67                    codes.append(code)
 68            case MatchType.REGEX:
 69                try:
 70                    if re.search(pattern, name):
 71                        codes.append(code)
 72                except re.error as e:
 73                    raise ValueError(f"Invalid regex pattern '{pattern}': {e}") from e
 74    return codes
 75
 76
 77def _validate_flow_and_dimensions(flow_id: str) -> FlowMetaDict:
 78    """Validate flow_id and return dimensions.
 79
 80    Args:
 81        flow_id (str): The ID of the data flow to validate.
 82
 83    Returns:
 84        FlowMetaDict: Dictionary containing the flow's dimensions.
 85
 86    Raises:
 87        ValueError: If the flow_id is not valid or has no dimensions.
 88
 89    """
 90    if flow_id not in data_flows():
 91        raise ValueError(f"Invalid flow_id: {flow_id}.")
 92    dimensions = data_dimensions(flow_id)
 93    if not dimensions:
 94        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")
 95    return dimensions
 96
 97
 98def _validate_dimension(dimension: str, flow_id: str, dimensions: FlowMetaDict) -> str:
 99    """Validate dimension and return codelist name if valid.
100
101    Args:
102        dimension (str): The dimension name to validate.
103        flow_id (str): The flow ID for error messages.
104        dimensions (FlowMetaDict): Dictionary containing the flow's dimensions.
105
106    Returns:
107        str: The codelist name if valid.
108
109    Raises:
110        ValueError: If dimension is not found or doesn't have a codelist.
111
112    """
113    if dimension not in dimensions:
114        raise ValueError(f"Dimension '{dimension}' not found for flow '{flow_id}'")
115
116    dim_dict = dimensions[dimension]
117    if "package" not in dim_dict or dim_dict["package"] != "codelist" or "id" not in dim_dict:
118        raise ValueError(f"Dimension '{dimension}' does not have a codelist for flow '{flow_id}'")
119
120    return dim_dict.get("id", "")
121
122
def _process_match_criteria(
    criteria: MatchCriteria, flow_id: str, dimensions: FlowMetaDict
) -> dict[str, str]:
    """Turn the match criteria into a dimension -> codes dictionary.

    Each criterion is resolved against the codelist of its dimension. A
    criterion that matches no codes is skipped, so it places no constraint on
    its dimension.

    Args:
        criteria (MatchCriteria): The (pattern, dimension, match type) tuples.
        flow_id (str): The flow ID; only used to build error messages.
        dimensions (FlowMetaDict): The flow's dimensions, keyed by dimension name.

    Returns:
        dict[str, str]: For each constrained dimension, its matched codes joined
            with '+'.

    Raises:
        ValueError: Propagated from _validate_dimension() and _get_codes().

    """
    selected: dict[str, str] = {}

    for pattern, dimension, match_type in criteria:
        codelist_id = _validate_dimension(dimension, flow_id, dimensions)
        matched = _get_codes(code_lists(codelist_id), pattern, match_type)
        if not matched:
            continue
        _package_codes(matched, dimension, selected)

    return selected
146
147
148# --- public function
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle a pattern, dimension and match type into a MatchItem tuple.

    The result can be placed in the criteria sequence given to make_wanted()
    or fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    Note:
        This is purely a convenience; writing the tuple directly is equivalent.

    """
    return pattern, dimension, match_type
169
170
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension name,
            match type) tuples; the match type is exact, partial or regex.

    Returns:
        pd.DataFrame: A one-row DataFrame of strings. It has a column for each
            dimension constrained by the criteria (matched codes joined with
            '+') and a "flow_id" column.

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a
            criterion names a dimension that is unknown or has no codelist,
            or if a regex pattern is invalid.

    Notes:
    -   The single row may select multiple data series when passed to
        fetch_multi(). It can also be concatenated with other such DataFrames
        to build a larger selection.
    -   If two criteria refer to the same dimension, only the intersection of
        their matches is kept.
    -   A criterion that matches no codes adds no constraint; no error is raised.

    """
    flow_dimensions = _validate_flow_and_dimensions(flow_id)
    row = _process_match_criteria(criteria, flow_id, flow_dimensions)

    # the flow_id travels in the same row as the dimension selections
    row["flow_id"] = flow_id
    wanted = pd.DataFrame([row])
    return wanted.astype(str)
203
204
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select data series by matching flow metadata, then fetch them.

    The criteria are converted to a selection with make_wanted(), and that
    selection is handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for
            the fetch; passed through to fetch_multi() unchanged.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key.
            Defaults to False.
        **kwargs: Additional keyword arguments, forwarded to fetch_multi(). A
            truthy "verbose" entry also makes this function print its arguments.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data
            and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)
233
234
if __name__ == "__main__":

    def test_module() -> None:
        """Smoke-test match_item() and fetch_selection() using the WPI data flow.

        Outcomes are reported with print() rather than assertions.
        NOTE(review): fetch_selection() goes through fetch_multi(), so this
        presumably needs access to the ABS service or a local cache -- confirm.
        """
        # --- test match_item()
        item = match_item("Australia", "REGION", MatchType.EXACT)
        if item == ("Australia", "REGION", MatchType.EXACT):
            print("Test passed, match_item() works as expected.")
        else:
            print(f"Test failed: {item}")

        # --- specify a selection from the Wage Price Index (WPI) data flow
        mat_criteria = [
            match_item("Australia", "REGION", MatchType.EXACT),
            match_item(
                "Percentage change from corresponding quarter of previous year",
                "MEASURE",
                MatchType.EXACT,
            ),
            match_item("Total hourly rates of pay excluding bonuses", "INDEX", MatchType.PARTIAL),
            match_item("Seas|Trend", "TSEST", MatchType.REGEX),
            match_item("13-Industry aggregate", "INDUSTRY", MatchType.EXACT),
            match_item("Private and Public", "SECTOR", MatchType.EXACT),
        ]

        # --- test the selection
        expected_count = 2  # expecting two data series
        parameters = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"}
        data, meta = fetch_selection("WPI", mat_criteria, parameters=parameters, verbose=False)

        # one data column and one metadata row per selected series
        if len(data.columns) != expected_count or meta.shape[0] != expected_count:
            print(f"Test FAILED: Data columns {len(data.columns)}, Metadata rows {meta.shape[0]}")
        else:
            print("Test passed: Data and metadata have expected dimensions.")

        expected_seasonal = {"Trend", "Seasonally Adjusted"}
        if set(meta.TSEST.to_list()) != expected_seasonal:
            print(f"Test FAILED: TSEST values {meta.TSEST.to_list()}")
        else:
            print("Test passed: TSEST has expected values.")

        expected_shape = (4, 2)  # 4 quarters of data, over two series
        if data.shape != expected_shape:
            print(f"Test FAILED: Fetched data shape {data.shape=} is unexpected {expected_shape=}.")
        else:
            print("Test passed: Fetched data has expected shape.")

    test_module()
class MatchType(enum.Enum):
class MatchType(Enum):
    """The ways a search pattern can be compared with a code's name.

    A MatchType is the third element of a MatchItem tuple.
    """

    EXACT = 1  # the name must equal the pattern (case-sensitive)
    PARTIAL = 2  # the pattern is a case-insensitive substring of the name
    REGEX = 3  # the pattern is a regular expression searched for in the name

Enumeration for match types.

EXACT = <MatchType.EXACT: 1>
PARTIAL = <MatchType.PARTIAL: 2>
REGEX = <MatchType.REGEX: 3>
MatchItem = tuple[str, str, MatchType]
MatchCriteria = collections.abc.Sequence[tuple[str, str, MatchType]]
def match_item( pattern: str, dimension: str, match_type: MatchType = <MatchType.PARTIAL: 2>) -> tuple[str, str, MatchType]:
def match_item(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Bundle a pattern, dimension and match type into a MatchItem tuple.

    The result can be placed in the criteria sequence given to make_wanted()
    or fetch_selection().

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: The tuple (pattern, dimension, match_type).

    Note:
        This is purely a convenience; writing the tuple directly is equivalent.

    """
    return pattern, dimension, match_type

Create a new MatchItem for use in make_wanted() and fetch_selection().

Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.

Returns: MatchItem: A tuple of (pattern, dimension, match_type).

Note: This function is of little value. It is easier to create the tuple directly.

def make_wanted( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]]) -> pandas.core.frame.DataFrame:
def make_wanted(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build a `wanted` DataFrame for fetch_multi() by matching flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of (pattern, dimension name,
            match type) tuples; the match type is exact, partial or regex.

    Returns:
        pd.DataFrame: A one-row DataFrame of strings. It has a column for each
            dimension constrained by the criteria (matched codes joined with
            '+') and a "flow_id" column.

    Raises:
        ValueError: If the flow_id is not valid or has no dimensions, if a
            criterion names a dimension that is unknown or has no codelist,
            or if a regex pattern is invalid.

    Notes:
    -   The single row may select multiple data series when passed to
        fetch_multi(). It can also be concatenated with other such DataFrames
        to build a larger selection.
    -   If two criteria refer to the same dimension, only the intersection of
        their matches is kept.
    -   A criterion that matches no codes adds no constraint; no error is raised.

    """
    flow_dimensions = _validate_flow_and_dimensions(flow_id)
    row = _process_match_criteria(criteria, flow_id, flow_dimensions)

    # the flow_id travels in the same row as the dimension selections
    row["flow_id"] = flow_id
    wanted = pd.DataFrame([row])
    return wanted.astype(str)

Build a wanted DataFrame for use by fetch_multi() by matching flow metadata.

Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern, dimension name, and match-type (exact, partial, or regex).

Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().

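Raises: ValueError: If the flow_id is not valid or has no dimensions, if a criterion names a dimension that is unknown or has no codelist, or if a regex pattern is invalid. A criterion that matches no codes does not raise; it simply adds no constraint.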

Notes:

  • Builds a one-row DataFrame. That single row may select multiple data series when passed to fetch_multi(). It can also be concatenated with other such DataFrames to build a larger selection.
  • If two match elements refer to the same dimension, only the intersection of the matches will be returned.
def fetch_selection( flow_id: str, criteria: Sequence[tuple[str, str, MatchType]], parameters: dict[str, str] | None = None, *, validate: bool = False, **kwargs: Unpack[sdmxabs.GetFileKwargs]) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select data series by matching flow metadata, then fetch them.

    The criteria are converted to a selection with make_wanted(), and that
    selection is handed to fetch_multi().

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.
        parameters (dict[str, str] | None, optional): Additional parameters for
            the fetch; passed through to fetch_multi() unchanged.
        validate (bool, optional): If True, validate the selection against the
            flow's required dimensions when generating the URL key.
            Defaults to False.
        **kwargs: Additional keyword arguments, forwarded to fetch_multi(). A
            truthy "verbose" entry also makes this function print its arguments.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data
            and metadata.

    """
    if kwargs.get("verbose", False):
        print(f"fetch_selection(): {flow_id=} {criteria=} {parameters=} {validate=} {kwargs=}")

    wanted = make_wanted(flow_id, criteria)
    return fetch_multi(wanted, parameters, validate=validate, **kwargs)

Fetch data based on a selection criteria for items.

Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data. parameters (dict[str, str] | None, optional): Additional parameters for the fetch. validate (bool, optional): If True, validate the selection against the flow's required dimensions when generating the URL key. Defaults to False. **kwargs: Additional keyword arguments for the fetch_multi function.

Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.