sdmxabs.fetch_multi

Fetch multiple datasets from the SDMX API.

  1"""Fetch multiple datasets from the SDMX API."""
  2
  3from io import StringIO
  4from typing import Unpack
  5
  6import pandas as pd
  7
  8from sdmxabs.download_cache import CacheError, GetFileKwargs, HttpError
  9from sdmxabs.fetch import fetch
 10
# --- private type alias
# (index class, frequency string when the index is a PeriodIndex, else None)
IndexInformation = tuple[type, str | None]  # (Index type, frequency if PeriodIndex)
 13
 14
 15def _validate_index_compatibility(
 16    data: pd.DataFrame, reference_index_info: IndexInformation | None
 17) -> IndexInformation:
 18    """Validate that the index of the current DataFrame is compatible with the reference index."""
 19    # establish the index information for the current DataFrame
 20    if isinstance(data.index, pd.PeriodIndex):
 21        current_index_info: IndexInformation = (type(data.index), data.index.freqstr)
 22    else:
 23        current_index_info = (type(data.index), None)
 24
 25    # if this is the first DataFrame, set the reference index info
 26    if reference_index_info is None:
 27        reference_index_info = current_index_info
 28
 29    # if this is not the first DataFrame, check for index compatibility
 30    elif current_index_info != reference_index_info:
 31        raise ValueError(
 32            f"Index mismatch: cannot mix {reference_index_info} "
 33            f"with {current_index_info}. "
 34            f"All datasets must have the same index type (e.g., all quarterly or all monthly data)."
 35        )
 36
 37    return reference_index_info
 38
 39
 40def _extract(
 41    wanted: pd.DataFrame,
 42    parameters: dict[str, str] | None,
 43    *,
 44    validate: bool = False,
 45    **kwargs: Unpack[GetFileKwargs],
 46) -> tuple[pd.DataFrame, pd.DataFrame]:  # data / metadata
 47    """Extract the data and metadata for each row in the dimensions DataFrame.
 48
 49    Args:
 50        wanted (pd.DataFrame): DataFrame containing the dimensions to fetch.
 51                               DataFrame cells with NAN values will be ignored.
 52                               The DataFrame must have a populated 'flow_id' column.
 53        validate (bool): If True, the function will validate the dimensions and values
 54                         against the ABS SDMX API codelists. Defaults to False.
 55        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
 56
 57    Returns:
 58        tuple[pd.DataFrame, pd.DataFrame]: A DataFrame with the fetched data and
 59                                        a DataFrame with the metadata.
 60
 61    Raises:
 62        ValueError: if any input data is not as expected, or if incompatible
 63                   index types are detected (e.g., mixing quarterly and monthly data).
 64
 65    Note: CacheError and HttpError are raised by the fetch function.
 66          These will be caught and reported to standard output.
 67
 68    """
 69    # --- initial setup - empty return results
 70    return_meta = {}
 71    return_data = {}
 72    counter = 0
 73    reference_index_info: IndexInformation | None = None
 74
 75    # --- loop over the rows of the wanted DataFrame
 76    for _index, row in wanted.iterrows():
 77        # --- get the arguments for the fetch (ignoring NaN values)
 78        row_dict: dict[str, str] = row.dropna().to_dict()
 79        flow_id = row_dict.pop("flow_id", "")
 80        if not flow_id:
 81            # --- if there is no flow_id, we will skip this row
 82            print(f"Skipping row with no flow_id: {row_dict}")
 83            continue
 84
 85        # --- fetch the data and meta data for each row of the selection table
 86        try:
 87            data, meta = fetch(flow_id, dims=row_dict, parameters=parameters, validate=validate, **kwargs)
 88        except (CacheError, HttpError, ValueError) as e:
 89            # --- if there is an error, we will skip this row
 90            print(f"Error fetching {flow_id} with dimensions {row_dict}: {e}")
 91            continue
 92        if data.empty or meta.empty:
 93            # --- this should not happen, but if it does, we will skip this row
 94            print(f"No data for {flow_id} with dimensions {row_dict}")
 95            continue
 96
 97        # --- validate index compatibility - including frequency compatibility for PeriodIndex
 98        reference_index_info = _validate_index_compatibility(data, reference_index_info)
 99
100        # --- manage duplicates
101        for col in data.columns:
102            counter += 1
103            save_name = col
104            if save_name in return_data:
105                save_name += f"_{counter:03d}"
106            return_data[save_name] = data[col]
107            return_meta[save_name] = meta.loc[col]
108
109    return pd.DataFrame(return_data), pd.DataFrame(return_meta).T
110
111
112# --- public function
113def fetch_multi(
114    wanted: pd.DataFrame,
115    parameters: dict[str, str] | None = None,
116    *,
117    validate: bool = False,
118    **kwargs: Unpack[GetFileKwargs],
119) -> tuple[pd.DataFrame, pd.DataFrame]:
120    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
121
122    Args:
123        wanted: A DataFrame with rows for each desired data set (of one or more series).
124                Each row should contain the necessary identifiers to fetch the dataset.
125                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
126                The 'flow_id' column is mandatory, and the rest are optional.
127                Note: the DataFrame index is not used in the fetching process.
128        parameters: A dictionary of additional parameters to pass to the fetch function.
129        validate: If True, the function will validate dimensions and values against
130                  the ABS SDMX API codelists. Defaults to False.
131        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
132
133    Returns:
134        A tuple containing two DataFrames:
135        - The first DataFrame contains the fetched data.
136        - The second DataFrame contains metadata about the fetched datasets.
137
138    Raises:
139        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.
140
141    Note:
142        CacheError and HttpError are raised by the fetch function.
143        These will be caught and reported to standard output.
144
145    Note:
146        The function validates that all datasets have compatible index types.
147        A ValueError will be raised if incompatible index types are detected
148        (e.g., mixing quarterly and monthly data).
149
150    """
151    # --- debugging output
152    verbose = kwargs.get("verbose", False)
153    if verbose:
154        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")
155
156    # --- quick sanity checks
157    if wanted.empty:
158        print("wanted DataFrame is empty, returning empty DataFrames.")
159        return pd.DataFrame(), pd.DataFrame()
160    if "flow_id" not in wanted.columns:
161        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")
162
163    # --- do the work
164    return _extract(wanted, parameters, validate=validate, **kwargs)
165
166
if __name__ == "__main__":

    def module_test() -> None:
        """Smoke-test fetch_multi() against the live ABS SDMX API."""
        # selection table: three CPI series and two national-accounts series
        wanted_text = """
        flow_id, MEASURE, INDEX, TSEST, REGION, DATA_ITEM, SECTOR, FREQ
        CPI,           3, 10001,    10,     50,         -,      -,    Q
        CPI,           3, 999902,   20,     50,         -,      -,    Q
        CPI,           3, 999903,   20,     50,         -,      -,    Q
        ANA_EXP,     DCH,      -,   20,    AUS,       FCE,    PHS,    Q
        ANA_EXP, PCT_DCH,      -,   20,    AUS,       FCE,    PHS,    Q
        """
        selection = pd.read_csv(StringIO(wanted_text), dtype=str, skipinitialspace=True)
        query = {"startPeriod": "2020-Q1", "endPeriod": "2020-Q4", "detail": "full"}
        fetched_data, _fetched_meta = fetch_multi(
            selection,
            parameters=query,
            validate=False,
            modality="prefer-url",
        )
        # four quarters of 2020 by five requested series
        expected = (4, 5)
        if fetched_data.shape != expected:
            print(f"Test FAILED: data shape {fetched_data.shape} is unexpected {expected=}.")
        else:
            print(f"Test passed: {fetched_data.shape=}.")

    module_test()
IndexInformation = tuple[type, str | None]
def fetch_multi(
    wanted: pd.DataFrame,
    parameters: dict[str, str] | None = None,
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
114def fetch_multi(
115    wanted: pd.DataFrame,
116    parameters: dict[str, str] | None = None,
117    *,
118    validate: bool = False,
119    **kwargs: Unpack[GetFileKwargs],
120) -> tuple[pd.DataFrame, pd.DataFrame]:
121    """Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
122
123    Args:
124        wanted: A DataFrame with rows for each desired data set (of one or more series).
125                Each row should contain the necessary identifiers to fetch the dataset.
126                The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
127                The 'flow_id' column is mandatory, and the rest are optional.
128                Note: the DataFrame index is not used in the fetching process.
129        parameters: A dictionary of additional parameters to pass to the fetch function.
130        validate: If True, the function will validate dimensions and values against
131                  the ABS SDMX API codelists. Defaults to False.
132        **kwargs: Additional keyword arguments passed to the underlying data fetching function.
133
134    Returns:
135        A tuple containing two DataFrames:
136        - The first DataFrame contains the fetched data.
137        - The second DataFrame contains metadata about the fetched datasets.
138
139    Raises:
140        ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame.
141
142    Note:
143        CacheError and HttpError are raised by the fetch function.
144        These will be caught and reported to standard output.
145
146    Note:
147        The function validates that all datasets have compatible index types.
148        A ValueError will be raised if incompatible index types are detected
149        (e.g., mixing quarterly and monthly data).
150
151    """
152    # --- debugging output
153    verbose = kwargs.get("verbose", False)
154    if verbose:
155        print(f"fetch_multi(): {wanted=}, {parameters=}, {validate=}, {kwargs=}")
156
157    # --- quick sanity checks
158    if wanted.empty:
159        print("wanted DataFrame is empty, returning empty DataFrames.")
160        return pd.DataFrame(), pd.DataFrame()
161    if "flow_id" not in wanted.columns:
162        raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.")
163
164    # --- do the work
165    return _extract(wanted, parameters, validate=validate, **kwargs)

Fetch multiple SDMX datasets based on a DataFrame of desired datasets.

Args:
    wanted: A DataFrame with rows for each desired data set (of one or more series).
        Each row should contain the necessary identifiers to fetch the dataset.
        The columns will be 'flow_id', plus the ABS dimensions relevant to the flow.
        The 'flow_id' column is mandatory, and the rest are optional.
        Note: the DataFrame index is not used in the fetching process.
    parameters: A dictionary of additional parameters to pass to the fetch function.
    validate: If True, the function will validate dimensions and values against
        the ABS SDMX API codelists. Defaults to False.
    **kwargs: Additional keyword arguments passed to the underlying data fetching function.

Returns:
    A tuple containing two DataFrames:
    - The first DataFrame contains the fetched data.
    - The second DataFrame contains metadata about the fetched datasets.

Raises: ValueError: If the 'flow_id' column is missing from the wanted DataFrame.

Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.

Note: The function validates that all datasets have compatible index types. A ValueError will be raised if incompatible index types are detected (e.g., mixing quarterly and monthly data).