sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

# --- re-export the public API from the implementation submodules
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .fetch import fetch
from .fetch_multi import fetch_multi
from .flow_metadata import code_lists, data_dimensions, data_flows
from .select_items import MatchCriteria, MatchItem, MatchType, fetch_selection, match_criterion, select_items

# --- version and author
# The installed distribution metadata supplies the version; fall back to a
# placeholder when the package has not been installed (e.g. editable/dev use).
try:
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "GetFileKwargs",
    "HttpError",
    "MatchCriteria",
    "MatchItem",
    "MatchType",
    "ModalityType",
    "__author__",
    "__version__",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
    "fetch_multi",
    "fetch_selection",
    "match_criterion",
    "select_items",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """Keyword arguments accepted by the acquire_url() function."""

    verbose: NotRequired[bool]
    """If True, print information about the data retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
    # NOTE(review): fetch() defaults modality to "prefer-cache" (hyphenated),
    # while the docstring above uses "prefer_cache" (underscore) -- confirm
    # which spelling ModalityType actually declares.
TypedDict for acquire_url function arguments.
A problem retrieving data from HTTP.
class MatchType(Enum):
    """How a pattern is compared against a candidate value during selection."""

    EXACT = 1  # the pattern must equal the value
    PARTIAL = 2  # the pattern may appear anywhere within the value
    REGEX = 3  # the pattern is a regular expression
Enumeration for match types.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        FlowMetaDict: A dictionary containing the codes and
        their associated key=value pairs. A "name" key should always
        be present. A "parent" key may also be present.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes: FlowMetaDict = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id", None)
        if code_id is None:
            continue  # a code without an id cannot be keyed
        elements: dict[str, str] = {}
        name = code.find("com:Name", NAME_SPACES)
        # Guard against both a missing <Name> element and one with empty text;
        # previously an empty element produced the literal string "None".
        elements["name"] = name.text if name is not None and name.text else "(missing)"
        parent = code.find("str:Parent", NAME_SPACES)
        parent_id = ""
        if parent is not None:
            # the Ref child is looked up without a namespace prefix here,
            # matching the Enumeration/Ref lookup in data_dimensions()
            ref = parent.find("Ref", NAME_SPACES)
            if ref is not None:
                parent_id = str(ref.get("id", ""))
        elements["parent"] = parent_id
        codes[code_id] = elements

    return codes
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: FlowMetaDict: A dictionary containing the codes and their associated key=value pairs. A "name" key should always be present. A "parent" key may also be present.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dimensions and
        their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    dimensions: FlowMetaDict = {}
    for dimension in tree.findall(".//str:Dimension", NAME_SPACES):
        identifier = dimension.get("id")
        position = dimension.get("position")
        if identifier is None or position is None:
            continue  # skip dimensions missing their identifying attributes
        details = {"position": position}
        # When the dimension enumerates its values via a codelist reference,
        # fold the reference's attributes into the dimension's metadata.
        local_rep = dimension.find("str:LocalRepresentation", NAME_SPACES)
        if local_rep is not None:
            enumeration = local_rep.find("str:Enumeration/Ref", NAME_SPACES)
            if enumeration is not None:
                details.update(enumeration.attrib)
        dimensions[identifier] = details
    return dimensions
Get the data dimensions metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dimensions and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> FlowMetaDict:
    """Get the top-level metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        dict[str, dict[str, str]]: A dictionary containing the dataflow IDs
        and their metadata in key=value pairs.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    d_flows: FlowMetaDict = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes: dict[str, str] = dataflow.attrib.copy()
        if "id" not in attributes:
            continue  # an unidentified dataflow cannot be keyed
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        # Guard against both a missing <Name> element and one with empty text;
        # previously an empty element produced the literal string "None".
        attributes["name"] = (
            name_elem.text if name_elem is not None and name_elem.text else "(no name)"
        )
        d_flows[df_id] = attributes
    return d_flows
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: dict[str, dict[str, str]]: A dictionary containing the dataflow IDs and their metadata in key=value pairs.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. Currently ignored (not yet implemented).
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    # --- default to the cache unless the caller says otherwise
    # NOTE(review): "prefer-cache" (hyphen) here vs "prefer_cache" (underscore)
    # in the GetFileKwargs docs -- confirm which spelling ModalityType declares.
    kwargs.setdefault("modality", "prefer-cache")

    # --- build the dotted selection key from the requested dimensions
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )
    _not_implemented = constraints  # placeholder until constraints are supported

    # --- acquire the XML payload and unpack it into data + metadata frames
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    xml_root = acquire_xml(url, **kwargs)
    return extract(flow_id, xml_root)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
75def fetch_multi( 76 wanted: pd.DataFrame, 77 *, 78 validate: bool = False, 79 **kwargs: Unpack[GetFileKwargs], 80) -> tuple[pd.DataFrame, pd.DataFrame]: 81 """Fetch multiple SDMX datasets based on a DataFrame of desired datasets. 82 83 Args: 84 wanted: A DataFrame with rows for each desired data set (of one or more series). 85 Each row should contain the necessary identifiers to fetch the dataset. 86 The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. 87 The 'flow_id' column is mandatory, and the rest are optional. 88 Note: the DataFrame index is not used in the fetching process. 89 validate: If True, the function will validate dimensions and values against 90 the ABS SDMX API codelists. Defaults to False. 91 **kwargs: Additional keyword arguments passed to the underlying data fetching function. 92 93 Returns: 94 A tuple containing two DataFrames: 95 - The first DataFrame contains the fetched data. 96 - The second DataFrame contains metadata about the fetched datasets. 97 98 Raises: 99 ValueError: If the 'flow_id' column is missing from the `wanted` DataFrame. 100 101 Note: 102 CacheError and HttpError are raised by the fetch function. 103 These will be caught and reported to standard output. 104 105 Caution: 106 The selected data should all have the same index. You cannot mix (for example) 107 Quarterly and Monthly data in the same DataFrame. 108 109 """ 110 # --- quick sanity checks 111 if wanted.empty: 112 print("wanted DataFrame is empty, returning empty DataFrames.") 113 return pd.DataFrame(), pd.DataFrame() 114 if "flow_id" not in wanted.columns: 115 raise ValueError("The 'flow_id' column is required in the 'wanted' DataFrame.") 116 117 # --- do the work 118 return extract(wanted, validate=validate, **kwargs)
Fetch multiple SDMX datasets based on a DataFrame of desired datasets.
Args: wanted: A DataFrame with rows for each desired data set (of one or more series). Each row should contain the necessary identifiers to fetch the dataset. The columns will be 'flow_id', plus the ABS dimensions relevant to the flow. The 'flow_id' column is mandatory, and the rest are optional. Note: the DataFrame index is not used in the fetching process. validate: If True, the function will validate dimensions and values against the ABS SDMX API codelists. Defaults to False. **kwargs: Additional keyword arguments passed to the underlying data fetching function.
Returns: A tuple containing two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains metadata about the fetched datasets.
Raises:
ValueError: If the 'flow_id' column is missing from the wanted DataFrame.
Note: CacheError and HttpError are raised by the fetch function. These will be caught and reported to standard output.
Caution: The selected data should all have the same index. You cannot mix (for example) Quarterly and Monthly data in the same DataFrame.
def fetch_selection(
    flow_id: str,
    criteria: MatchCriteria,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data based on a selection criteria for items.

    Args:
        flow_id (str): The ID of the data flow to fetch.
        criteria (MatchCriteria): A sequence of match criteria to filter the data.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.

    """
    # translate the criteria into a 'wanted' frame, then fetch it
    wanted = select_items(flow_id, criteria)
    return fetch_multi(wanted)
Fetch data based on a selection criteria for items.
Args: flow_id (str): The ID of the data flow to fetch. criteria (MatchCriteria): A sequence of match criteria to filter the data.
Returns: tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the fetched data and metadata.
def match_criterion(
    pattern: str,
    dimension: str,
    match_type: MatchType = MatchType.PARTIAL,
) -> MatchItem:
    """Create a new match criterion for use in selection.

    Args:
        pattern (str): The pattern to match.
        dimension (str): The dimension to match against.
        match_type (MatchType, optional): The type of match to perform.
            Defaults to MatchType.PARTIAL.

    Returns:
        MatchItem: A tuple representing the match criterion.

    """
    return (pattern, dimension, match_type)
Create a new match criterion for use in selection.
Args: pattern (str): The pattern to match. dimension (str): The dimension to match against. match_type (MatchType, optional): The type of match to perform. Defaults to MatchType.PARTIAL.
Returns: MatchItem: A tuple representing the match criterion.
def select_items(
    flow_id: str,
    criteria: MatchCriteria,
) -> pd.DataFrame:
    """Build the 'wanted' DataFrame for use by fetch_multi() by matching data flow metadata.

    Args:
        flow_id (str): The ID of the data flow to select items from.
        criteria (MatchCriteria): A sequence of tuples containing the pattern
            to match, the dimension to match against, and the match type
            (exact, partial, or regex).

    Returns:
        pd.DataFrame: A DataFrame containing the selected items, which can be dropped
        into the call of the function fetch_multi().

    Raises:
        ValueError: If the flow_id is not valid, or if it has no dimensions.
            (Criteria that match nothing do not raise; they simply add no
            restriction for that dimension.)

    Notes:
        - Should build a one line DataFrame. This Frame may select multiple data series,
          when passed to fetch_multi. It also can be concatenated with other DataFrames
          to build a larger selection.
        - If two match elements refer to the same dimension, only the intersection
          of the matches will be returned.

    """
    # --- some sanity checks
    if flow_id not in data_flows():
        raise ValueError(f"Invalid flow_id: {flow_id}.")
    dimensions = data_dimensions(flow_id)
    if not dimensions:
        raise ValueError(f"No dimensions found for flow_id: {flow_id}.")

    # --- build the dimension -> "code+code+..." selection dictionary
    return_dict: dict[str, str] = {}
    for pattern, dimension, match_type in criteria:
        if dimension not in dimensions:
            print(f"Dimension '{dimension}' not found for flow '{flow_id}'; (skipping)")
            continue
        dim_dict = dimensions[dimension]
        code_list_dict = get_code_list_dict(dimension, dim_dict)
        if not code_list_dict:
            continue

        codes = get_codes(code_list_dict, pattern, match_type)

        # --- combine (as an intersection) with previous matches for this dimension
        if dimension in return_dict:
            previous = set(return_dict[dimension].split("+"))
            codes = list(previous & set(codes))
            if not codes:
                del return_dict[dimension]  # empty intersection: drop the restriction
        if codes:
            # sorted() makes the joined key deterministic (raw set order is
            # arbitrary), keeping generated selections stable across runs.
            return_dict[dimension] = "+".join(sorted(set(codes)))

    # --- return a one-row DataFrame
    return_dict["flow_id"] = flow_id
    return pd.DataFrame([return_dict])
Build the 'wanted' Dataframe for use by fetch_multi() by matching data flow metadata.
Args: flow_id (str): The ID of the data flow to select items from. criteria (MatchCriteria): A sequence of tuples containing the pattern to match, the dimension to match against, and the match type (exact, partial, or regex).
Returns: pd.DataFrame: A DataFrame containing the selected items, which can be dropped into the call of the function fetch_multi().
Raises: ValueError: If the flow_id is not valid or if no items match the criteria.
Notes:
- Should build a one line DataFrame. This Frame may select multiple data series, when passed to fetch_multi. It also can be concatenated with other DataFrames to build a larger selection.
- If two match elements refer to the same dimension, only the intersection of the matches will be returned.