sdmxabs
Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API.
"""Capture data from the Australian Bureau of Statistics (ABS) using the SDMX API."""

from importlib.metadata import PackageNotFoundError, version

from .data import fetch
from .download_cache import (
    CacheError,
    GetFileKwargs,
    HttpError,
    ModalityType,
)
from .metadata import code_lists, data_dimensions, data_flows

# --- version and author
try:
    # Report the installed distribution's version when the package is installed.
    __version__ = version(__name__)
except PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"

# --- establish the package contents
__all__ = [
    "CacheError",
    "GetFileKwargs",
    "HttpError",
    "ModalityType",
    "__author__",
    "__version__",
    "code_lists",
    "data_dimensions",
    "data_flows",
    "fetch",
]
A problem retrieving data from the cache.
class GetFileKwargs(TypedDict):
    """TypedDict for acquire_url function arguments."""

    verbose: NotRequired[bool]
    """If True, print information about the retrieval process."""
    modality: NotRequired[ModalityType]
    """Kind of retrieval: "prefer_cache", "prefer_url"."""
TypedDict for acquire_url function arguments.
A problem retrieving data from HTTP.
@cache
def code_lists(cl_id: str, **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the code list metadata from the ABS SDMX API.

    Args:
        cl_id (str): The ID of the code list to retrieve.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: One row per code, indexed by the code ID (the index is
        named after cl_id), with "name" and "parent" columns.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/codelist/ABS/{cl_id}", **kwargs)

    codes = {}
    for code in tree.findall(".//str:Code", NAME_SPACES):
        code_id = code.get("id")
        if code_id is None:
            continue  # a code without an id cannot be indexed
        elements = {}
        name = code.find("com:Name", NAME_SPACES)
        elements["name"] = name.text if name is not None else None
        # A code's parent (if any) is referenced indirectly via a Ref element.
        parent_id = None
        if (parent := code.find("str:Parent", NAME_SPACES)) is not None and (
            ref := parent.find("Ref", NAME_SPACES)
        ) is not None:
            parent_id = ref.get("id")
        elements["parent"] = parent_id
        codes[code_id] = elements

    return pd.DataFrame(codes).T.sort_index().rename_axis(index=cl_id)
Get the code list metadata from the ABS SDMX API.
Args: cl_id (str): The ID of the code list to retrieve. **kwargs: Additional keyword arguments passed to acquire_url().
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_dimensions(flow_id: str, **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the data dimensions metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve dimensions for.
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: One row per dimension, indexed by dimension ID, with a
        "position" column plus any attributes copied from the dimension's
        enumeration reference (when one is present).

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/datastructure/ABS/{flow_id}", **kwargs)

    dimensions = {}
    for dim in tree.findall(".//str:Dimension", NAME_SPACES):
        dim_id = dim.get("id")
        dim_pos = dim.get("position")
        if dim_id is None or dim_pos is None:
            continue  # both id and position are required to index the row
        contents = {"position": dim_pos}
        # Merge in the enumeration Ref attributes (e.g. the codelist id)
        # when the dimension has a local representation with an enumeration.
        if (lr := dim.find("str:LocalRepresentation", NAME_SPACES)) is not None and (
            enumer := lr.find("str:Enumeration/Ref", NAME_SPACES)
        ) is not None:
            contents = contents | enumer.attrib
        dimensions[dim_id] = contents
    return pd.DataFrame(dimensions).T.rename_axis(index="dimensions")
Get the data dimensions metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve dimensions for. **kwargs: Additional keyword arguments passed to acquire_url().
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
@cache
def data_flows(flow_id: str = "all", **kwargs: Unpack[GetFileKwargs]) -> pd.DataFrame:
    """Get the toplevel metadata from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the dataflow to retrieve. Defaults to "all".
        **kwargs: Additional keyword arguments passed to acquire_url().

    Returns:
        pd.DataFrame: The dataflow IDs as the index and the dataflow
        attributes (like name, etc.) as columns.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML root is found in the response.

    """
    tree = acquire_xml(f"{URL_STEM}/dataflow/ABS/{flow_id}", **kwargs)

    df = {}
    for dataflow in tree.findall(".//str:Dataflow", NAME_SPACES):
        attributes = dataflow.attrib.copy()
        if "id" not in attributes:
            continue  # cannot index a dataflow that has no id
        df_id = attributes.pop("id")
        name_elem = dataflow.find("com:Name", NAME_SPACES)
        df_name = name_elem.text if name_elem is not None else "(no name)"
        attributes["name"] = str(df_name)
        df[df_id] = attributes
    return pd.DataFrame(df).T.sort_index().rename_axis(index="dataflows")
Get the toplevel metadata from the ABS SDMX API.
Args: flow_id (str): The ID of the dataflow to retrieve. Defaults to "all". **kwargs: Additional keyword arguments passed to acquire_url().
Returns: pd.DataFrame: A DataFrame containing the dataflow IDs (as the index) and their attributes, including names.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML root is found in the response.
def fetch(
    flow_id: str,
    dims: dict[str, str] | None = None,
    constraints: dict[str, str] | None = None,  # not implemented yet
    *,
    validate: bool = False,
    **kwargs: Unpack[GetFileKwargs],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fetch data from the ABS SDMX API.

    Args:
        flow_id (str): The ID of the data flow from which to retrieve data items.
        dims (dict[str, str], optional): A dictionary of dimensions to select the
            data items. If None, the ABS fetch request will be for all data items,
            which can be slow.
        constraints (dict[str, str], optional): A dictionary of constraints to apply
            to the data items. If None, no constraints are applied.
        validate (bool): If True, print validation diagnostics for the proposed
            dimensions against the metadata requirements. Defaults to False.
        **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().

    Returns: a tuple of two DataFrames:
        - The first DataFrame contains the fetched data.
        - The second DataFrame contains the metadata.

    Raises:
        HttpError: If there is an issue with the HTTP request.
        CacheError: If there is an issue with the cache.
        ValueError: If no XML tree is found in the response.

    """
    # --- prepare to get the XML tree from the ABS SDMX API
    # data requests are cache-friendly, so default to the cache
    kwargs.setdefault("modality", "prefer_cache")
    key = build_key(
        flow_id,
        dims,
        validate=validate,
    )

    # --- get the XML tree from the ABS SDMX API
    _not_implemented = constraints
    url = f"{URL_STEM}/data/{flow_id}/{key}"
    tree = acquire_xml(url, **kwargs)

    # --- extract and return metadata and data from the XML tree
    return populate(flow_id, tree)
Fetch data from the ABS SDMX API.
Args: flow_id (str): The ID of the data flow from which to retrieve data items. dims (dict[str, str], optional): A dictionary of dimensions to select the data items. If None, the ABS fetch request will be for all data items, which can be slow. constraints (dict[str, str], optional): A dictionary of constraints to apply to the data items. If None, no constraints are applied. validate (bool): If True, print validation diagnostics for the proposed dimensions against the metadata requirements. Defaults to False. **kwargs (GetFileKwargs): Additional keyword arguments passed to acquire_xml().
Returns: a tuple of two DataFrames: - The first DataFrame contains the fetched data. - The second DataFrame contains the metadata.
Raises: HttpError: If there is an issue with the HTTP request. CacheError: If there is an issue with the cache. ValueError: If no XML tree is found in the response.