rust_data_processing
Python bindings for the rust-data-processing Rust crate.
The native extension is built with PyO3 and maturin. Prefer the APIs exported here rather than
importing rust_data_processing._rust_data_processing directly.
"""Python bindings for the `rust-data-processing` Rust crate.

The native extension is built with PyO3 and maturin. Prefer the APIs exported here rather than
importing ``rust_data_processing._rust_data_processing`` directly.
"""

from __future__ import annotations

import json
from importlib.metadata import PackageNotFoundError, version
from typing import Any, Mapping

from . import cdc
from ._rust_data_processing import (
    DataFrame,
    DataSet,
    ExecutionEngine,
    SqlContext,
    detect_outliers_json,
    detect_outliers_markdown,
    extension_version,
    infer_schema_from_path,
    ingest_from_db,
    ingest_from_db_infer,
    ingest_from_path,
    ingest_from_path_infer,
    processing_arg_max_row,
    processing_arg_min_row,
    processing_feature_wise_mean_std,
    processing_filter,
    processing_map,
    processing_reduce,
    processing_top_k_by_frequency,
    profile_dataset_json,
    profile_dataset_markdown,
    sql_query_dataset,
    transform_apply_json,
    validate_dataset_json,
    validate_dataset_markdown,
)

# Prefer the installed distribution's metadata; fall back to the version compiled
# into the native extension (e.g. when running from a non-installed checkout).
try:
    __version__ = version("rust-data-processing")
except PackageNotFoundError:
    __version__ = extension_version()


def ingest_with_inferred_schema(path: str, options: dict[str, Any] | None = None):
    """Infer schema once, then ingest (two passes over the file; same as the Rust helper).

    Returns a ``(dataset, schema)`` pair so callers can reuse the inferred schema.
    """
    schema = infer_schema_from_path(path, options)
    return ingest_from_path(path, schema, options), schema


def transform_apply(dataset: DataSet, spec: Mapping[str, Any] | str) -> DataSet:
    """Apply a :class:`TransformSpec` given as JSON string or dict (serde shape).

    A ``str`` spec is forwarded as-is; a mapping is serialized to JSON first so the
    native layer always receives the serde wire format.
    """
    payload = spec if isinstance(spec, str) else json.dumps(spec)
    return transform_apply_json(dataset, payload)


def profile_dataset(dataset: DataSet, options: dict[str, Any] | None = None) -> dict[str, Any]:
    """Return profiling report as a dict (parsed JSON)."""
    return json.loads(profile_dataset_json(dataset, options))


def validate_dataset(dataset: DataSet, spec: Mapping[str, Any]) -> dict[str, Any]:
    """Run validation checks; return report dict (parsed JSON)."""
    return json.loads(validate_dataset_json(dataset, spec))


def detect_outliers(
    dataset: DataSet,
    column: str,
    method: Mapping[str, Any],
    options: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Outlier report as dict (parsed JSON).

    ``method`` follows the serde shape of the Rust ``OutlierMethod`` enum — TODO confirm
    against the native signature (it is passed through unserialized, unlike
    :func:`transform_apply`).
    """
    return json.loads(detect_outliers_json(dataset, column, method, options))


__all__ = [
    "DataFrame",
    "DataSet",
    "ExecutionEngine",
    "SqlContext",
    "__version__",
    "cdc",
    "detect_outliers",
    "detect_outliers_json",
    "detect_outliers_markdown",
    "extension_version",
    "ingest_from_db",
    "ingest_from_db_infer",
    "ingest_from_path",
    "ingest_from_path_infer",
    "ingest_with_inferred_schema",
    "infer_schema_from_path",
    "processing_arg_max_row",
    "processing_arg_min_row",
    "processing_feature_wise_mean_std",
    "processing_filter",
    "processing_map",
    "processing_reduce",
    "processing_top_k_by_frequency",
    "profile_dataset",
    "profile_dataset_json",
    "profile_dataset_markdown",
    "sql_query_dataset",
    "transform_apply",
    "transform_apply_json",
    "validate_dataset",
    "validate_dataset_json",
    "validate_dataset_markdown",
]
Polars-backed lazy pipeline; collect to ``DataSet`` when ready.
In-memory tabular dataset (mirrors rust_data_processing::types::DataSet).
Configurable Rayon-backed engine: parallel filter/map (Python row callbacks acquire the GIL per row),
sequential reduce, and optional on_execution_event hook.
Multi-table SQL context (register several pipeline frames, then execute).
75def detect_outliers( 76 dataset: DataSet, 77 column: str, 78 method: Mapping[str, Any], 79 options: dict[str, Any] | None = None, 80) -> dict[str, Any]: 81 """Outlier report as dict (parsed JSON).""" 82 return json.loads(detect_outliers_json(dataset, column, method, options))
Outlier report as dict (parsed JSON).
50def ingest_with_inferred_schema(path: str, options: dict[str, Any] | None = None): 51 """Infer schema once, then ingest (two passes over the file; same as the Rust helper).""" 52 schema = infer_schema_from_path(path, options) 53 return ingest_from_path(path, schema, options), schema
Infer schema once, then ingest (two passes over the file; same as the Rust helper).
65def profile_dataset(dataset: DataSet, options: dict[str, Any] | None = None) -> dict[str, Any]: 66 """Return profiling report as a dict (parsed JSON).""" 67 return json.loads(profile_dataset_json(dataset, options))
Return profiling report as a dict (parsed JSON).
56def transform_apply(dataset: DataSet, spec: Mapping[str, Any] | str) -> DataSet: 57 """Apply a :class:`TransformSpec` given as JSON string or dict (serde shape).""" 58 if isinstance(spec, str): 59 payload = spec 60 else: 61 payload = json.dumps(spec) 62 return transform_apply_json(dataset, payload)
Apply a TransformSpec given as JSON string or dict (serde shape).
70def validate_dataset(dataset: DataSet, spec: Mapping[str, Any]) -> dict[str, Any]: 71 """Run validation checks; return report dict (parsed JSON).""" 72 return json.loads(validate_dataset_json(dataset, spec))
Run validation checks; return report dict (parsed JSON).