rust_data_processing

Python bindings for the rust-data-processing Rust crate.

The native extension is built with PyO3 and maturin. Prefer the APIs exported here rather than importing rust_data_processing._rust_data_processing directly.

  1"""Python bindings for the `rust-data-processing` Rust crate.
  2
  3The native extension is built with PyO3 and maturin. Prefer the APIs exported here rather than
  4importing ``rust_data_processing._rust_data_processing`` directly.
  5"""
  6
  7from __future__ import annotations
  8
  9import json
 10from importlib.metadata import PackageNotFoundError, version
 11from typing import Any, Mapping
 12
 13from . import cdc
 14
 15from ._rust_data_processing import (
 16    DataFrame,
 17    DataSet,
 18    ExecutionEngine,
 19    SqlContext,
 20    detect_outliers_json,
 21    detect_outliers_markdown,
 22    extension_version,
 23    ingest_from_db,
 24    ingest_from_db_infer,
 25    ingest_from_path,
 26    ingest_from_path_infer,
 27    infer_schema_from_path,
 28    processing_arg_max_row,
 29    processing_arg_min_row,
 30    processing_feature_wise_mean_std,
 31    processing_filter,
 32    processing_map,
 33    processing_reduce,
 34    processing_top_k_by_frequency,
 35    profile_dataset_json,
 36    profile_dataset_markdown,
 37    sql_query_dataset,
 38    transform_apply_json,
 39    validate_dataset_json,
 40    validate_dataset_markdown,
 41)
 42
 43try:
 44    __version__ = version("rust-data-processing")
 45except PackageNotFoundError:
 46    __version__ = extension_version()
 47
 48
 49def ingest_with_inferred_schema(path: str, options: dict[str, Any] | None = None):
 50    """Infer schema once, then ingest (two passes over the file; same as the Rust helper)."""
 51    schema = infer_schema_from_path(path, options)
 52    return ingest_from_path(path, schema, options), schema
 53
 54
 55def transform_apply(dataset: DataSet, spec: Mapping[str, Any] | str) -> DataSet:
 56    """Apply a :class:`TransformSpec` given as JSON string or dict (serde shape)."""
 57    if isinstance(spec, str):
 58        payload = spec
 59    else:
 60        payload = json.dumps(spec)
 61    return transform_apply_json(dataset, payload)
 62
 63
 64def profile_dataset(dataset: DataSet, options: dict[str, Any] | None = None) -> dict[str, Any]:
 65    """Return profiling report as a dict (parsed JSON)."""
 66    return json.loads(profile_dataset_json(dataset, options))
 67
 68
 69def validate_dataset(dataset: DataSet, spec: Mapping[str, Any]) -> dict[str, Any]:
 70    """Run validation checks; return report dict (parsed JSON)."""
 71    return json.loads(validate_dataset_json(dataset, spec))
 72
 73
 74def detect_outliers(
 75    dataset: DataSet,
 76    column: str,
 77    method: Mapping[str, Any],
 78    options: dict[str, Any] | None = None,
 79) -> dict[str, Any]:
 80    """Outlier report as dict (parsed JSON)."""
 81    return json.loads(detect_outliers_json(dataset, column, method, options))
 82
 83
 84__all__ = [
 85    "DataFrame",
 86    "DataSet",
 87    "ExecutionEngine",
 88    "SqlContext",
 89    "__version__",
 90    "cdc",
 91    "detect_outliers",
 92    "detect_outliers_json",
 93    "detect_outliers_markdown",
 94    "extension_version",
 95    "ingest_from_db",
 96    "ingest_from_db_infer",
 97    "ingest_from_path",
 98    "ingest_from_path_infer",
 99    "ingest_with_inferred_schema",
100    "infer_schema_from_path",
101    "processing_arg_max_row",
102    "processing_arg_min_row",
103    "processing_feature_wise_mean_std",
104    "processing_filter",
105    "processing_map",
106    "processing_reduce",
107    "processing_top_k_by_frequency",
108    "profile_dataset",
109    "profile_dataset_json",
110    "profile_dataset_markdown",
111    "sql_query_dataset",
112    "transform_apply",
113    "transform_apply_json",
114    "validate_dataset",
115    "validate_dataset_json",
116    "validate_dataset_markdown",
117]
class DataFrame:

Polars-backed lazy pipeline; collect to [DataSet] when ready.

(The native extension does not attach docstrings to these methods; the
one-line summaries below are inferred from the method names and signatures —
confirm against the Rust sources.)

def from_dataset(ds):

Build a lazy DataFrame from an existing DataSet.

def filter_eq(self, /, column, value):

Keep only rows where ``column`` equals ``value``.

def filter_not_null(self, /, column):

Keep only rows where ``column`` is not null.

def filter_mod_eq_int64(self, /, column, modulus, equals):

Keep rows where ``column % modulus == equals`` (int64 column).

def select(self, /, columns):

Restrict the frame to the given columns.

def rename(self, /, pairs):

Rename columns according to (old, new) name pairs.

def drop(self, /, columns):

Drop the given columns from the frame.

def cast(self, /, column, to):

Cast ``column`` to the target type ``to``.

def cast_with_mode(self, /, column, to, mode):

Cast ``column`` to ``to`` using the given cast-failure handling ``mode``.

def fill_null(self, /, column, value):

Replace null entries in ``column`` with ``value``.

def with_literal(self, /, name, value):

Add a new column ``name`` holding the constant ``value``.

def multiply_f64(self, /, column, factor):

Multiply the float64 column ``column`` by ``factor``.

def add_f64(self, /, column, delta):

Add ``delta`` to the float64 column ``column``.

def with_mul_f64(self, /, name, source, factor):

Add a new column ``name`` computed as ``source * factor``.

def with_add_f64(self, /, name, source, delta):

Add a new column ``name`` computed as ``source + delta``.

def group_by(self, /, keys, aggs):

Group rows by ``keys`` and apply the given aggregations.

def join(self, /, other, left_on, right_on, how):

Join with another frame on the given key columns using join strategy ``how``.

def collect(self, /):

Execute the lazy pipeline and return the resulting DataSet.

def collect_with_schema(self, /, schema):

Execute the pipeline and return a DataSet conforming to ``schema``.

def reduce(self, /, column, op):

Reduce ``column`` using the reduction operator ``op``.

def sum(self, /, column):

Sum the values of ``column``.

def feature_wise_mean_std(self, /, columns, std_kind=None):

Compute per-column mean and standard deviation for ``columns``.

class DataSet:

In-memory tabular dataset (mirrors rust_data_processing::types::DataSet).

(Summaries below are inferred from the method names; the native extension
provides no docstrings — confirm against the Rust sources.)

def row_count(self, /):

Return the number of rows in the dataset.

def column_names(self, /):

Return the names of the dataset's columns.

def schema(self, /):

Return the dataset's schema.

def to_rows(self, /):

Materialize the dataset's rows as Python objects.

class ExecutionEngine:

Configurable Rayon-backed engine: parallel filter/map (Python row callbacks acquire the GIL per row), sequential reduce, and optional on_execution_event hook.

(Summaries below are inferred from the method names; the native extension
provides no docstrings — confirm against the Rust sources.)

def filter_parallel(self, /, ds, predicate):

Filter rows of ``ds`` in parallel using the Python ``predicate`` callback.

def map_parallel(self, /, ds, mapper):

Map rows of ``ds`` in parallel using the Python ``mapper`` callback.

def reduce(self, /, ds, column, op):

Sequentially reduce ``column`` of ``ds`` using operator ``op``.

def metrics_snapshot(self, /):

Return a snapshot of the engine's execution metrics.

class SqlContext:

Multi-table SQL context (register several pipeline frames, then execute).

(Summaries below are inferred from the method names; the native extension
provides no docstrings — confirm against the Rust sources.)

def register(self, /, name, df):

Register the pipeline frame ``df`` under table name ``name``.

def execute(self, /, sql):

Execute ``sql`` against the registered tables.

__version__ = '0.1.5'
def detect_outliers(dataset: DataSet, column: str, method: Mapping[str, Any], options: dict[str, Any] | None = None) -> dict[str, Any]:
75def detect_outliers(
76    dataset: DataSet,
77    column: str,
78    method: Mapping[str, Any],
79    options: dict[str, Any] | None = None,
80) -> dict[str, Any]:
81    """Outlier report as dict (parsed JSON)."""
82    return json.loads(detect_outliers_json(dataset, column, method, options))

Outlier report as dict (parsed JSON).

def detect_outliers_json(ds, column, method, options=None):
def detect_outliers_markdown(ds, column, method, options=None):
def extension_version():
def ingest_from_db(conn, query, schema):
def ingest_from_db_infer(conn, query):
def ingest_from_path(path, schema, options=None):
def ingest_from_path_infer(path, options=None):
def ingest_with_inferred_schema(path: str, options: dict[str, Any] | None = None):
50def ingest_with_inferred_schema(path: str, options: dict[str, Any] | None = None):
51    """Infer schema once, then ingest (two passes over the file; same as the Rust helper)."""
52    schema = infer_schema_from_path(path, options)
53    return ingest_from_path(path, schema, options), schema

Infer schema once, then ingest (two passes over the file; same as the Rust helper).

def infer_schema_from_path(path, options=None):
def processing_arg_max_row(ds, column):
def processing_arg_min_row(ds, column):
def processing_feature_wise_mean_std(ds, columns, std_kind=None):
def processing_filter(ds, predicate):
def processing_map(ds, mapper):
def processing_reduce(ds, column, op):
def processing_top_k_by_frequency(ds, column, k):
def profile_dataset(dataset: DataSet, options: dict[str, Any] | None = None) -> dict[str, Any]:
65def profile_dataset(dataset: DataSet, options: dict[str, Any] | None = None) -> dict[str, Any]:
66    """Return profiling report as a dict (parsed JSON)."""
67    return json.loads(profile_dataset_json(dataset, options))

Return profiling report as a dict (parsed JSON).

def profile_dataset_json(ds, options=None):
def profile_dataset_markdown(ds, options=None):
def sql_query_dataset(ds, sql):
def transform_apply(dataset: DataSet, spec: Mapping[str, Any] | str) -> DataSet:
56def transform_apply(dataset: DataSet, spec: Mapping[str, Any] | str) -> DataSet:
57    """Apply a :class:`TransformSpec` given as JSON string or dict (serde shape)."""
58    if isinstance(spec, str):
59        payload = spec
60    else:
61        payload = json.dumps(spec)
62    return transform_apply_json(dataset, payload)

Apply a TransformSpec given as JSON string or dict (serde shape).

def transform_apply_json(ds, spec_json):
def validate_dataset(dataset: DataSet, spec: Mapping[str, Any]) -> dict[str, Any]:
70def validate_dataset(dataset: DataSet, spec: Mapping[str, Any]) -> dict[str, Any]:
71    """Run validation checks; return report dict (parsed JSON)."""
72    return json.loads(validate_dataset_json(dataset, spec))

Run validation checks; return report dict (parsed JSON).

def validate_dataset_json(ds, spec):
def validate_dataset_markdown(ds, spec):