dashdq
DashDQ — Data Quality for Databricks.
Workflow::
# Cell 1: open wizard, configure checks, click Save Config
config = dashdq.configure()
# Cell 2: run checks and get a DQReport
report = dashdq.run_checks(config)
# Or all-in-one:
dashdq.launch()
"""
DashDQ — Data Quality for Databricks.

Typical notebook workflow::

    # Cell 1: open wizard, configure checks, click Save Config
    config = dashdq.configure()

    # Cell 2: run checks and get a DQReport
    report = dashdq.run_checks(config)

    # Or all-in-one:
    dashdq.launch()
"""
from dashdq.suite import run_checks, table_quality_ok, DQReport
from dashdq.checks import CHECKS_REGISTRY, CheckResult

# Package metadata.
__version__ = "0.1.12"
__author__ = "Darshan Shah"
__email__ = "darshan.innovation@gmail.com"
__license__ = "Apache-2.0"
__url__ = "https://github.com/dash-libs/dash-dq"

# Names exported by ``from dashdq import *``.
__all__ = [
    "configure", "run_checks", "table_quality_ok", "launch",
    "DQReport", "CheckResult", "CHECKS_REGISTRY",
]


def configure(spark=None) -> dict:
    """Open the DashDQ wizard.

    Delegates to ``dashdq.ui.configure``. Returns a config dict that is
    filled in when the user clicks Save Configuration.
    """
    # Function-scope import: dashdq.ui is only loaded when the wizard is used.
    from dashdq.ui import configure as _open_wizard

    wizard_config = _open_wizard(spark=spark)
    return wizard_config


def launch(spark=None) -> None:
    """All-in-one entry point: open the wizard with a Run Checks button.

    Delegates to ``dashdq.ui.launch`` and passes its result through.
    """
    # Function-scope import: dashdq.ui is only loaded when the wizard is used.
    from dashdq.ui import launch as _open_launcher

    return _open_launcher(spark=spark)
def
configure(spark=None) -> dict:
def configure(spark=None) -> dict:
    """Open the DashDQ wizard.

    Delegates to ``dashdq.ui.configure``. Returns a config dict that is
    filled in when the user clicks Save Configuration.
    """
    # Function-scope import: dashdq.ui is only loaded when the wizard is used.
    from dashdq.ui import configure as _open_wizard

    wizard_config = _open_wizard(spark=spark)
    return wizard_config
Open the DashDQ wizard. Returns a config dict filled on Save Configuration.
def run_checks(config, spark=None) -> DQReport:
    """
    Execute all checks defined in config and return a DQReport.

    ``config`` can be:
      - a **dict** returned by ``dashdq.configure()``
      - a **file path** (str or ``os.PathLike``) to a JSON config saved by the wizard

    config shape::

        {
          "source":   {"table": "catalog.schema.table"},
          "metadata": {"data_owner": "", "data_steward": "", ...},   # optional
          "checks": [
            {"check_name": "expect_column_values_to_not_be_null",
             "column": "customer_id",
             "threshold_pct": 100.0,
             "params": {}},
            ...
          ],
          "output": {"types": ["delta"], "delta_table": "..."}       # optional
        }

    Behaviour notes:
      - A check whose ``check_name`` is not in ``CHECKS_REGISTRY`` is skipped
        and a warning is emitted, so a typo does not go unnoticed.
      - A check that raises is recorded with status ``"ERROR: <message>"``
        and zero row counts; the run continues with the next check.
      - Optional sections given as JSON ``null`` are treated as absent.
      - If an ``output`` block requests anything other than just
        ``"dataframe"``, the report is persisted through ``DQReport.save``.

    Raises:
        FileNotFoundError: ``config`` is a path that does not exist.
        ValueError: the config is empty.
        RuntimeError: ``spark`` was not passed and no SparkSession is active.
    """
    import os
    import warnings

    if isinstance(config, (str, os.PathLike)):
        path = str(config)
        if not os.path.exists(path):
            raise FileNotFoundError(f"DashDQ config file not found: {path}")
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(path, encoding="utf-8") as f:
            config = json.load(f)

    if not config:
        raise ValueError("Config is empty — run dashdq.configure() first and click 'Save Config'.")

    if spark is None:
        from pyspark.sql import SparkSession
        spark = SparkSession.getActiveSession()
        if spark is None:
            # getActiveSession() returns None outside a running session;
            # fail here with a clear message rather than on spark.table().
            raise RuntimeError(
                "No active SparkSession found — pass spark=<session> to run_checks()."
            )

    table = config["source"]["table"]
    df = spark.table(table)
    total_cols = len(df.columns)
    metadata = config.get("metadata") or {}
    run_ts = datetime.now().isoformat(timespec="seconds")
    tags_str = ",".join(metadata.get("tags") or [])

    # Columns touched by at least one successful column-level check.
    checked_cols: set[str] = set()
    results: list[CheckResult] = []

    for chk in config.get("checks") or []:
        name = chk["check_name"]
        col = chk.get("column", "_TABLE_LEVEL_")
        threshold = float(chk.get("threshold_pct", 100.0))
        params = chk.get("params", {})

        entry = CHECKS_REGISTRY.get(name)
        if not entry:
            warnings.warn(
                f"DashDQ: unknown check {name!r} is not in CHECKS_REGISTRY — skipped",
                stacklevel=2,
            )
            continue

        try:
            # Cross-table checks additionally receive the Spark session.
            if entry.get("cross_table"):
                total, passed, failed = entry["fn"](df, col, params, spark)
            else:
                total, passed, failed = entry["fn"](df, col, params)
            passed_pct = round(passed / total * 100, 2) if total > 0 else 0.0
            status = "PASS" if passed_pct >= threshold else "FAIL"
            # Table-level and compound checks do not count towards column coverage.
            if not entry.get("table_level") and not entry.get("compound"):
                checked_cols.add(col)
        except Exception as exc:
            # Best effort: one broken check must not abort the whole run.
            total = passed = failed = 0
            passed_pct = 0.0
            status = f"ERROR: {exc}"

        results.append(CheckResult(
            table_name=table,
            column_name=col,
            check_name=name,
            dq_dimension=entry["dimension"],
            total_rows=total,
            passed_rows=passed,
            failed_rows=failed,
            passed_pct=passed_pct,
            threshold_pct=threshold,
            status=status,
            check_params=json.dumps(params),
            run_timestamp=run_ts,
            data_owner=metadata.get("data_owner", ""),
            data_steward=metadata.get("data_steward", ""),
            business_domain=metadata.get("business_domain", ""),
            table_description=metadata.get("description", ""),
            tags=tags_str,
            columns_checked=0,          # back-filled below
            total_columns=total_cols,
            column_coverage_pct=0.0,
        ))

    # Back-fill coverage (same value for all rows — table-level metric)
    n_covered = len(checked_cols)
    coverage = round(n_covered / total_cols * 100, 2) if total_cols else 0.0
    for r in results:
        r.columns_checked = n_covered
        r.column_coverage_pct = coverage

    report = DQReport(results, config)

    # Auto-save if output block present and not dataframe-only
    output_cfg = config.get("output") or {}
    types = output_cfg.get("types") or [output_cfg.get("type", "dataframe")]
    if output_cfg and types != ["dataframe"]:
        report.save(output_cfg, spark)

    return report
Execute all checks defined in config and return a DQReport.
config can be:
- a dict returned by dashdq.configure()
- a file path (str) to a JSON config saved by the wizard
config shape::
{
"source": {"table": "catalog.schema.table"},
"metadata": {"data_owner": "", "data_steward": "", ...}, # optional
"checks": [
{"check_name": "expect_column_values_to_not_be_null",
"column": "customer_id",
"threshold_pct": 100.0,
"params": {}},
...
],
"output": {"types": ["delta"], "delta_table": "..."} # optional
}
def
table_quality_ok(config, spark=None) -> bool:
def table_quality_ok(config, spark=None) -> bool:
    """Run all configured checks and report whether the table passed.

    Returns True only when the report's table summary has
    ``overall_status == "PASS"``; any other outcome yields False.

    Useful as a gate before consuming a table::

        if dashdq.table_quality_ok(config, spark=spark):
            df = spark.table(config["source"]["table"])
            # safe to use
        else:
            raise RuntimeError("Table failed quality checks — aborting pipeline.")
    """
    summary = run_checks(config, spark=spark).table_summary()
    overall = summary.get("overall_status")
    return overall == "PASS"
Run all configured checks and return True if every check passes.
Useful as a gate before consuming a table::
if dashdq.table_quality_ok(config, spark=spark):
df = spark.table(config["source"]["table"])
# safe to use
else:
raise RuntimeError("Table failed quality checks — aborting pipeline.")
def
launch(spark=None) -> None:
def launch(spark=None) -> None:
    """All-in-one entry point: open the wizard with a Run Checks button.

    Delegates to ``dashdq.ui.launch`` and passes its result through.
    """
    # Function-scope import: dashdq.ui is only loaded when the wizard is used.
    from dashdq.ui import launch as _open_launcher

    return _open_launcher(spark=spark)
All-in-one: open wizard + Run Checks button.
class
DQReport:
class DQReport:
    """Results of one DashDQ run, with helpers to export and persist them.

    ``results`` holds one CheckResult per executed check and ``config`` is
    the config dict the run was driven by.
    """

    def __init__(self, results: "list[CheckResult]", config: dict):
        self.results = results
        self.config = config

    # ── Row-level outputs (one row per check × column) ────────────────────────

    def to_dict(self) -> list[dict]:
        """Return every result as a plain dict, in run order."""
        return [r.to_dict() for r in self.results]

    def to_spark_df(self, spark=None):
        """Return the results as a Spark DataFrame (active session if ``spark`` is None)."""
        if spark is None:
            from pyspark.sql import SparkSession
            spark = SparkSession.getActiveSession()
        return spark.createDataFrame(self.to_dict())

    def to_pandas(self):
        """Return the results as a pandas DataFrame."""
        import pandas as pd
        return pd.DataFrame(self.to_dict())

    def display(self):
        """Render the results with IPython; fall back to printing each result."""
        try:
            from IPython.display import display as ipy_display
            ipy_display(self.to_pandas())
        except Exception:
            # Deliberate best effort: outside a notebook just print the rows.
            for r in self.results:
                print(r)

    def summary(self) -> dict:
        """Return check-level counts: total, passed, failed and pass rate (%)."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r.status == "PASS")
        return {
            "total_checks": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate_pct": round(passed / total * 100, 1) if total else 0,
        }

    # ── Table-level summary (one row per table run) ───────────────────────────

    def table_summary(self) -> dict:
        """Single-row summary at table level.

        clean_rows     = estimated rows not flagged by any check (lower bound).
        overall_status = PASS only if all checks passed.

        Returns an empty dict when the report has no results.
        """
        if not self.results:
            return {}

        r0 = self.results[0]
        metadata = self.config.get("metadata") or {}

        # A check that raised is stored with total_rows == 0 and a status
        # starting with "ERROR", so read the row count from the first check
        # that actually ran; fall back to the first result if all errored.
        counted = next(
            (r for r in self.results if not str(r.status).startswith("ERROR")),
            r0,
        )
        total_rows = counted.total_rows

        total_checks = len(self.results)
        passed_checks = sum(1 for r in self.results if r.status == "PASS")
        failed_checks = total_checks - passed_checks
        overall_status = "PASS" if failed_checks == 0 else "FAIL"

        # Each check reports failed_rows independently. Summing them counts a
        # row once per check it fails, so the sum is an UPPER bound on dirty
        # rows and clean_rows is a conservative lower bound (the exact union
        # would need a join).
        total_failed_rows = sum(r.failed_rows for r in self.results)
        # Cap at total_rows to avoid negative clean counts when checks overlap
        dirty_rows = min(total_failed_rows, total_rows) if total_rows else 0
        clean_rows = max(0, total_rows - dirty_rows)
        clean_pct = round(clean_rows / total_rows * 100, 2) if total_rows else 0.0

        return {
            "table_name": r0.table_name,
            "overall_status": overall_status,
            "total_rows": total_rows,
            "clean_rows": clean_rows,
            "dirty_rows": dirty_rows,
            "clean_pct": clean_pct,
            "total_checks": total_checks,
            "passed_checks": passed_checks,
            "failed_checks": failed_checks,
            "columns_checked": r0.columns_checked,
            "total_columns": r0.total_columns,
            "column_coverage_pct": r0.column_coverage_pct,
            "run_timestamp": r0.run_timestamp,
            "data_owner": metadata.get("data_owner", ""),
            "data_steward": metadata.get("data_steward", ""),
            "business_domain": metadata.get("business_domain", ""),
            "description": metadata.get("description", ""),
            "tags": ",".join(metadata.get("tags") or []),
        }

    def to_table_summary_df(self, spark=None):
        """Spark DataFrame with one row summarising this table run."""
        if spark is None:
            from pyspark.sql import SparkSession
            spark = SparkSession.getActiveSession()
        return spark.createDataFrame([self.table_summary()])

    def table_summary_pandas(self):
        """pandas DataFrame with one row summarising this table run."""
        import pandas as pd
        return pd.DataFrame([self.table_summary()])

    def save(self, output_cfg: dict, spark=None):
        """Persist results to one or more destinations defined in output_cfg.

        Supported output types: ``"dataframe"``, ``"delta"``,
        ``"volume_json"`` and ``"volume_csv"``. A destination whose target
        (``delta_table`` / ``volume_path``) is not set is skipped with a
        printed warning. Other type names are ignored.

        Returns the Spark DataFrame of results if one was built for a
        ``dataframe`` or ``delta`` output, otherwise None.
        """
        import os

        # Support both old single-type ("type") and new multi-type ("types") format
        types = output_cfg.get("types") or ([output_cfg["type"]] if "type" in output_cfg else ["dataframe"])

        sdf = None  # built at most once and shared by dataframe/delta outputs

        for otype in types:
            if otype == "dataframe":
                if sdf is None:
                    sdf = self.to_spark_df(spark)

            elif otype == "delta":
                if sdf is None:
                    sdf = self.to_spark_df(spark)
                table = output_cfg.get("delta_table", "")
                if not table:
                    print("⚠️ delta_table not set — skipping Delta output")
                    continue
                (sdf.write.format("delta")
                    .mode("append")
                    .option("mergeSchema", "true")
                    .saveAsTable(table))
                print(f"✅ Saved to Delta table: {table}")
                # Also write the table-level summary if a summary table is configured
                summary_table = output_cfg.get("summary_delta_table", "")
                if summary_table:
                    (self.to_table_summary_df(spark)
                        .write.format("delta")
                        .mode("append")
                        .option("mergeSchema", "true")
                        .saveAsTable(summary_table))
                    print(f"✅ Saved table summary to: {summary_table}")

            elif otype in ("volume_json", "volume_csv"):
                # vol_path already contains catalog/schema from the wizard
                vol_path = (output_cfg.get("volume_path") or "").rstrip("/")
                if not vol_path:
                    # os.makedirs("") raises; skip like the Delta branch does.
                    print("⚠️ volume_path not set — skipping volume output")
                    continue
                table_name = self.config.get("source", {}).get("table", "")
                tbl = table_name.split(".")[-1] if table_name else "table"
                filename = (output_cfg.get("filename")
                            or f"dq_{tbl}_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
                ext = "json" if otype == "volume_json" else "csv"
                os.makedirs(vol_path, exist_ok=True)
                full = f"{vol_path}/{filename}.{ext}"
                pdf = self.to_pandas()
                if ext == "json":
                    pdf.to_json(full, orient="records", indent=2)
                else:
                    pdf.to_csv(full, index=False)
                print(f"✅ Saved to: {full}")
                # Summary file alongside: same dir, _summary suffix
                summary_file = f"{vol_path}/{filename}_summary.{ext}"
                spdf = self.table_summary_pandas()
                if ext == "json":
                    spdf.to_json(summary_file, orient="records", indent=2)
                else:
                    spdf.to_csv(summary_file, index=False)
                print(f"✅ Saved table summary to: {summary_file}")

        return sdf
DQReport(results: list[CheckResult], config: dict)
def
table_summary(self) -> dict:
def table_summary(self) -> dict:
    """Single-row summary at table level.

    clean_rows     = estimated rows not flagged by any check (lower bound).
    overall_status = PASS only if all checks passed.

    Returns an empty dict when the report has no results.
    """
    if not self.results:
        return {}

    r0 = self.results[0]
    metadata = self.config.get("metadata") or {}

    # A check that raised is stored with total_rows == 0 and a status
    # starting with "ERROR", so read the row count from the first check
    # that actually ran; fall back to the first result if all errored.
    counted = next(
        (r for r in self.results if not str(r.status).startswith("ERROR")),
        r0,
    )
    total_rows = counted.total_rows

    total_checks = len(self.results)
    passed_checks = sum(1 for r in self.results if r.status == "PASS")
    failed_checks = total_checks - passed_checks
    overall_status = "PASS" if failed_checks == 0 else "FAIL"

    # Each check reports failed_rows independently. Summing them counts a
    # row once per check it fails, so the sum is an UPPER bound on dirty
    # rows and clean_rows is a conservative lower bound (the exact union
    # would need a join).
    total_failed_rows = sum(r.failed_rows for r in self.results)
    # Cap at total_rows to avoid negative clean counts when checks overlap
    dirty_rows = min(total_failed_rows, total_rows) if total_rows else 0
    clean_rows = max(0, total_rows - dirty_rows)
    clean_pct = round(clean_rows / total_rows * 100, 2) if total_rows else 0.0

    return {
        "table_name": r0.table_name,
        "overall_status": overall_status,
        "total_rows": total_rows,
        "clean_rows": clean_rows,
        "dirty_rows": dirty_rows,
        "clean_pct": clean_pct,
        "total_checks": total_checks,
        "passed_checks": passed_checks,
        "failed_checks": failed_checks,
        "columns_checked": r0.columns_checked,
        "total_columns": r0.total_columns,
        "column_coverage_pct": r0.column_coverage_pct,
        "run_timestamp": r0.run_timestamp,
        "data_owner": metadata.get("data_owner", ""),
        "data_steward": metadata.get("data_steward", ""),
        "business_domain": metadata.get("business_domain", ""),
        "description": metadata.get("description", ""),
        "tags": ",".join(metadata.get("tags") or []),
    }
Single-row summary at table level.
clean_rows = rows that passed every column check applied to them.
overall_status = PASS only if all checks passed.
def
to_table_summary_df(self, spark=None):
def to_table_summary_df(self, spark=None):
    """Build a one-row Spark DataFrame from ``self.table_summary()``.

    When ``spark`` is None the active SparkSession is used.
    """
    session = spark
    if session is None:
        from pyspark.sql import SparkSession
        session = SparkSession.getActiveSession()
    summary_row = self.table_summary()
    return session.createDataFrame([summary_row])
Spark DataFrame with one row summarising this table run.
def
save(self, output_cfg: dict, spark=None):
def save(self, output_cfg: dict, spark=None):
    """Persist results to one or more destinations defined in output_cfg.

    Supported output types: ``"dataframe"``, ``"delta"``, ``"volume_json"``
    and ``"volume_csv"``. A destination whose target (``delta_table`` /
    ``volume_path``) is not set is skipped with a printed warning. Other
    type names are ignored.

    Returns the Spark DataFrame of results if one was built for a
    ``dataframe`` or ``delta`` output, otherwise None.
    """
    import os

    # Support both old single-type ("type") and new multi-type ("types") format
    types = output_cfg.get("types") or ([output_cfg["type"]] if "type" in output_cfg else ["dataframe"])

    sdf = None  # built at most once and shared by dataframe/delta outputs

    for otype in types:
        if otype == "dataframe":
            if sdf is None:
                sdf = self.to_spark_df(spark)

        elif otype == "delta":
            if sdf is None:
                sdf = self.to_spark_df(spark)
            table = output_cfg.get("delta_table", "")
            if not table:
                print("⚠️ delta_table not set — skipping Delta output")
                continue
            (sdf.write.format("delta")
                .mode("append")
                .option("mergeSchema", "true")
                .saveAsTable(table))
            print(f"✅ Saved to Delta table: {table}")
            # Also write the table-level summary if a summary table is configured
            summary_table = output_cfg.get("summary_delta_table", "")
            if summary_table:
                (self.to_table_summary_df(spark)
                    .write.format("delta")
                    .mode("append")
                    .option("mergeSchema", "true")
                    .saveAsTable(summary_table))
                print(f"✅ Saved table summary to: {summary_table}")

        elif otype in ("volume_json", "volume_csv"):
            # vol_path already contains catalog/schema from the wizard
            vol_path = (output_cfg.get("volume_path") or "").rstrip("/")
            if not vol_path:
                # os.makedirs("") raises; skip like the Delta branch does.
                print("⚠️ volume_path not set — skipping volume output")
                continue
            table_name = self.config.get("source", {}).get("table", "")
            tbl = table_name.split(".")[-1] if table_name else "table"
            filename = (output_cfg.get("filename")
                        or f"dq_{tbl}_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
            ext = "json" if otype == "volume_json" else "csv"
            os.makedirs(vol_path, exist_ok=True)
            full = f"{vol_path}/{filename}.{ext}"
            pdf = self.to_pandas()
            if ext == "json":
                pdf.to_json(full, orient="records", indent=2)
            else:
                pdf.to_csv(full, index=False)
            print(f"✅ Saved to: {full}")
            # Summary file alongside: same dir, _summary suffix
            summary_file = f"{vol_path}/{filename}_summary.{ext}"
            spdf = self.table_summary_pandas()
            if ext == "json":
                spdf.to_json(summary_file, orient="records", indent=2)
            else:
                spdf.to_csv(summary_file, index=False)
            print(f"✅ Saved table summary to: {summary_file}")

    return sdf
Persist results to one or more destinations defined in output_cfg.
@dataclass
class
CheckResult:
@dataclass
class CheckResult:
    """Outcome of one data-quality check, stored as a flat record."""

    # Identity of the check
    table_name: str
    column_name: str
    check_name: str
    dq_dimension: str

    # Row counts and verdict
    total_rows: int
    passed_rows: int
    failed_rows: int
    passed_pct: float
    threshold_pct: float
    status: str                    # PASS | FAIL | ERROR

    # Optional run context
    check_params: str = "{}"
    run_timestamp: str = ""
    data_owner: str = ""
    data_steward: str = ""
    business_domain: str = ""
    table_description: str = ""

    # Column-coverage figures
    columns_checked: int = 0
    total_columns: int = 0
    column_coverage_pct: float = 0.0

    tags: str = ""

    def to_dict(self) -> dict:
        """Return a shallow copy of the instance attributes as a plain dict."""
        return dict(vars(self))
CheckResult( table_name: str, column_name: str, check_name: str, dq_dimension: str, total_rows: int, passed_rows: int, failed_rows: int, passed_pct: float, threshold_pct: float, status: str, check_params: str = '{}', run_timestamp: str = '', data_owner: str = '', data_steward: str = '', business_domain: str = '', table_description: str = '', columns_checked: int = 0, total_columns: int = 0, column_coverage_pct: float = 0.0, tags: str = '')
CHECKS_REGISTRY =
{'expect_column_values_to_not_be_null': {'dimension': 'Completeness', 'description': 'Values must not be null', 'params': [], 'fn': <function _not_null>}, 'expect_column_values_to_be_null': {'dimension': 'Completeness', 'description': 'Values must be null', 'params': [], 'fn': <function _is_null>}, 'expect_column_values_to_not_be_null_or_empty': {'dimension': 'Completeness', 'description': 'Values must not be null and must not be an empty/whitespace string', 'params': [], 'fn': <function _not_null_or_empty>}, 'expect_column_null_count_to_be_between': {'dimension': 'Completeness', 'description': 'Number of null values must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _null_count_between>, 'aggregate': True}, 'expect_column_null_proportion_to_be_between': {'dimension': 'Completeness', 'description': 'Proportion of null values (0–1) must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _null_proportion_between>, 'aggregate': True}, 'expect_column_values_to_be_between': {'dimension': 'Accuracy', 'description': 'Values must fall within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _between>}, 'expect_column_values_to_not_be_between': {'dimension': 'Accuracy', 'description': 'Values must fall outside [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _not_between>}, 'expect_column_values_to_be_in_set': {'dimension': 'Accuracy', 'description': 'Values must belong to the allowed set', 'params': ['value_set'], 'fn': <function _in_set>}, 'expect_column_values_to_not_be_in_set': {'dimension': 'Accuracy', 'description': 'Values must not belong to the disallowed set', 'params': ['value_set'], 'fn': <function _not_in_set>}, 'expect_column_values_to_equal': {'dimension': 'Accuracy', 'description': 'All values must equal the specified constant', 'params': ['value'], 'fn': <function _equal_to>}, 'expect_column_values_to_not_equal': 
{'dimension': 'Accuracy', 'description': 'No values may equal the specified constant', 'params': ['value'], 'fn': <function _not_equal_to>}, 'expect_column_values_to_be_not_less_than': {'dimension': 'Accuracy', 'description': 'Values must be >= min_value', 'params': ['min_value'], 'fn': <function _not_less_than>}, 'expect_column_values_to_be_not_greater_than': {'dimension': 'Accuracy', 'description': 'Values must be <= max_value', 'params': ['max_value'], 'fn': <function _not_greater_than>}, 'expect_column_values_to_be_positive': {'dimension': 'Accuracy', 'description': 'Values must be strictly positive (> 0)', 'params': [], 'fn': <function _positive>}, 'expect_column_values_to_be_negative': {'dimension': 'Accuracy', 'description': 'Values must be strictly negative (< 0)', 'params': [], 'fn': <function _negative>}, 'expect_column_values_to_be_non_negative': {'dimension': 'Accuracy', 'description': 'Values must be zero or positive (>= 0)', 'params': [], 'fn': <function _non_negative>}, 'expect_column_values_to_be_increasing': {'dimension': 'Accuracy', 'description': 'Values must be non-decreasing in row order', 'params': [], 'fn': <function _increasing>}, 'expect_column_values_to_be_decreasing': {'dimension': 'Accuracy', 'description': 'Values must be non-increasing in row order', 'params': [], 'fn': <function _decreasing>}, 'expect_column_values_to_not_be_empty_string': {'dimension': 'Accuracy', 'description': 'Non-null values must not be empty or whitespace-only', 'params': [], 'fn': <function _not_empty_string>}, 'expect_column_values_to_match_regex': {'dimension': 'Accuracy', 'description': 'Values must match the regular expression', 'params': ['regex'], 'fn': <function _regex>}, 'expect_column_values_to_not_match_regex': {'dimension': 'Accuracy', 'description': 'Values must not match the regular expression', 'params': ['regex'], 'fn': <function _not_regex>}, 'expect_column_values_to_match_regex_list': {'dimension': 'Accuracy', 'description': 'Values must match 
at least one regex in the list', 'params': ['regex_list'], 'fn': <function _regex_list>}, 'expect_column_values_to_match_like_pattern': {'dimension': 'Accuracy', 'description': 'Values must match the SQL LIKE pattern (% and _ wildcards)', 'params': ['like_pattern'], 'fn': <function _like_pattern>}, 'expect_column_values_to_not_match_like_pattern': {'dimension': 'Accuracy', 'description': 'Values must not match the SQL LIKE pattern', 'params': ['like_pattern'], 'fn': <function _not_like_pattern>}, 'expect_column_value_lengths_to_be_between': {'dimension': 'Accuracy', 'description': 'String length must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _length_between>}, 'expect_column_value_lengths_to_equal': {'dimension': 'Accuracy', 'description': 'String length must equal the specified value', 'params': ['value'], 'fn': <function _length_equal>}, 'expect_column_values_to_be_of_type': {'dimension': 'Accuracy', 'description': "Column dtype must contain the specified type string (e.g. 'int', 'string')", 'params': ['type_'], 'fn': <function _of_type>}, 'expect_column_values_to_be_in_type_list': {'dimension': 'Accuracy', 'description': 'Column dtype must match one of the types in type_list', 'params': ['type_list'], 'fn': <function _in_type_list>}, 'expect_column_values_to_be_valid_email': {'dimension': 'Accuracy', 'description': 'Values must be valid email addresses', 'params': [], 'fn': <function _valid_email>}, 'expect_column_values_to_be_valid_url': {'dimension': 'Accuracy', 'description': 'Values must be valid HTTP/HTTPS URLs', 'params': [], 'fn': <function _valid_url>}, 'expect_column_values_to_be_valid_ipv4': {'dimension': 'Accuracy', 'description': 'Values must be valid IPv4 addresses (e.g. 
192.168.1.1)', 'params': [], 'fn': <function _valid_ipv4>}, 'expect_column_values_to_be_valid_uuid': {'dimension': 'Accuracy', 'description': 'Values must be valid UUIDs (8-4-4-4-12 hex format)', 'params': [], 'fn': <function _valid_uuid>}, 'expect_column_values_to_be_json_parseable': {'dimension': 'Accuracy', 'description': 'Values must be valid JSON strings', 'params': [], 'fn': <function _json_parseable>}, 'expect_column_values_to_match_strftime_format': {'dimension': 'Accuracy', 'description': 'Values must match the strftime date format (e.g. %Y-%m-%d)', 'params': ['strftime_format'], 'fn': <function _strftime>}, 'expect_column_values_to_be_dateutil_parseable': {'dimension': 'Accuracy', 'description': 'Values must be parseable as a date in any common format', 'params': [], 'fn': <function _date_parseable>}, 'expect_column_values_to_not_be_in_future': {'dimension': 'Accuracy', 'description': 'Date/timestamp values must not be in the future', 'params': [], 'fn': <function _not_in_future>}, 'expect_column_values_to_be_not_older_than_n_days': {'dimension': 'Accuracy', 'description': 'Date values must be within the last n_days', 'params': ['n_days'], 'fn': <function _not_older_than_n_days>}, 'expect_column_values_to_not_be_in_near_future': {'dimension': 'Accuracy', 'description': 'Values must not fall within the next n_days', 'params': ['n_days'], 'fn': <function _not_near_future>}, 'expect_column_data_to_be_fresh': {'dimension': 'Accuracy', 'description': 'Most recent value in column must be within n_minutes of now', 'params': ['n_minutes'], 'fn': <function _data_fresh>, 'aggregate': True}, 'expect_column_values_to_pass_custom_sql_filter': {'dimension': 'Accuracy', 'description': 'Rows matching sql_filter are FAILED rows (write a WHERE clause for bad data)', 'params': ['sql_filter'], 'fn': <function _custom_sql_filter>}, 'expect_column_mean_to_be_between': {'dimension': 'Accuracy', 'description': 'Column mean must be within [min_value, max_value]', 'params': 
['min_value', 'max_value'], 'fn': <function _mean_between>, 'aggregate': True}, 'expect_column_median_to_be_between': {'dimension': 'Accuracy', 'description': 'Column median must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _median_between>, 'aggregate': True}, 'expect_column_stdev_to_be_between': {'dimension': 'Accuracy', 'description': 'Standard deviation must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _stdev_between>, 'aggregate': True}, 'expect_column_max_to_be_between': {'dimension': 'Accuracy', 'description': 'Column maximum must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _max_between>, 'aggregate': True}, 'expect_column_min_to_be_between': {'dimension': 'Accuracy', 'description': 'Column minimum must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _min_between>, 'aggregate': True}, 'expect_column_sum_to_be_between': {'dimension': 'Accuracy', 'description': 'Column sum must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _sum_between>, 'aggregate': True}, 'expect_column_most_common_value_to_be_in_set': {'dimension': 'Accuracy', 'description': 'Most frequent value must be in the allowed set', 'params': ['value_set'], 'fn': <function _most_common_in_set>, 'aggregate': True}, 'expect_column_quantile_value_to_be_between': {'dimension': 'Accuracy', 'description': 'Column quantile (0–1) must be within [min_value, max_value]', 'params': ['quantile', 'min_value', 'max_value'], 'fn': <function _quantile_between>, 'aggregate': True}, 'expect_column_distinct_values_to_be_in_set': {'dimension': 'Accuracy', 'description': 'All distinct values must be in value_set (no unlisted values allowed)', 'params': ['value_set'], 'fn': <function _distinct_in_set>, 'aggregate': True}, 'expect_column_distinct_values_to_contain_set': {'dimension': 'Accuracy', 
'description': 'Distinct values must include all items in value_set', 'params': ['value_set'], 'fn': <function _distinct_contains_set>, 'aggregate': True}, 'expect_column_distinct_values_to_equal_set': {'dimension': 'Accuracy', 'description': 'Distinct values must exactly match value_set (no extras, no missing)', 'params': ['value_set'], 'fn': <function _distinct_equal_set>, 'aggregate': True}, 'expect_column_values_to_be_unique': {'dimension': 'Integrity', 'description': 'All values must be unique — no duplicates', 'params': [], 'fn': <function _unique>}, 'expect_column_unique_value_count_to_be_between': {'dimension': 'Integrity', 'description': 'Distinct value count must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _unique_count_between>, 'aggregate': True}, 'expect_column_proportion_of_unique_values_to_be_between': {'dimension': 'Integrity', 'description': 'Proportion of unique values (0–1) must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _unique_proportion_between>, 'aggregate': True}, 'expect_column_pair_values_to_be_equal': {'dimension': 'Integrity', 'description': 'Values in this column must equal values in column_b (row-wise)', 'params': ['column_b'], 'fn': <function _pair_equal>}, 'expect_column_pair_values_a_to_be_greater_than_b': {'dimension': 'Integrity', 'description': 'Values in this column must be greater than values in column_b', 'params': ['column_b'], 'fn': <function _pair_greater>}, 'expect_column_pair_values_to_be_in_set': {'dimension': 'Integrity', 'description': 'Row-wise (colA, colB) value pairs must be in valid_pairs', 'params': ['column_b', 'valid_pairs'], 'fn': <function _pair_in_set>}, 'expect_compound_columns_to_be_unique': {'dimension': 'Integrity', 'description': 'Combination of columns must be unique across all rows', 'params': ['columns'], 'fn': <function _compound_unique>, 'compound': True}, 'expect_primary_key_to_be_valid': {'dimension': 
'Integrity', 'description': 'PK columns must all be non-null AND the combination must be unique', 'params': ['columns'], 'fn': <function _primary_key_valid>, 'compound': True}, 'expect_column_values_to_exist_in_reference_table': {'dimension': 'Integrity', 'description': 'Values must exist in reference_table.reference_column (foreign key check)', 'params': ['reference_table', 'reference_column'], 'fn': <function _foreign_key>, 'cross_table': True}, 'expect_referential_integrity': {'dimension': 'Integrity', 'description': 'Full referential integrity check between tables; optionally checks orphans too', 'params': ['reference_table', 'reference_column', 'check_orphans'], 'fn': <function _referential_integrity>, 'cross_table': True}, 'expect_table_row_count_to_be_between': {'dimension': 'Consistency', 'description': 'Table row count must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _row_count_between>, 'table_level': True, 'aggregate': True}, 'expect_table_row_count_to_equal': {'dimension': 'Consistency', 'description': 'Table row count must equal value exactly', 'params': ['value'], 'fn': <function _row_count_equal>, 'table_level': True, 'aggregate': True}, 'expect_table_column_count_to_be_between': {'dimension': 'Consistency', 'description': 'Number of columns must be within [min_value, max_value]', 'params': ['min_value', 'max_value'], 'fn': <function _col_count_between>, 'table_level': True, 'aggregate': True}, 'expect_table_column_count_to_equal': {'dimension': 'Consistency', 'description': 'Number of columns must equal value exactly', 'params': ['value'], 'fn': <function _col_count_equal>, 'table_level': True, 'aggregate': True}, 'expect_column_to_exist': {'dimension': 'Consistency', 'description': 'The specified column must exist in the table', 'params': [], 'fn': <function _col_to_exist>, 'table_level': True, 'aggregate': True}, 'expect_table_columns_to_match_set': {'dimension': 'Consistency', 'description': 'Table 
column names must exactly match column_set (order-independent)', 'params': ['column_set'], 'fn': <function _columns_match_set>, 'table_level': True, 'aggregate': True}, 'expect_table_columns_to_match_ordered_list': {'dimension': 'Consistency', 'description': 'Table columns must match column_list in exact order', 'params': ['column_list'], 'fn': <function _columns_match_ordered_list>, 'table_level': True, 'aggregate': True}, 'expect_multicolumn_sum_to_equal': {'dimension': 'Consistency', 'description': 'Sum of all values across columns must equal sum_value', 'params': ['columns', 'sum_value'], 'fn': <function _multicolumn_sum_equal>, 'compound': True, 'aggregate': True}, 'expect_table_row_count_to_equal_other_table': {'dimension': 'Consistency', 'description': "This table's row count must equal reference_table's row count", 'params': ['reference_table'], 'fn': <function _row_count_equal_other_table>, 'table_level': True, 'cross_table': True, 'aggregate': True}, 'expect_table_schema_to_match': {'dimension': 'Consistency', 'description': 'Table schema must match expected_schema dict {col_name: dtype_string}', 'params': ['expected_schema'], 'fn': <function _schema_valid>, 'table_level': True, 'aggregate': True}}