Coverage for src/instawell/data_validation.py: 0%
92 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-28 21:17 -0500
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-28 21:17 -0500
1import logging
2from typing import Any, Dict, List, Optional, Type, TypeVar
4import pandas as pd
5from pydantic import BaseModel, Field, ValidationError, model_validator
6from pydantic_core import ErrorDetails
8# Define a generic type for Pydantic models
9PydanticModel = TypeVar("PydanticModel", bound=BaseModel)
12# def validate_df_with_pydantic(
13# df: pd.DataFrame, model: Type[PydanticModel]
14# ) -> pd.DataFrame:
15# """
16# Validates a DataFrame against a Pydantic model.
18# Args:
19# df: The pandas DataFrame to validate.
20# model: The Pydantic model class to validate each row against.
22# Returns:
23# A new DataFrame with validated and potentially coerced data.
25# Raises:
26# ValidationError: If any row fails validation.
27# ValueError: If the input is not a DataFrame or is empty.
28# """
29# if not isinstance(df, pd.DataFrame):
30# raise ValueError("Input must be a pandas DataFrame.")
31# if df.empty:
32# raise ValueError("Input DataFrame is empty.")
33# # Ensure the column names of the dataframe are strings
34# df.columns = df.columns.astype(str)
35# try:
36# # Convert, validate, and dump back to dicts
37# validated_data = [
38# model(**{str(k): v for k, v in row.items()}).model_dump()
39# for row in df.to_dict(orient="records")
40# ]
41# # Create a new DataFrame from validated data
42# return pd.DataFrame(validated_data)
43# except ValidationError as e:
44# print(f"DataFrame validation failed against model '{model.__name__}':")
45# # You might want to provide more context about *which* rows failed
46# raise e
47# except Exception as e:
48# print(f"An unexpected error occurred during validation: {e}")
49# raise e
class LongData(BaseModel):
    """Base long-format row: one measured value at one temperature for one condition.

    All five fields are required.
    """

    Temperature: float = Field(
        default=...,
        description="Temperature in Celsius.",
    )
    value: float = Field(
        default=...,
        description="Measured value at the given temperature and well.",
    )
    ligand: str = Field(
        default=...,
        description="Ligand identifier (e.g., 'LigandA').",
    )
    protein: str = Field(
        default=...,
        description="Protein identifier (e.g., 'ProteinX').",
    )
    buffer: str = Field(
        default=...,
        description="Buffer condition (e.g., 'Buffer1').",
    )
class LongDataRaw(LongData):
    """Long-format row that additionally requires the well and its well/condition key."""

    well: str = Field(
        default=...,
        description="Well identifier (e.g., 'A1', 'B2').",
    )
    well_uqcond: str = Field(
        default=...,
        description="Unique Combination of well ligand, protein, and buffer conditions.",
    )
class LongDataAvg(LongData):
    """Long-format row with an optional unique-condition identifier.

    ``uq_cond`` defaults to ``None`` when the column is absent.
    """

    uq_cond: Optional[str] = Field(
        default=None,
        description="Unique condition identifier combining ligand, protein, and buffer.",
    )
class WideDataNumeric(BaseModel):
    """
    Represents a row of a wide-format DataFrame.

    ``Temperature`` is the only declared field. Any number of additional
    (dynamic) columns are accepted as extra fields, and each of them must hold
    a numeric value (int or float).
    """

    # --- Fixed fields ---
    Temperature: float = Field(..., description="Temperature in Celsius.")

    # --- Configuration to allow extra fields ---
    # This tells Pydantic to accept fields not explicitly defined above.
    # These extra fields are stored in `self.model_extra` and included in `model_dump()`.
    model_config = {"extra": "allow"}

    # --- Validator for dynamic/extra fields ---
    @model_validator(mode="after")
    def check_dynamic_features_are_numeric(self) -> "WideDataNumeric":
        """
        Validate that every dynamically added field (extra) is an int or a float.

        Runs after the declared fields have been parsed.

        Raises:
            ValueError: If an extra field holds a non-numeric value. Pydantic
                folds this into its standard ValidationError.
        """
        # model_extra may be None or empty; both mean there is nothing to check.
        for field_name, value in (self.model_extra or {}).items():
            # bool is a subclass of int, so it must be rejected explicitly to
            # keep True/False from passing as numbers.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(
                    f"Dynamic feature '{field_name}' must be numeric (int or float), "
                    f"but got type {type(value).__name__} with value '{value}'."
                )
        return self
class WideDataNoBase(WideDataNumeric):
    """Wide-format row in which no extra column name may contain 'npc' (any casing)."""

    @model_validator(mode="after")
    def check_no_col_names_contain_npc(self) -> "WideDataNoBase":
        """
        Validate that no extra column name contains 'npc', ignoring case.

        Runs after the declared fields have been parsed.

        Raises:
            ValueError: If an extra column name contains 'npc' in any casing.
        """
        for field_name in self.model_extra or {}:
            # The name is lowercased, so the needle must be lowercase as well;
            # comparing against "NPC" could never match.
            if "npc" in field_name.lower():
                raise ValueError(
                    f"Column name '{field_name}' contains 'NPC', which should have been removed by this stage."
                )
        return self
class WideDataMinMax(WideDataNoBase):
    """Wide-format row whose extra columns must each lie in the closed interval [0, 1]."""

    @model_validator(mode="after")
    def check_min_max_columns(self) -> "WideDataMinMax":
        """
        Require every extra column to be between 0 and 1 inclusive.

        A Temperature below 1 is not an error; it only logs a warning.

        Raises:
            ValueError: If an extra column's value is not within [0, 1].
        """
        extras = self.model_extra or {}
        for field_name, value in extras.items():
            inside_unit_interval = 0 <= value <= 1
            if inside_unit_interval:
                continue
            raise ValueError(
                f"Column '{field_name}' must be between 0 and 1, but got {value}."
            )

        temperature_is_suspicious = self.Temperature < 1
        if temperature_is_suspicious:
            logging.warning(
                f"Temperature '{self.Temperature}' is less than 1, which is unusual for this context."
            )
        return self
class LongDataMinMax(LongDataAvg):
    """Long-format row whose ``value`` must lie in the closed interval [0, 1]."""

    @model_validator(mode="after")
    def check_min_max_columns(self) -> "LongDataMinMax":
        """
        Require ``value`` to be between 0 and 1 inclusive.

        A Temperature below 1 is not an error; it only logs a warning.

        Raises:
            ValueError: If ``value`` is not within [0, 1].
        """
        inside_unit_interval = 0 <= self.value <= 1
        if not inside_unit_interval:
            raise ValueError(f"Value '{self.value}' must be between 0 and 1, but got {self.value}.")

        temperature_is_suspicious = self.Temperature < 1
        if temperature_is_suspicious:
            logging.warning(
                f"Temperature '{self.Temperature}' is less than 1, which is unusual for this context."
            )
        return self
class LongDataDT(LongDataAvg):
    """Placeholder model: adds no fields or validators beyond ``LongDataAvg``."""

    # TODO: how to validate this?
class LongDataDTMinMax(LongDataDT):
    """Placeholder model: adds no fields or validators beyond ``LongDataDT``."""
class MinTempData(BaseModel):
    """Placeholder model: no fields are declared yet."""

    # TODO: include convert to float
class LayoutDynamicData(BaseModel):
    """
    Represents a row of a layout DataFrame.

    ``well`` is the only declared field. Any number of additional (dynamic)
    columns are accepted as extra fields, and each of them must hold a string.
    """

    # --- Fixed fields ---
    well: str = Field(..., description="Well identifier (e.g., 'A1', 'B2').")

    # --- Configuration to allow extra fields ---
    # This tells Pydantic to accept fields not explicitly defined above.
    # These extra fields are stored in `self.model_extra` and included in `model_dump()`.
    model_config = {"extra": "allow"}

    # --- Validator for dynamic/extra fields ---
    @model_validator(mode="after")
    def check_dynamic_features_are_str(self) -> "LayoutDynamicData":
        """
        Validate that every dynamically added field (extra) is a str.

        Runs after the declared fields have been parsed.

        Raises:
            ValueError: If an extra field holds a non-string value. Pydantic
                folds this into its standard ValidationError.
        """
        # model_extra may be None or empty; both mean there is nothing to check.
        for field_name, value in (self.model_extra or {}).items():
            if not isinstance(value, str):
                # The message names the type that is actually enforced (str).
                raise ValueError(
                    f"Dynamic feature '{field_name}' must be a string, "
                    f"but got type {type(value).__name__} with value '{value}'."
                )
        return self
def validate_df_dynamic_model(df: pd.DataFrame, model_class: Type[BaseModel]) -> pd.DataFrame:
    """
    Validate each row of a pandas DataFrame against the provided Pydantic model.

    Args:
        df: The pandas DataFrame to validate. It is not modified.
        model_class: The Pydantic model class to use for validation.

    Returns:
        A new pandas DataFrame built from ``model_dump()`` of every validated row.
        For an empty input, an empty DataFrame whose columns are the model's
        declared fields.

    Raises:
        TypeError: If the input 'df' is not a pandas DataFrame.
        ValidationError: If any row in the DataFrame fails validation against the model.
            The raised error contains details for all failing rows; each error
            location is prefixed with ``row_<n>``, where ``n`` is the row's
            zero-based position in the DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    if df.empty:
        print(
            "Warning: Input DataFrame is empty. Returning an empty DataFrame based on model fields."
        )
        # Create an empty DataFrame with columns based on the model's known fields.
        # Dynamic columns aren't known at this stage for an empty input.
        return pd.DataFrame(columns=list(model_class.model_fields.keys()))

    # Stringify the column labels on a new object rather than assigning to
    # df.columns, so the caller's DataFrame keeps its original labels.
    records = df.set_axis(df.columns.astype(str), axis=1).to_dict(orient="records")

    validated_rows_data: List[Dict[str, Any]] = []
    line_errors: List[Dict[str, Any]] = []  # Inputs for ValidationError.from_exception_data

    for idx, row_dict in enumerate(records):
        try:
            validated_model_instance = model_class.model_validate(row_dict)
        except ValidationError as exc:
            # exc.errors() returns one dict per failure. Prepend the row
            # position to each 'loc' so the aggregated error says which row failed.
            for error_detail in exc.errors():
                current_loc = error_detail.get("loc", ())
                if not isinstance(current_loc, tuple):
                    current_loc = (current_loc,)
                line_error: Dict[str, Any] = {
                    "type": error_detail["type"],
                    "loc": (f"row_{idx}",) + current_loc,
                    "input": error_detail.get("input"),
                }
                # Some error types (e.g. 'value_error') need their context to be rebuilt.
                if "ctx" in error_detail:
                    line_error["ctx"] = error_detail["ctx"]
                line_errors.append(line_error)
        else:
            # model_dump() includes extra fields when the model allows them.
            validated_rows_data.append(validated_model_instance.model_dump())

    if line_errors:
        # Pydantic v2's ValidationError cannot be constructed directly;
        # from_exception_data is the supported way to build one from error details.
        # NOTE(review): error types raised via PydanticCustomError may not be
        # accepted here by their type string - confirm none are used by the models.
        raise ValidationError.from_exception_data(model_class.__name__, line_errors)

    # If all rows are valid, create a new DataFrame from the validated data.
    return pd.DataFrame(validated_rows_data)