Coverage for src/instawell/data_validation.py: 0%

92 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-28 21:17 -0500

1import logging 

2from typing import Any, Dict, List, Optional, Type, TypeVar 

3 

4import pandas as pd 

5from pydantic import BaseModel, Field, ValidationError, model_validator 

6from pydantic_core import ErrorDetails 

7 

8# Define a generic type for Pydantic models 

9PydanticModel = TypeVar("PydanticModel", bound=BaseModel) 

10 

11 

12# def validate_df_with_pydantic( 

13# df: pd.DataFrame, model: Type[PydanticModel] 

14# ) -> pd.DataFrame: 

15# """ 

16# Validates a DataFrame against a Pydantic model. 

17 

18# Args: 

19# df: The pandas DataFrame to validate. 

20# model: The Pydantic model class to validate each row against. 

21 

22# Returns: 

23# A new DataFrame with validated and potentially coerced data. 

24 

25# Raises: 

26# ValidationError: If any row fails validation. 

27# ValueError: If the input is not a DataFrame or is empty. 

28# """ 

29# if not isinstance(df, pd.DataFrame): 

30# raise ValueError("Input must be a pandas DataFrame.") 

31# if df.empty: 

32# raise ValueError("Input DataFrame is empty.") 

33# # Ensure the column names of the dataframe are strings 

34# df.columns = df.columns.astype(str) 

35# try: 

36# # Convert, validate, and dump back to dicts 

37# validated_data = [ 

38# model(**{str(k): v for k, v in row.items()}).model_dump() 

39# for row in df.to_dict(orient="records") 

40# ] 

41# # Create a new DataFrame from validated data 

42# return pd.DataFrame(validated_data) 

43# except ValidationError as e: 

44# print(f"DataFrame validation failed against model '{model.__name__}':") 

45# # You might want to provide more context about *which* rows failed 

46# raise e 

47# except Exception as e: 

48# print(f"An unexpected error occurred during validation: {e}") 

49# raise e 

50 

51 

class LongData(BaseModel):
    """Columns shared by every long-format measurement row.

    All five fields are required: a temperature, the measured value, and
    the ligand / protein / buffer identifiers describing the condition.
    """

    Temperature: float = Field(description="Temperature in Celsius.")
    value: float = Field(
        description="Measured value at the given temperature and well.",
    )
    ligand: str = Field(description="Ligand identifier (e.g., 'LigandA').")
    protein: str = Field(description="Protein identifier (e.g., 'ProteinX').")
    buffer: str = Field(description="Buffer condition (e.g., 'Buffer1').")

58 

59 

class LongDataRaw(LongData):
    """Long-format row that additionally carries per-well identifiers.

    Extends ``LongData`` with two required string fields: the well itself
    and a key combining the well with its ligand/protein/buffer condition.
    """

    well: str = Field(description="Well identifier (e.g., 'A1', 'B2').")
    well_uqcond: str = Field(
        description="Unique Combination of well ligand, protein, and buffer conditions.",
    )

67 

68 

class LongDataAvg(LongData):
    """Long-format row with an optional unique-condition key.

    Extends ``LongData`` with ``uq_cond``, which defaults to ``None`` when
    the column is absent.
    """

    uq_cond: Optional[str] = Field(
        default=None,
        description="Unique condition identifier combining ligand, protein, and buffer.",
    )

74 

75 

class WideDataNumeric(BaseModel):
    """
    Represents one row of a wide-format DataFrame.

    ``Temperature`` is the only declared field. Every other column is accepted
    as a dynamic (extra) field and must hold a numeric value.
    """

    # --- Fixed fields ---
    Temperature: float = Field(..., description="Temperature in Celsius.")

    # --- Configuration to allow extra fields ---
    # This tells Pydantic to accept fields not explicitly defined above.
    # These extra fields will be stored in `self.model_extra` and included in `model_dump()`.
    model_config = {"extra": "allow"}

    # --- Validator for dynamic/extra fields ---
    @model_validator(mode="after")
    def check_dynamic_features_are_numeric(self) -> "WideDataNumeric":
        """
        Validate that every dynamic (extra) field is numeric (int or float).

        Runs after the declared fields have been parsed.

        Returns:
            The model instance itself, unchanged.

        Raises:
            ValueError: If an extra field holds anything other than an int or
                a float. Pydantic folds this into its ValidationError.
        """
        # model_extra is None when the model does not allow extras.
        for field_name, value in (self.model_extra or {}).items():
            # Extras are not coerced by Pydantic, so integer-valued columns
            # arrive as int and must be accepted alongside float. bool is a
            # subclass of int but is not a measurement, so it stays rejected.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(
                    f"Dynamic feature '{field_name}' must be numeric (int or float), "
                    f"but got type {type(value).__name__} with value '{value}'."
                )
        return self

108 

109 

class WideDataNoBase(WideDataNumeric):
    """Wide-format row whose dynamic column names must not contain 'npc'."""

    @model_validator(mode="after")
    def check_no_col_names_contain_npc(self) -> "WideDataNoBase":
        """
        Validate that no dynamic column name contains 'npc' (case-insensitive).

        Runs after the declared fields have been parsed.

        Returns:
            The model instance itself, unchanged.

        Raises:
            ValueError: If any extra field name contains 'npc' in any casing.
                Pydantic folds this into its ValidationError.
        """
        # model_extra is None when the model does not allow extras.
        for field_name in self.model_extra or {}:
            # The name is lowercased, so the needle must be lowercase too;
            # this way 'NPC', 'npc' and 'Npc' are all caught.
            if "npc" in field_name.lower():
                raise ValueError(
                    f"Column name '{field_name}' contains 'NPC', which should have been removed by this stage."
                )
        return self

125 

126 

class WideDataMinMax(WideDataNoBase):
    """Wide-format row whose dynamic columns must lie in the range [0, 1]."""

    @model_validator(mode="after")
    def check_min_max_columns(self) -> "WideDataMinMax":
        """
        Check every dynamic column is within [0, 1] and sanity-check Temperature.

        An out-of-range dynamic value raises ValueError. A Temperature below 1
        is only logged as a warning, not rejected.
        """
        extras = self.model_extra
        if extras:
            for column, scaled in extras.items():
                if 0 <= scaled <= 1:
                    continue
                raise ValueError(
                    f"Column '{column}' must be between 0 and 1, but got {scaled}."
                )

        temperature_is_suspicious = self.Temperature < 1
        if temperature_is_suspicious:
            logging.warning(
                f"Temperature '{self.Temperature}' is less than 1, which is unusual for this context."
            )
        return self

142 

143 

class LongDataMinMax(LongDataAvg):
    """Long-format row whose ``value`` must lie in the range [0, 1]."""

    @model_validator(mode="after")
    def check_min_max_columns(self) -> "LongDataMinMax":
        """
        Check that ``value`` is within [0, 1] and sanity-check Temperature.

        Returns:
            The model instance itself, unchanged.

        Raises:
            ValueError: If ``value`` is outside [0, 1]. Pydantic folds this
                into its ValidationError.
        """
        if not (0 <= self.value <= 1):
            # Name the offending column rather than repeating the value twice,
            # matching the wording used for the wide-format check.
            raise ValueError(f"Column 'value' must be between 0 and 1, but got {self.value}.")
        # A Temperature below 1 is only flagged, not rejected.
        if self.Temperature < 1:
            logging.warning(
                f"Temperature '{self.Temperature}' is less than 1, which is unusual for this context."
            )
        return self

155 

156 

class LongDataDT(LongDataAvg):
    """Placeholder schema: adds no fields or validators beyond ``LongDataAvg``."""

    # TODO: how to validate this?

160 

161 

class LongDataDTMinMax(LongDataDT):
    """Placeholder schema: adds no fields or validators beyond ``LongDataDT``."""

164 

165 

class MinTempData(BaseModel):
    """Placeholder schema: no fields are declared yet."""

    # TODO: include convert to float

169 

170 

class LayoutDynamicData(BaseModel):
    """
    Represents one row of a layout DataFrame.

    ``well`` is the only declared field. Every other column is accepted as a
    dynamic (extra) field and must hold a string value.
    """

    # --- Fixed fields ---
    well: str = Field(..., description="Well identifier (e.g., 'A1', 'B2').")

    # --- Configuration to allow extra fields ---
    # This tells Pydantic to accept fields not explicitly defined above.
    # These extra fields will be stored in `self.model_extra` and included in `model_dump()`.
    model_config = {"extra": "allow"}

    # --- Validator for dynamic/extra fields ---
    @model_validator(mode="after")
    def check_dynamic_features_are_str(self) -> "LayoutDynamicData":
        """
        Validate that every dynamic (extra) field is a string.

        Runs after the declared fields have been parsed.

        Returns:
            The model instance itself, unchanged.

        Raises:
            ValueError: If an extra field holds a non-string value. Pydantic
                folds this into its ValidationError.
        """
        # model_extra is None when the model does not allow extras.
        for field_name, value in (self.model_extra or {}).items():
            if not isinstance(value, str):
                # The message states the actual requirement (str), so the
                # reported expectation matches the check being performed.
                raise ValueError(
                    f"Dynamic feature '{field_name}' must be a string (str), "
                    f"but got type {type(value).__name__} with value '{value}'."
                )
        return self

203 

204 

def validate_df_dynamic_model(df: pd.DataFrame, model_class: Type[BaseModel]) -> pd.DataFrame:
    """
    Validates each row of a pandas DataFrame against the provided Pydantic model.

    The input DataFrame is never modified; column labels are stringified on a
    shallow copy before the rows are handed to the model.

    Args:
        df: The pandas DataFrame to validate.
        model_class: The Pydantic model class to use for validation.

    Returns:
        A new pandas DataFrame built from ``model_dump()`` of every validated
        row. For an empty input, an empty DataFrame whose columns are the
        model's declared fields (dynamic columns are unknown at that point).

    Raises:
        TypeError: If the input 'df' is not a pandas DataFrame.
        ValidationError: If any row fails validation. The error aggregates the
            details of all failing rows; each location is prefixed with
            ``row_<n>``, where n is the 0-based row position (not the index label).
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame.")

    if df.empty:
        logging.warning(
            "Input DataFrame is empty. Returning an empty DataFrame based on model fields."
        )
        return pd.DataFrame(columns=list(model_class.model_fields))

    # Stringify column labels on a shallow copy so the caller's DataFrame
    # keeps its original labels (the data itself is not duplicated).
    frame = df.copy(deep=False)
    frame.columns = frame.columns.astype(str)

    validated_rows: List[Dict[str, Any]] = []
    line_errors: List[Dict[str, Any]] = []

    for idx, row_dict in enumerate(frame.to_dict(orient="records")):
        try:
            instance = model_class.model_validate(row_dict)
        except ValidationError as exc:
            for detail in exc.errors():
                loc = detail.get("loc", ())
                if not isinstance(loc, tuple):
                    loc = (loc,)
                # Rebuild each entry with only the keys that
                # from_exception_data consumes, prefixing the row position.
                line_error: Dict[str, Any] = {
                    "type": detail["type"],
                    "loc": (f"row_{idx}",) + loc,
                    "input": detail.get("input"),
                }
                if "ctx" in detail:
                    line_error["ctx"] = detail["ctx"]
                line_errors.append(line_error)
        else:
            # model_dump() includes extra fields when the model allows them.
            validated_rows.append(instance.model_dump())

    if line_errors:
        # Pydantic v2's ValidationError has no public constructor; it must be
        # built through from_exception_data.
        raise ValidationError.from_exception_data(model_class.__name__, line_errors)

    return pd.DataFrame(validated_rows)