Coverage for src/instawell/processing/step_02_filter_wells.py: 92%

37 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-07 15:47 -0600

1import logging 

2 

3import pandas as pd 

4 

5from instawell.core.exceptions import PrerequisiteStepError 

6from instawell.core.exp_context import ExperimentContext 

7from instawell.core.steps import StepFiles 

8from instawell.utils.logging_util import setup_experiment_logging 

9 

10# module-level logger for this step; no level is set here (file logging is configured inside filter_wells)

11logger = logging.getLogger(__name__) 

12 

13 

def filter_wells(
    ctx: ExperimentContext,
    *,
    wells_to_filter: list[str] | None = None,
) -> None:
    """Drop the requested wells from the ingested data and record the result.

    Reads ``StepFiles.INGESTED_DATA`` from the experiment directory, removes
    every row whose ``well`` value is listed in ``wells_to_filter``, and writes
    the result to ``StepFiles.FILTERED_DATA``. The names of the removed wells
    (or the text ``"No wells filtered."``) are written to
    ``StepFiles.FILTERED_WELLS``.

    This step MUST still be run even when no wells are to be filtered, so that
    the fact that nothing was filtered is recorded.

    Args:
        ctx: Experiment context; supplies ``experiment_dir``,
            ``experiment_name``, ``log_to_file`` and ``log_level``.
        wells_to_filter: Well names to remove. ``None`` or an empty list
            copies the ingested data through unchanged.

    Raises:
        PrerequisiteStepError: If the ingested data file does not exist.
        ValueError: If a required column is missing from the ingested data,
            or if a requested well does not occur in its ``well`` column.
    """
    if ctx.log_to_file:
        setup_experiment_logging(
            experiment_dir=ctx.experiment_dir, filename="experiment.log", level=ctx.log_level
        )

    organized_data_path = ctx.experiment_dir / StepFiles.INGESTED_DATA
    try:
        organized_data = pd.read_csv(organized_data_path)
    except FileNotFoundError as exc:
        raise PrerequisiteStepError(
            f"Missing prerequisite data file: {organized_data_path}\n"
            "This usually means step 1 has not been run yet for this experiment.\n"
            "Please run `step_01_organize_data` (or the equivalent first step) "
            f"for experiment '{ctx.experiment_name}' and try again."
        ) from exc

    filtered_data_path = ctx.experiment_dir / StepFiles.FILTERED_DATA
    filtered_wells_path = ctx.experiment_dir / StepFiles.FILTERED_WELLS

    required_columns = (
        "Temperature",
        "well",
        "value",
        "ligand",
        "protein",
        "buffer",
        "concentration",
        "well_unqcond",
    )
    for col in required_columns:
        if col not in organized_data.columns:
            raise ValueError(f"Missing required column: {col} in the data.")

    if not wells_to_filter:
        logger.info("No wells to filter. Returning the original data.")
        filtered_data = organized_data
        wells_report = "No wells filtered."
    else:
        # Build the lookup once instead of recomputing unique() per requested well.
        known_wells = set(organized_data["well"].unique())
        for well in wells_to_filter:
            if well not in known_wells:
                raise ValueError(
                    f"Well {well} not found in organized data. Please check the well names."
                )

        # One boolean mask removes all requested wells in a single pass.
        # list(...) keeps any iterable of names acceptable to isin().
        drop_mask = organized_data["well"].isin(list(wells_to_filter))
        filtered_data = organized_data[~drop_mask]
        for well in wells_to_filter:
            logger.info("Filtered out well: %s", well)
        wells_report = "\n".join(wells_to_filter)

    # Single save path shared by both branches: data CSV first, then the well list.
    filtered_data.to_csv(filtered_data_path, index=False)
    logger.info("Filtered data saved to %s", filtered_data_path)

    # Explicit encoding so the record file does not depend on the platform default.
    with open(filtered_wells_path, "w", encoding="utf-8") as f:
        f.write(wells_report)