Coverage for src/instawell/processing/step_03_average_replicates.py: 94%

34 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-07 15:47 -0600

1import logging 

2 

3import pandas as pd 

4 

5from instawell.core.exceptions import PrerequisiteStepError 

6from instawell.core.exp_context import ExperimentContext 

7from instawell.core.steps import StepFiles 

8from instawell.utils.logging_util import setup_experiment_logging 

9from instawell.utils.utils import convert_concentration_to_float 

10 

11# module-level logger (no logging level is set here)

12logger = logging.getLogger(__name__) 

13 

14 

15def _avg_across_replicates( 

16 organized_data: pd.DataFrame, 

17 sep: str = "|", 

18) -> pd.DataFrame: 

19 """ 

20 Averages the data across replicates. 

21 """ 

22 # Group by the unique condition and temperature, then average the values 

23 organized_data["unqcond"] = ( 

24 organized_data["concentration"] 

25 + sep 

26 + organized_data["ligand"] 

27 + sep 

28 + organized_data["protein"] 

29 + sep 

30 + organized_data["buffer"] 

31 ) 

32 # Drop well_unqcond as it is not needed for averaging 

33 # organized_data = organized_data.drop(columns=["well_unqcond"]) 

34 # Add a column for the unique condition 

35 # averaged_data = raw_data_long.groupby(['Temperature', 'combination2']).agg({'value': 'mean'}).reset_index() 

36 

37 averaged_data = ( 

38 organized_data.groupby(["Temperature", "unqcond"]).agg({"value": "mean"}).reset_index() 

39 ) 

40 

41 averaged_data_pivot = averaged_data.pivot( 

42 index="Temperature", columns="unqcond", values="value" 

43 ) 

44 

45 # averaged_data_pivot = split_unqcon_column(averaged_data_pivot) 

46 

47 return averaged_data_pivot 

48 

49 

def average_accross_replicates(ctx: ExperimentContext) -> None:
    """
    Average the filtered data across replicates and save the result as CSV.

    Reads ``StepFiles.FILTERED_DATA`` from ``ctx.experiment_dir``, averages
    ``value`` for every temperature and unique condition (concentration,
    ligand, protein and buffer joined by ``ctx.condition_separator``), orders
    the condition columns and writes the table to ``StepFiles.AVERAGED_DATA``
    in the same directory.

    Args:
        ctx: Experiment context; this function uses ``experiment_dir``,
            ``condition_separator``, ``log_to_file`` and ``log_level``.

    Raises:
        PrerequisiteStepError: If the filtered data file does not exist.
        ValueError: If a required column is missing from the filtered data.
    """
    if ctx.log_to_file:
        setup_experiment_logging(
            experiment_dir=ctx.experiment_dir, filename="experiment.log", level=ctx.log_level
        )
    filtered_data_path = ctx.experiment_dir / StepFiles.FILTERED_DATA

    if not filtered_data_path.exists():
        raise PrerequisiteStepError(f"Filtered data file not found: {filtered_data_path}")

    # Load the filtered data and make sure every expected column is present
    filtered_data = pd.read_csv(filtered_data_path)
    required_columns = (
        "Temperature",
        "well",
        "value",
        "ligand",
        "protein",
        "buffer",
        "concentration",
        "well_unqcond",
    )
    for col in required_columns:
        if col not in filtered_data.columns:
            raise ValueError(f"Missing required column: {col} in the data.")

    sep = ctx.condition_separator
    averaged_data = _avg_across_replicates(organized_data=filtered_data, sep=sep)

    def _condition_sort_key(unqcond: str) -> tuple:
        # Labels are "<concentration><sep><ligand><sep><protein><sep><buffer>"
        parts = unqcond.split(sep)
        return (
            parts[1],  # ligand
            parts[2],  # protein
            parts[3],  # buffer
            convert_concentration_to_float(parts[0]),  # numeric concentration
        )

    # Group columns by ligand, protein and buffer, then order each group by
    # increasing concentration. Temperature is the index of the pivoted table,
    # not a column, so every column is a condition and none may be skipped.
    sorted_columns = sorted(averaged_data.columns, key=_condition_sort_key)

    # Reorder the condition columns and turn Temperature back into a column
    averaged_data_sorted = averaged_data[sorted_columns].reset_index()

    # Save the averaged data to a CSV file in the experiment directory
    averaged_data_path = ctx.experiment_dir / StepFiles.AVERAGED_DATA
    averaged_data_sorted.to_csv(averaged_data_path, index=False)
    logger.info("Averaged data saved to %s", averaged_data_path)