Coverage for src/instawell/processing/step_03_average_replicates.py: 94%
34 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 15:47 -0600
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 15:47 -0600
1import logging
3import pandas as pd
5from instawell.core.exceptions import PrerequisiteStepError
6from instawell.core.exp_context import ExperimentContext
7from instawell.core.steps import StepFiles
8from instawell.utils.logging_util import setup_experiment_logging
9from instawell.utils.utils import convert_concentration_to_float
11# module-level logger (no logging level is set here)
12logger = logging.getLogger(__name__)
def _avg_across_replicates(
    organized_data: pd.DataFrame,
    sep: str = "|",
) -> pd.DataFrame:
    """
    Average ``value`` across replicates of the same condition and temperature.

    Replicates are rows that share the same ``Temperature`` and the same
    unique-condition key, built as
    ``concentration + sep + ligand + sep + protein + sep + buffer``.

    Args:
        organized_data: Long-format data with the columns ``Temperature``,
            ``value``, ``concentration``, ``ligand``, ``protein`` and
            ``buffer``. The four key columns are joined with ``+``, so they
            are expected to hold strings. The frame is left unmodified.
        sep: Separator placed between the four fields of the key.

    Returns:
        A wide DataFrame indexed by ``Temperature`` with one column per
        unique-condition key, holding the mean ``value`` of each group.
    """
    unique_condition = (
        organized_data["concentration"]
        + sep
        + organized_data["ligand"]
        + sep
        + organized_data["protein"]
        + sep
        + organized_data["buffer"]
    )

    # Attach the key to a narrow copy instead of assigning it onto the
    # argument, so the caller's DataFrame does not gain an "unqcond" column.
    keyed = organized_data[["Temperature", "value"]].assign(unqcond=unique_condition)

    averaged_data = (
        keyed.groupby(["Temperature", "unqcond"]).agg({"value": "mean"}).reset_index()
    )

    # Long -> wide: one row per temperature, one column per unique condition.
    return averaged_data.pivot(index="Temperature", columns="unqcond", values="value")
def average_accross_replicates(ctx: ExperimentContext) -> None:
    """
    Average the filtered data across replicates and save the result as CSV.

    Reads ``StepFiles.FILTERED_DATA`` from the experiment directory, averages
    ``value`` per temperature and unique condition, orders the condition
    columns by ligand, protein and buffer (and by increasing concentration
    within each of those groups), and writes the resulting table to
    ``StepFiles.AVERAGED_DATA`` in the same directory.

    Args:
        ctx: Experiment context. This function reads ``experiment_dir``,
            ``condition_separator``, ``log_to_file`` and ``log_level``.

    Raises:
        PrerequisiteStepError: If the filtered data file does not exist.
        ValueError: If the filtered data lacks one of the required columns.
    """
    if ctx.log_to_file:
        setup_experiment_logging(
            experiment_dir=ctx.experiment_dir, filename="experiment.log", level=ctx.log_level
        )

    filtered_data_path = ctx.experiment_dir / StepFiles.FILTERED_DATA
    if not filtered_data_path.exists():
        raise PrerequisiteStepError(f"Filtered data file not found: {filtered_data_path}")

    # Load the filtered data
    filtered_data = pd.read_csv(filtered_data_path)
    required_columns = [
        "Temperature",
        "well",
        "value",
        "ligand",
        "protein",
        "buffer",
        "concentration",
        "well_unqcond",
    ]
    for col in required_columns:
        if col not in filtered_data.columns:
            raise ValueError(f"Missing required column: {col} in the data.")

    sep = ctx.condition_separator
    averaged_data = _avg_across_replicates(organized_data=filtered_data, sep=sep)

    def _condition_sort_key(unqcond: str) -> tuple:
        # Key layout is concentration<sep>ligand<sep>protein<sep>buffer.
        parts = unqcond.split(sep)
        return (
            parts[1],  # ligand
            parts[2],  # protein
            parts[3],  # buffer
            convert_concentration_to_float(parts[0]),  # numeric concentration
        )

    # Temperature is the index of the pivoted table, not its first column, so
    # slicing with columns[1:] would silently drop the first condition.
    # Exclude it by name instead, which keeps every condition column.
    condition_columns = [c for c in averaged_data.columns if c != "Temperature"]
    sorted_columns = sorted(condition_columns, key=_condition_sort_key)

    # Reorder the condition columns, then turn the Temperature index back into
    # a regular leading column for the CSV output.
    averaged_data_sorted = averaged_data[sorted_columns].reset_index()

    # Save the averaged data to a CSV file in the experiment directory
    averaged_data_path = ctx.experiment_dir / StepFiles.AVERAGED_DATA
    averaged_data_sorted.to_csv(averaged_data_path, index=False)
    logger.info("Averaged data saved to %s", averaged_data_path)