Coverage for src/instawell/processing/step_04_subtract_background.py: 98%
47 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 15:47 -0600
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-07 15:47 -0600
1import logging
3import pandas as pd
5from instawell.core.exp_context import ExperimentContext
6from instawell.core.steps import StepFiles
7from instawell.utils.logging_util import setup_experiment_logging
# Module-level logger (no logging level is set here)
10logger = logging.getLogger(__name__)
13def find_background_column(
14 data: pd.DataFrame,
15 concentration: str,
16 ligand: str,
17 protein: str,
18 buffer: str,
19 sep: str = "|",
20 non_protein_control_marker: str = "NPC",
21) -> str | None:
22 """Helper function to find the background column in the data."""
23 if protein == non_protein_control_marker:
24 return None # Dont remove background for NPC
25 for col in data.columns:
26 if f"{concentration}{sep}{ligand}{sep}{non_protein_control_marker}{sep}{buffer}" in col:
27 return col
28 return None
def subtract_background(ctx: ExperimentContext) -> None:
    """Subtract the matching background (NPC) column from each condition column.

    Loads ``StepFiles.AVERAGED_DATA`` from ``ctx.experiment_dir``, looks up a
    background column for every condition column with
    ``find_background_column``, subtracts it, drops every column whose name
    contains ``ctx.non_protein_control_marker`` and writes the result to
    ``StepFiles.BG_SUB_DATA`` in the same directory.

    Column names are split on ``ctx.condition_separator`` and their first four
    parts are read as concentration, ligand, protein and buffer, so a
    background column looks like ``concentration<sep>ligand<sep>NPC<sep>buffer``.

    Args:
        ctx: Experiment context providing ``experiment_dir``, ``fields``,
            ``condition_separator``, ``non_protein_control_marker``,
            ``log_to_file`` and ``log_level``.

    Raises:
        FileNotFoundError: If the averaged data file does not exist.
    """
    if ctx.log_to_file:
        setup_experiment_logging(
            experiment_dir=ctx.experiment_dir, filename="experiment.log", level=ctx.log_level
        )

    # Load the averaged data
    averaged_data_path = ctx.experiment_dir / StepFiles.AVERAGED_DATA
    if not averaged_data_path.exists():
        raise FileNotFoundError(f"Averaged data file not found: {averaged_data_path}")
    data = pd.read_csv(averaged_data_path)

    # Track which columns took part in a subtraction (as signal or as background)
    processed = {col: False for col in data.columns if col != "Temperature"}

    # The first four parts are read positionally below, so require at least
    # four even when ctx.fields declares fewer; shorter names are skipped.
    min_parts = max(len(ctx.fields), 4)
    for col in data.columns:
        parts = col.split(ctx.condition_separator)
        if len(parts) < min_parts:
            continue
        concentration, ligand, protein, buffer = parts[:4]

        background_col = find_background_column(
            data,
            concentration,
            ligand,
            protein,
            buffer,
            ctx.condition_separator,
            ctx.non_protein_control_marker,
        )
        if background_col is None:
            continue

        data[col] = data[col] - data[background_col]
        processed[background_col] = True
        processed[col] = True
        logger.info(
            "Background subtracted for %s using %s as background.", col, background_col
        )

    # Report every column that was neither corrected nor used as a background
    for col, marked in processed.items():
        if not marked:
            logger.warning(
                "Warning: Column %s was not processed for background subtraction. "
                "Check if the background column exists.",
                col,
            )

    # Remove the background columns; the marker is matched literally, not as a regex
    is_background = data.columns.str.contains(ctx.non_protein_control_marker, regex=False)
    data = data.loc[:, ~is_background]

    # Save the data with background subtracted
    background_subtracted_path = ctx.experiment_dir / StepFiles.BG_SUB_DATA
    data.to_csv(background_subtracted_path, index=False)
    logger.info("Background subtracted data saved to %s", background_subtracted_path)