Coverage for src/instawell/processing/step_04_subtract_background.py: 98%

47 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-07 15:47 -0600

1import logging 

2 

3import pandas as pd 

4 

5from instawell.core.exp_context import ExperimentContext 

6from instawell.core.steps import StepFiles 

7from instawell.utils.logging_util import setup_experiment_logging 

8 

9# Module-level logger for this step (no log level is set on this line) 

10logger = logging.getLogger(__name__) 

11 

12 

13def find_background_column( 

14 data: pd.DataFrame, 

15 concentration: str, 

16 ligand: str, 

17 protein: str, 

18 buffer: str, 

19 sep: str = "|", 

20 non_protein_control_marker: str = "NPC", 

21) -> str | None: 

22 """Helper function to find the background column in the data.""" 

23 if protein == non_protein_control_marker: 

24 return None # Dont remove background for NPC 

25 for col in data.columns: 

26 if f"{concentration}{sep}{ligand}{sep}{non_protein_control_marker}{sep}{buffer}" in col: 

27 return col 

28 return None 

29 

30 

def subtract_background(ctx: ExperimentContext) -> None:
    """Subtract the matching background column from every condition column.

    Reads ``StepFiles.AVERAGED_DATA`` from ``ctx.experiment_dir``. For each
    column whose name splits (on ``ctx.condition_separator``) into at least
    concentration, ligand, protein and buffer fields, the background column
    is looked up with ``find_background_column`` and subtracted in place.
    A background column is named
    ``<concentration><sep><ligand><sep><marker><sep><buffer>`` where ``sep``
    is ``ctx.condition_separator`` and ``marker`` is
    ``ctx.non_protein_control_marker``.

    Columns whose name contains the control marker are dropped from the
    result, which is written to ``StepFiles.BG_SUB_DATA`` in the same
    directory. Any non-"Temperature" column that was neither corrected nor
    used as a background is reported with a warning.

    Args:
        ctx: Experiment context supplying the experiment directory, the
            condition separator, the field list, the control marker and the
            logging options.

    Raises:
        FileNotFoundError: If the averaged data file does not exist.
    """
    if ctx.log_to_file:
        setup_experiment_logging(
            experiment_dir=ctx.experiment_dir, filename="experiment.log", level=ctx.log_level
        )

    # Load the averaged data produced by the previous step.
    averaged_data_path = ctx.experiment_dir / StepFiles.AVERAGED_DATA
    if not averaged_data_path.exists():
        raise FileNotFoundError(f"Averaged data file not found: {averaged_data_path}")
    data = pd.read_csv(averaged_data_path)

    # Track which columns took part in a subtraction, either as the corrected
    # column or as the background.
    processed = {col: False for col in data.columns if col != "Temperature"}

    sep = ctx.condition_separator
    marker = ctx.non_protein_control_marker
    # Four positional fields are unpacked below, so require at least four
    # parts even if ctx.fields declares fewer; otherwise indexing would raise.
    min_parts = max(len(ctx.fields), 4)

    for col in data.columns:
        parts = col.split(sep)
        if len(parts) < min_parts:
            continue
        # Field order is fixed: concentration, ligand, protein, buffer.
        concentration, ligand, protein, buffer = parts[:4]

        background_col = find_background_column(
            data,
            concentration,
            ligand,
            protein,
            buffer,
            sep,
            marker,
        )
        if not background_col:
            continue

        data[col] = data[col] - data[background_col]
        processed[background_col] = True
        processed[col] = True
        logger.info(
            "Background subtracted for %s using %s as background.", col, background_col
        )

    # Report every column that was neither corrected nor used as a background.
    for col, marked in processed.items():
        if not marked:
            logger.warning(
                "Warning: Column %s was not processed for background subtraction. "
                "Check if the background column exists.",
                col,
            )

    # Remove the background columns. regex=False makes the marker match
    # literally instead of being interpreted as a regular expression.
    data = data.loc[:, ~data.columns.str.contains(marker, regex=False)]

    # Save the data with background subtracted.
    background_subtracted_path = ctx.experiment_dir / StepFiles.BG_SUB_DATA
    data.to_csv(background_subtracted_path, index=False)
    logger.info(f"Background subtracted data saved to {background_subtracted_path}")