ReporterScreen sample / guide quality report¶

Examine the quality of the guides and samples, and mask the low-quality guides and samples.

In [1]:
# perturb_tools (pt) and bean (be) provide the screen object and the QC helpers
# called throughout this report.
import perturb_tools as pt
import bean as be
import matplotlib.pyplot as plt
# Use matplotlib's "default" style sheet for the figures in this report.
plt.style.use('default')
In [2]:
# Shell escape that runs the `jt` command-line tool with `-r`. Per the recorded
# output, this resets CSS and font defaults in the user's Jupyter config directories.
# NOTE(review): this modifies settings outside the notebook — confirm it belongs in a report.
! jt -r
Reset css and font defaults in:
/PHShome/jr1025/.jupyter/custom &
/PHShome/jr1025/.local/share/jupyter/nbextensions
In [3]:
# Apply matplotlib's "default" style sheet (the same call the import cell already makes).
plt.style.use('default')
In [4]:
# Default run settings. The "# Parameters" cell that follows reassigns the paths,
# the quantification window, the thresholds and condition_label; exp_id,
# save_format and replicate_label are only assigned here.
exp_id = "LDLRCDS"  # prefix of the saved Gini figure's file name
save_format = "png"  # file extension of the saved Gini figure
bdata_path = "../results/mapped/LDLRCDS/bean_count_LDLRCDS_combined.h5ad"  # input .h5ad read with be.read_h5ad
out_bdata_path = "../results/mapped/LDLRCDS/bean_count_LDLRCDS_masked.h5ad"  # destination of the final bdata.write
edit_quantification_start_pos = 2  # start of the window passed to the edit matrix / edit-rate calls
edit_quantification_end_pos = 7  # end of that window
corr_X_thres=0.7  # samples with median_corr_X below this get mask = 0
edit_rate_thres=0.1  # samples with median_editing_rate below this get mask = 0
lfc_thres=-0.1  # samples are kept only if median_lfc_corr.top_bot is greater than this
replicate_label="rep"  # samples column used to group samples by replicate
condition_label="bin"  # samples column holding the condition (sorting bin) of each sample
In [5]:
# Parameters
# Run-specific values that replace the defaults assigned in the previous cell
# (looks like a papermill-style injected parameters cell — TODO confirm).
# exp_id, save_format and replicate_label are NOT reassigned here, so they keep
# the values from the previous cell.
bdata_path = "results/mapped/LDLvar/bean_count_LDLvar_combined.h5ad"
out_bdata_path = "results/filtered_annotated/LDLvar/bean_count_LDLvar_masked.h5ad"
edit_quantification_start_pos = 2
edit_quantification_end_pos = 7
corr_X_thres = 0.8
edit_rate_thres = 0.1
lfc_thres = -0.1
condition_label = "bin"
In [6]:
# Load the screen stored at bdata_path. Every later cell reads or reassigns `bdata`.
bdata = be.read_h5ad(bdata_path)

Annotate unannotated samples & log-normalize guides

In [7]:
# Sample names look like "<replicate>_<condition>" (e.g. "rep5_top"). Split them on "_"
# and store the two parts as sample annotations. The replicate column is named with
# replicate_label (not a literal "rep") so that it exists under the name the later
# replicate-level filtering groups by.
sample_name_parts = bdata.samples.index.to_series().str.split("_", expand=True)
bdata.samples[[replicate_label, condition_label]] = sample_name_parts
In [8]:
# Log-normalize the guide counts (presumably the source of the 'lognorm_counts'
# layer listed in the screen summary near the end — confirm in perturb_tools).
bdata.log_norm()

Sample quality¶

Visualize quality metrics¶

1. Guide coverage¶

In [9]:
# Guide coverage plot; the returned Axes is labelled "counts" vs. "# guides".
pt.qc.plot_guide_coverage(bdata, figsize=(6,4))
Out[9]:
<AxesSubplot: title={'center': 'Guide coverage'}, xlabel='counts', ylabel='# guides'>
In [10]:
# Draw the Gini plot for the screen and save the current figure under a relative
# file name built from exp_id and save_format.
# NOTE(review): plot_X_gini is presumably what fills samples.gini_X — confirm.
plt.style.use('default')
pt.qc.plot_X_gini(bdata)
gini_figure_file = f"{exp_id}_gini.{save_format}"
plt.savefig(gini_figure_file)

2. Guide abundance correlation¶

In [11]:
# Spearman correlation plot (the returned Axes is titled 'corr_X').
# NOTE(review): presumably this also fills samples.median_corr_X, which the sample
# mask relies on later — confirm in perturb_tools.
pt.qc.plot_correlation(bdata, "Spearman")
Out[11]:
<AxesSubplot: title={'center': 'corr_X'}>

3. LFC correlation of positive controls¶

In [12]:
# top-vs-bot log fold changes per replicate, restricted to guides whose Group is
# "PosCtrl". The replicate column is taken from replicate_label instead of a
# hard-coded "rep" so it stays consistent with the configured label.
# NOTE(review): `lfcs` is not read again in the cells visible here.
lfcs = bdata[bdata.guides.Group == "PosCtrl", :].log_fold_change_reps(
    cond1="top",
    cond2="bot",
    rep_col=replicate_label,
    compare_col=condition_label,
)
In [13]:
# Spearman correlation of top-vs-bot LFCs for the positive-control guides.
# The replicate column is taken from replicate_label instead of a hard-coded "rep".
# NOTE(review): presumably this fills samples["median_lfc_corr.top_bot"], which the
# sample filter relies on later — confirm in perturb_tools.
ax = pt.qc.plot_lfc_correlation(
    bdata,
    bdata.guides.Group == "PosCtrl",
    method="Spearman",
    cond1="top",
    cond2="bot",
    rep_col=replicate_label,
    compare_col=condition_label,
    figsize=(10, 10),
)

ax.set_title("top/bot LFC correlation, Spearman")
# Keep the y tick labels horizontal and turn the x tick labels vertical.
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

4. Guide editing rates¶

In [14]:
# Keep only the allele-count rows whose allele, rendered with str(), is non-empty.
bdata.uns['allele_counts'] = bdata.uns['allele_counts'].loc[
    lambda counts: counts.allele.map(str) != ""
]
In [15]:
# Derive edits from the allele counts, build the edit matrix, compute per-guide
# editing rates, then plot their distribution. All three position arguments use
# the edit_quantification_* window from the parameters.
bdata.get_edit_from_allele()
# Per the message printed by this cell, the new edit matrix is saved in .layers['edits'].
bdata.get_edit_mat_from_uns(
    rel_pos_start=edit_quantification_start_pos, 
    rel_pos_end=edit_quantification_end_pos, 
    rel_pos_is_reporter=False
)
# NOTE(review): presumably fills guides.edit_rate (listed in the screen summary) — confirm.
bdata.get_guide_edit_rate(
    editable_base_start = edit_quantification_start_pos, 
    editable_base_end=edit_quantification_end_pos)
# The returned Axes plots "Count" against "Editing rate".
be.qc.plot_guide_edit_rates(bdata)
New edit matrix saved in .layers['edits']. Returning old edits.
Out[15]:
<AxesSubplot: xlabel='Editing rate', ylabel='Count'>
In [16]:
# Compute editing rates over the same position window, then plot them per sample.
# NOTE(review): presumably the source of the 'edit_rate' layer and of
# samples.median_editing_rate used by the sample mask — confirm in bean.
bdata.get_edit_rate(
    editable_base_start = edit_quantification_start_pos, 
    editable_base_end=edit_quantification_end_pos
)
# The returned Axes plots "Density" against "Editing rate".
be.qc.plot_sample_edit_rates(bdata)
Out[16]:
<AxesSubplot: xlabel='Editing rate', ylabel='Density'>

Mask low-quality samples¶

In [17]:
# Show the full per-sample annotation table, colouring the numeric cells with the
# reversed "coolwarm" colormap.
bdata.samples.style.background_gradient(cmap="coolwarm_r")
Out[17]:
  bin rep gini_X median_corr_X median_lfc_corr.top_bot median_editing_rate
rep5_top top rep5 0.493510 0.848909 0.186755 0.339147
rep5_high high rep5 0.488701 0.866966 0.186755 0.333333
rep5_bulk bulk rep5 0.486657 0.850945 0.186755 0.304582
rep5_low low rep5 0.486369 0.864541 0.186755 0.322271
rep5_bot bot rep5 0.489634 0.852060 0.186755 0.333333
rep6_top top rep6 0.546780 0.796851 0.159803 0.319277
rep6_high high rep6 0.536605 0.790187 0.159803 0.329041
rep6_bulk bulk rep6 0.545218 0.789376 0.159803 0.288397
rep6_low low rep6 0.562569 0.788251 0.159803 0.308665
rep6_bot bot rep6 0.547677 0.789220 0.159803 0.303136
rep7_top top rep7 0.496987 0.866755 -0.113122 0.320755
rep7_high high rep7 0.556694 0.681675 -0.113122 0.125000
rep7_bulk bulk rep7 0.469606 0.858363 -0.113122 0.300892
rep7_low low rep7 0.482696 0.877669 -0.113122 0.306667
rep7_bot bot rep7 0.507931 0.855484 -0.113122 0.307692
rep8_top top rep8 0.502173 0.854318 -0.132698 0.302076
rep8_high high rep8 0.499085 0.873216 -0.132698 0.308123
rep8_bulk bulk rep8 0.492796 0.881288 -0.132698 0.311688
rep8_low low rep8 0.488354 0.881742 -0.132698 0.303226
rep8_bot bot rep8 0.493498 0.871812 -0.132698 0.310811
rep9_top top rep9 0.584971 0.627554 -0.034763 0.000000
rep9_high high rep9 0.493625 0.866789 -0.034763 0.306250
rep9_bulk bulk rep9 0.489102 0.874705 -0.034763 0.319297
rep9_low low rep9 0.499184 0.878257 -0.034763 0.296296
rep9_bot bot rep9 0.498359 0.863626 -0.034763 0.303030
rep10_top top rep10 0.538189 0.829731 0.196945 0.333333
rep10_high high rep10 0.532495 0.831738 0.196945 0.333333
rep10_bulk bulk rep10 0.511525 0.858363 0.196945 0.352381
rep10_low low rep10 0.494003 0.845543 0.196945 0.356194
rep10_bot bot rep10 0.502412 0.857996 0.196945 0.350669
rep11_top top rep11 0.564059 0.809491 0.243655 0.317779
rep11_high high rep11 0.530025 0.825476 0.243655 0.328125
rep11_bulk bulk rep11 0.525725 0.820250 0.243655 0.316832
rep11_low low rep11 0.547771 0.803748 0.243655 0.327273
rep11_bot bot rep11 0.533811 0.839518 0.243655 0.349020
rep12_top top rep12 0.547334 0.806682 0.207608 0.308124
rep12_high high rep12 0.550476 0.815312 0.207608 0.326087
rep12_bulk bulk rep12 0.549551 0.816621 0.207608 0.308304
rep12_low low rep12 0.549946 0.808375 0.207608 0.319383
rep12_bot bot rep12 0.525709 0.840556 0.207608 0.330357
rep13_top top rep13 0.553004 0.795911 0.163633 0.326295
rep13_high high rep13 0.570676 0.810649 0.163633 0.333333
rep13_bulk bulk rep13 0.538722 0.826259 0.163633 0.333333
rep13_low low rep13 0.546158 0.819364 0.163633 0.326309
rep13_bot bot rep13 0.531331 0.834095 0.163633 0.345865
rep14_top top rep14 0.550545 0.854848 0.196976 0.319149
rep14_high high rep14 0.519600 0.860293 0.196976 0.301894
rep14_bulk bulk rep14 0.524859 0.868545 0.196976 0.336406
rep14_low low rep14 0.524069 0.866804 0.196976 0.326157
rep14_bot bot rep14 0.534318 0.839321 0.196976 0.333333
rep15_top top rep15 0.528447 0.865505 0.246431 0.327418
rep15_high high rep15 0.545241 0.824235 0.246431 0.333333
rep15_bulk bulk rep15 0.526414 0.845968 0.246431 0.341711
rep15_low low rep15 0.547233 0.834643 0.246431 0.307692
rep15_bot bot rep15 0.514986 0.858354 0.246431 0.333333

Assign a sample mask that flags low-quality samples (mask = 0), and remove the samples whose top/bot LFC correlation does not exceed `lfc_thres`.

In [18]:
# Flag samples that fail either per-sample threshold: mask = 1 means "use",
# mask = 0 means "low quality".
has_low_corr = bdata.samples.median_corr_X < corr_X_thres
has_low_edit_rate = bdata.samples.median_editing_rate < edit_rate_thres
bdata.samples['mask'] = 1
bdata.samples.loc[has_low_corr | has_low_edit_rate, 'mask'] = 0
# Unlike the two checks above, the LFC-correlation check removes samples from the
# screen instead of masking them.
passes_lfc_corr = bdata.samples["median_lfc_corr.top_bot"] > lfc_thres
bdata = bdata[:, passes_lfc_corr]
In [19]:
# Keep only the replicates that still have at least `min_samples_per_rep` unmasked
# samples. The 'mask' column is 0/1, so its per-replicate sum is the number of
# unmasked samples in that replicate.
min_samples_per_rep = 2
rep_n_samples = bdata.samples.groupby(replicate_label)['mask'].sum()
print(rep_n_samples)
rep_has_too_small_sample = rep_n_samples.loc[rep_n_samples < min_samples_per_rep].index.tolist()
print(f"Excluding reps {rep_has_too_small_sample} that has less than {min_samples_per_rep} samples per replicate.")
# Drop every sample that belongs to an excluded replicate.
bdata = bdata[:, ~bdata.samples[replicate_label].isin(rep_has_too_small_sample)]
rep
rep10    5
rep11    5
rep12    5
rep13    4
rep14    5
rep15    5
rep5     5
rep6     0
rep9     4
Name: mask, dtype: int64
Excluding reps ['rep6'] that has less than 2 samples per replicate.
In [20]:
# Show the per-sample table again after filtering; it now includes the 'mask' column.
bdata.samples.style.background_gradient(cmap="coolwarm_r")
Out[20]:
  bin rep gini_X median_corr_X median_lfc_corr.top_bot median_editing_rate mask
rep5_top top rep5 0.493510 0.848909 0.186755 0.339147 1
rep5_high high rep5 0.488701 0.866966 0.186755 0.333333 1
rep5_bulk bulk rep5 0.486657 0.850945 0.186755 0.304582 1
rep5_low low rep5 0.486369 0.864541 0.186755 0.322271 1
rep5_bot bot rep5 0.489634 0.852060 0.186755 0.333333 1
rep9_top top rep9 0.584971 0.627554 -0.034763 0.000000 0
rep9_high high rep9 0.493625 0.866789 -0.034763 0.306250 1
rep9_bulk bulk rep9 0.489102 0.874705 -0.034763 0.319297 1
rep9_low low rep9 0.499184 0.878257 -0.034763 0.296296 1
rep9_bot bot rep9 0.498359 0.863626 -0.034763 0.303030 1
rep10_top top rep10 0.538189 0.829731 0.196945 0.333333 1
rep10_high high rep10 0.532495 0.831738 0.196945 0.333333 1
rep10_bulk bulk rep10 0.511525 0.858363 0.196945 0.352381 1
rep10_low low rep10 0.494003 0.845543 0.196945 0.356194 1
rep10_bot bot rep10 0.502412 0.857996 0.196945 0.350669 1
rep11_top top rep11 0.564059 0.809491 0.243655 0.317779 1
rep11_high high rep11 0.530025 0.825476 0.243655 0.328125 1
rep11_bulk bulk rep11 0.525725 0.820250 0.243655 0.316832 1
rep11_low low rep11 0.547771 0.803748 0.243655 0.327273 1
rep11_bot bot rep11 0.533811 0.839518 0.243655 0.349020 1
rep12_top top rep12 0.547334 0.806682 0.207608 0.308124 1
rep12_high high rep12 0.550476 0.815312 0.207608 0.326087 1
rep12_bulk bulk rep12 0.549551 0.816621 0.207608 0.308304 1
rep12_low low rep12 0.549946 0.808375 0.207608 0.319383 1
rep12_bot bot rep12 0.525709 0.840556 0.207608 0.330357 1
rep13_top top rep13 0.553004 0.795911 0.163633 0.326295 0
rep13_high high rep13 0.570676 0.810649 0.163633 0.333333 1
rep13_bulk bulk rep13 0.538722 0.826259 0.163633 0.333333 1
rep13_low low rep13 0.546158 0.819364 0.163633 0.326309 1
rep13_bot bot rep13 0.531331 0.834095 0.163633 0.345865 1
rep14_top top rep14 0.550545 0.854848 0.196976 0.319149 1
rep14_high high rep14 0.519600 0.860293 0.196976 0.301894 1
rep14_bulk bulk rep14 0.524859 0.868545 0.196976 0.336406 1
rep14_low low rep14 0.524069 0.866804 0.196976 0.326157 1
rep14_bot bot rep14 0.534318 0.839321 0.196976 0.333333 1
rep15_top top rep15 0.528447 0.865505 0.246431 0.327418 1
rep15_high high rep15 0.545241 0.824235 0.246431 0.333333 1
rep15_bulk bulk rep15 0.526414 0.845968 0.246431 0.341711 1
rep15_low low rep15 0.547233 0.834643 0.246431 0.307692 1
rep15_bot bot rep15 0.514986 0.858354 0.246431 0.333333 1
In [ ]:
 

Identify outlier guides¶

In [21]:
# Detect outlier guides. Returns a table of outlier (guide, sample) pairs and a mask
# that a later cell stores in bdata.uns['repguide_mask'].
# The replicate column is taken from replicate_label instead of a hard-coded "rep".
outlier_guides, mask = be.qc.get_outlier_guides_and_mask(
    bdata,
    condit_col=condition_label,
    replicate_col=replicate_label,
)
In [22]:
# Display the outlier table: one row per outlier, with the guide name, the sample,
# its RPM and the replicate.
outlier_guides
Out[22]:
name sample RPM rep
0 CONTROL_1_g3 rep9_top 43769.902344 rep9
1 LDLR_SA_1_g1 rep9_top 25234.220703 rep9
2 LDLR_SA_2_g1 rep9_top 54505.914062 rep9
3 LDLR_SA_2_g2 rep9_top 14498.206055 rep9
4 LDLR_SA_3_g2 rep9_top 19820.332031 rep9
5 LDLR_SA_4_g1 rep9_top 33400.929688 rep9
6 LDLR_SA_4_g2 rep9_top 28629.369141 rep9
7 LDLR_SA_4_g4 rep9_top 39457.144531 rep9
8 LDLR_SA_5_g2 rep9_top 36429.035156 rep9
9 LDLR_SA_6_g2 rep9_top 13305.315430 rep9
10 LDLR_SA_8_g1 rep9_top 20921.460938 rep9
11 LDLR_SA_8_g4 rep9_top 26060.066406 rep9
12 LDLR_SD_10_g2 rep9_top 31657.474609 rep9
13 LDLR_SD_10_g4 rep9_top 10460.730469 rep9
14 LDLR_SD_11_g4 rep9_top 23031.960938 rep9
15 LDLR_SD_15_g1 rep9_top 16241.661133 rep9
16 LDLR_SD_15_g2 rep9_top 11194.817383 rep9
17 LDLR_SD_16_g1 rep9_top 23215.482422 rep9
18 LDLR_SD_9_g4 rep9_top 28078.804688 rep9
19 rs2301249_Maj_ABE_145_g1 rep11_top 214290.734375 rep11
20 rs34468565_Maj_ABE_170_g4 rep11_top 205705.796875 rep11
21 rs3750944_Min_ABE_471_g2 rep14_top 359395.593750 rep14
22 rs2301249_Maj_ABE_145_g1 rep5_high 42865.296875 rep5
23 rs34468565_Maj_ABE_170_g4 rep5_high 38354.488281 rep5
24 CONTROL_2_g5 rep12_high 194362.250000 rep12
25 CONTROL_2_g5 rep13_high 311175.718750 rep13
26 rs3750944_Min_ABE_471_g2 rep13_high 298567.593750 rep13
27 rs12610605_Min_ABE_395_g5 rep15_high 15965.768555 rep15
28 rs2301249_Maj_ABE_145_g1 rep5_bulk 18126.062500 rep5
29 rs34468565_Maj_ABE_170_g4 rep5_bulk 36164.839844 rep5
30 rs625219_Min_ABE_535_g5 rep5_bulk 11696.111328 rep5
31 CONTROL_2_g5 rep12_bulk 93601.929688 rep12
32 rs3750944_Min_ABE_471_g2 rep12_bulk 277782.375000 rep12
33 CONTROL_2_g5 rep11_low 73574.234375 rep11
34 rs2301249_Maj_ABE_145_g1 rep5_bot 27445.576172 rep5
35 rs34468565_Maj_ABE_170_g4 rep5_bot 23435.796875 rep5
36 rs3750944_Min_ABE_471_g2 rep11_bot 86463.968750 rep11
37 LDLR_SD_15_g2 rep13_bot 18775.865234 rep13
In [23]:
# Count how many samples each guide is an outlier in, and select the guides that
# are outliers in more than 2 samples.
outlier_guides_n_samples = outlier_guides['name'].value_counts()
is_recurrent_outlier = outlier_guides_n_samples.gt(2)
guides_to_exclude = outlier_guides_n_samples[is_recurrent_outlier].index
guides_to_exclude
Out[23]:
Index(['CONTROL_2_g5', 'rs3750944_Min_ABE_471_g2', 'rs34468565_Maj_ABE_170_g4',
       'rs2301249_Maj_ABE_145_g1'],
      dtype='object')
In [24]:
# Store the mask returned by the outlier detection in the screen's unstructured
# annotations so that it is part of what gets written out with the screen.
bdata.uns['repguide_mask'] = mask
In [25]:
# Remove the recurrent outlier guides from the screen (rows are guides).
is_kept_guide = ~bdata.guides.index.isin(guides_to_exclude)
bdata = bdata[is_kept_guide, :]
In [26]:
# Display the summary of the filtered screen (dimensions, annotations, layers, uns keys).
bdata
Out[26]:
Genome Editing Screen comprised of n_guides x n_conditions = 3451 x 40
   guides:    'Unnamed: 0', 'Target gene/variant', 'Target descriptor', 'Arbitrary number', 'gRNA position category', 'Target base position in gRNA', 'Target base position in reporter', 'BE', 'Group', 'sequence', 'Reporter', 'barcode', '5-nt PAM', 'offset', 'target', 'target_pos', 'Group2', 'masked_sequence', 'masked_barcode', 'edit_rate'
   samples:   'bin', 'rep', 'gini_X', 'median_corr_X', 'median_lfc_corr.top_bot', 'median_editing_rate', 'mask'
   samples_m: 
   samples_p: 
   layers:    'X_bcmatch', 'edits', 'lognorm_counts', 'edit_rate', 'X_RPM'
   uns:       'allele_counts', 'edit_counts', 'target_base_change', 'tiling', 'lfc', 'lfc_corr', 'repguide_mask'
In [27]:
# Sanity check: in the recorded run, the mask's first dimension equals the number
# of guides remaining in the screen.
bdata.uns['repguide_mask'].shape
Out[27]:
(3451, 8)
In [28]:
# Write the filtered, masked screen to out_bdata_path.
bdata.write(out_bdata_path)
... storing 'bin' as categorical
... storing 'rep' as categorical
In [ ]: