Examine the quality of the guides and samples, and mask the low-quality guides and samples.
import perturb_tools as pt
import bean as be
import matplotlib.pyplot as plt
# Use matplotlib's default style for every figure in this notebook.
plt.style.use('default')
# IPython shell escape: `jt -r` resets the Jupyter CSS and font defaults
# (this is what the command reports in its output).
! jt -r
Reset css and font defaults in: /PHShome/jr1025/.jupyter/custom & /PHShome/jr1025/.local/share/jupyter/nbextensions
# Default parameter values. Several of them are reassigned by the
# "# Parameters" section that follows.
plt.style.use("default")

# Prefix and file extension of the figure saved with plt.savefig.
exp_id = "LDLRCDS"
save_format = "png"

# Input screen object and destination of the masked screen object.
bdata_path = "../results/mapped/LDLRCDS/bean_count_LDLRCDS_combined.h5ad"
out_bdata_path = "../results/mapped/LDLRCDS/bean_count_LDLRCDS_masked.h5ad"

# Position window handed to the edit-matrix and edit-rate calls.
edit_quantification_start_pos = 2
edit_quantification_end_pos = 7

# Sample QC cutoffs: compared against median_corr_X, median_editing_rate and
# median_lfc_corr.top_bot respectively.
corr_X_thres = 0.7
edit_rate_thres = 0.1
lfc_thres = -0.1

# Names of the replicate and condition columns in bdata.samples.
replicate_label = "rep"
condition_label = "bin"
# Parameters
# These assignments override the default values set earlier in the file.
# NOTE(review): exp_id and replicate_label are not overridden here, so the
# saved gini figure keeps the "LDLRCDS" prefix although the paths below point
# to LDLvar data — confirm this is intended.
bdata_path = "results/mapped/LDLvar/bean_count_LDLvar_combined.h5ad"
out_bdata_path = "results/filtered_annotated/LDLvar/bean_count_LDLvar_masked.h5ad"
edit_quantification_start_pos = 2
edit_quantification_end_pos = 7
corr_X_thres = 0.8
edit_rate_thres = 0.1
lfc_thres = -0.1
condition_label = "bin"
# Load the screen object that the rest of the notebook inspects and masks.
bdata = be.read_h5ad(bdata_path)
Annotate unannotated samples & log-normalize guides
# Derive each sample's replicate and condition from its name, which is split
# on "_" and is expected to yield two fields (e.g. "rep5_top" -> "rep5", "top").
# The replicate column is named via `replicate_label` instead of a literal so
# that it always matches the column the later per-replicate group-by reads.
bdata.samples[[replicate_label, condition_label]] = (
    bdata.samples.index.to_series().str.split("_", expand=True)
)
# Log-normalize the guide counts before plotting QC figures.
bdata.log_norm()
pt.qc.plot_guide_coverage(bdata, figsize=(6, 4))
<AxesSubplot: title={'center': 'Guide coverage'}, xlabel='counts', ylabel='# guides'>
plt.style.use("default")

# Draw the gini QC figure and save it as "<exp_id>_gini.<save_format>".
pt.qc.plot_X_gini(bdata)
gini_fig_path = f"{exp_id}_gini.{save_format}"
plt.savefig(gini_fig_path)

# Draw the Spearman correlation figure across samples.
pt.qc.plot_correlation(bdata, "Spearman")
<AxesSubplot: title={'center': 'corr_X'}>
# Boolean selector for the positive-control guides, built once and shared by
# both calls below.
is_pos_ctrl = bdata.guides.Group == "PosCtrl"

# Per-replicate top-vs-bot log fold change of the positive controls. The
# replicate column is taken from `replicate_label` rather than a hard-coded
# "rep" so it stays consistent with the rest of the notebook.
lfcs = bdata[is_pos_ctrl, :].log_fold_change_reps(
    cond1="top",
    cond2="bot",
    rep_col=replicate_label,
    compare_col=condition_label,
)

# Plot how well the top/bot LFCs correlate between replicates (Spearman).
ax = pt.qc.plot_lfc_correlation(
    bdata,
    is_pos_ctrl,
    method="Spearman",
    cond1="top",
    cond2="bot",
    rep_col=replicate_label,
    compare_col=condition_label,
    figsize=(10, 10),
)
ax.set_title("top/bot LFC correlation, Spearman")
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()
# Drop allele-count rows whose allele renders as an empty string.
allele_counts = bdata.uns["allele_counts"]
has_allele = allele_counts.allele.map(str) != ""
bdata.uns["allele_counts"] = allele_counts.loc[has_allele]

# Convert alleles to edits, then build the edit matrix over the configured
# position window.
bdata.get_edit_from_allele()
bdata.get_edit_mat_from_uns(
    rel_pos_start=edit_quantification_start_pos,
    rel_pos_end=edit_quantification_end_pos,
    rel_pos_is_reporter=False,
)

# Per-guide editing rate over the same window, followed by its QC plot.
bdata.get_guide_edit_rate(
    editable_base_start=edit_quantification_start_pos,
    editable_base_end=edit_quantification_end_pos,
)
be.qc.plot_guide_edit_rates(bdata)
New edit matrix saved in .layers['edits']. Returning old edits.
<AxesSubplot: xlabel='Editing rate', ylabel='Count'>
# Editing rate over the configured position window, followed by the
# per-sample QC plot.
edit_window = {
    "editable_base_start": edit_quantification_start_pos,
    "editable_base_end": edit_quantification_end_pos,
}
bdata.get_edit_rate(**edit_window)
be.qc.plot_sample_edit_rates(bdata)
<AxesSubplot: xlabel='Editing rate', ylabel='Density'>
# Display the per-sample table with a reversed "coolwarm" background gradient
# so the QC metrics can be compared by eye before masking.
bdata.samples.style.background_gradient(cmap="coolwarm_r")
bin | rep | gini_X | median_corr_X | median_lfc_corr.top_bot | median_editing_rate | |
---|---|---|---|---|---|---|
rep5_top | top | rep5 | 0.493510 | 0.848909 | 0.186755 | 0.339147 |
rep5_high | high | rep5 | 0.488701 | 0.866966 | 0.186755 | 0.333333 |
rep5_bulk | bulk | rep5 | 0.486657 | 0.850945 | 0.186755 | 0.304582 |
rep5_low | low | rep5 | 0.486369 | 0.864541 | 0.186755 | 0.322271 |
rep5_bot | bot | rep5 | 0.489634 | 0.852060 | 0.186755 | 0.333333 |
rep6_top | top | rep6 | 0.546780 | 0.796851 | 0.159803 | 0.319277 |
rep6_high | high | rep6 | 0.536605 | 0.790187 | 0.159803 | 0.329041 |
rep6_bulk | bulk | rep6 | 0.545218 | 0.789376 | 0.159803 | 0.288397 |
rep6_low | low | rep6 | 0.562569 | 0.788251 | 0.159803 | 0.308665 |
rep6_bot | bot | rep6 | 0.547677 | 0.789220 | 0.159803 | 0.303136 |
rep7_top | top | rep7 | 0.496987 | 0.866755 | -0.113122 | 0.320755 |
rep7_high | high | rep7 | 0.556694 | 0.681675 | -0.113122 | 0.125000 |
rep7_bulk | bulk | rep7 | 0.469606 | 0.858363 | -0.113122 | 0.300892 |
rep7_low | low | rep7 | 0.482696 | 0.877669 | -0.113122 | 0.306667 |
rep7_bot | bot | rep7 | 0.507931 | 0.855484 | -0.113122 | 0.307692 |
rep8_top | top | rep8 | 0.502173 | 0.854318 | -0.132698 | 0.302076 |
rep8_high | high | rep8 | 0.499085 | 0.873216 | -0.132698 | 0.308123 |
rep8_bulk | bulk | rep8 | 0.492796 | 0.881288 | -0.132698 | 0.311688 |
rep8_low | low | rep8 | 0.488354 | 0.881742 | -0.132698 | 0.303226 |
rep8_bot | bot | rep8 | 0.493498 | 0.871812 | -0.132698 | 0.310811 |
rep9_top | top | rep9 | 0.584971 | 0.627554 | -0.034763 | 0.000000 |
rep9_high | high | rep9 | 0.493625 | 0.866789 | -0.034763 | 0.306250 |
rep9_bulk | bulk | rep9 | 0.489102 | 0.874705 | -0.034763 | 0.319297 |
rep9_low | low | rep9 | 0.499184 | 0.878257 | -0.034763 | 0.296296 |
rep9_bot | bot | rep9 | 0.498359 | 0.863626 | -0.034763 | 0.303030 |
rep10_top | top | rep10 | 0.538189 | 0.829731 | 0.196945 | 0.333333 |
rep10_high | high | rep10 | 0.532495 | 0.831738 | 0.196945 | 0.333333 |
rep10_bulk | bulk | rep10 | 0.511525 | 0.858363 | 0.196945 | 0.352381 |
rep10_low | low | rep10 | 0.494003 | 0.845543 | 0.196945 | 0.356194 |
rep10_bot | bot | rep10 | 0.502412 | 0.857996 | 0.196945 | 0.350669 |
rep11_top | top | rep11 | 0.564059 | 0.809491 | 0.243655 | 0.317779 |
rep11_high | high | rep11 | 0.530025 | 0.825476 | 0.243655 | 0.328125 |
rep11_bulk | bulk | rep11 | 0.525725 | 0.820250 | 0.243655 | 0.316832 |
rep11_low | low | rep11 | 0.547771 | 0.803748 | 0.243655 | 0.327273 |
rep11_bot | bot | rep11 | 0.533811 | 0.839518 | 0.243655 | 0.349020 |
rep12_top | top | rep12 | 0.547334 | 0.806682 | 0.207608 | 0.308124 |
rep12_high | high | rep12 | 0.550476 | 0.815312 | 0.207608 | 0.326087 |
rep12_bulk | bulk | rep12 | 0.549551 | 0.816621 | 0.207608 | 0.308304 |
rep12_low | low | rep12 | 0.549946 | 0.808375 | 0.207608 | 0.319383 |
rep12_bot | bot | rep12 | 0.525709 | 0.840556 | 0.207608 | 0.330357 |
rep13_top | top | rep13 | 0.553004 | 0.795911 | 0.163633 | 0.326295 |
rep13_high | high | rep13 | 0.570676 | 0.810649 | 0.163633 | 0.333333 |
rep13_bulk | bulk | rep13 | 0.538722 | 0.826259 | 0.163633 | 0.333333 |
rep13_low | low | rep13 | 0.546158 | 0.819364 | 0.163633 | 0.326309 |
rep13_bot | bot | rep13 | 0.531331 | 0.834095 | 0.163633 | 0.345865 |
rep14_top | top | rep14 | 0.550545 | 0.854848 | 0.196976 | 0.319149 |
rep14_high | high | rep14 | 0.519600 | 0.860293 | 0.196976 | 0.301894 |
rep14_bulk | bulk | rep14 | 0.524859 | 0.868545 | 0.196976 | 0.336406 |
rep14_low | low | rep14 | 0.524069 | 0.866804 | 0.196976 | 0.326157 |
rep14_bot | bot | rep14 | 0.534318 | 0.839321 | 0.196976 | 0.333333 |
rep15_top | top | rep15 | 0.528447 | 0.865505 | 0.246431 | 0.327418 |
rep15_high | high | rep15 | 0.545241 | 0.824235 | 0.246431 | 0.333333 |
rep15_bulk | bulk | rep15 | 0.526414 | 0.845968 | 0.246431 | 0.341711 |
rep15_low | low | rep15 | 0.547233 | 0.834643 | 0.246431 | 0.307692 |
rep15_bot | bot | rep15 | 0.514986 | 0.858354 | 0.246431 | 0.333333 |
Assign a sample mask that flags low-quality samples.
# Flag samples that fail the correlation or editing-rate cutoff: mask is 1 for
# samples that pass both and 0 otherwise.
low_corr = bdata.samples["median_corr_X"] < corr_X_thres
low_edit_rate = bdata.samples["median_editing_rate"] < edit_rate_thres
bdata.samples["mask"] = 1
bdata.samples.loc[low_corr | low_edit_rate, "mask"] = 0

# Unlike the two cutoffs above, the LFC-correlation cutoff removes the samples
# from the screen instead of masking them.
passes_lfc = bdata.samples["median_lfc_corr.top_bot"] > lfc_thres
bdata = bdata[:, passes_lfc]

# Keep only replicates that still have at least 2 unmasked samples.
rep_n_samples = bdata.samples.groupby(replicate_label)["mask"].sum()
print(rep_n_samples)
rep_has_too_small_sample = rep_n_samples.index[rep_n_samples < 2].tolist()
print(f"Excluding reps {rep_has_too_small_sample} that has less than 2 samples per replicate.")
in_excluded_rep = bdata.samples[replicate_label].isin(rep_has_too_small_sample)
bdata = bdata[:, ~in_excluded_rep]
rep rep10 5 rep11 5 rep12 5 rep13 4 rep14 5 rep15 5 rep5 5 rep6 0 rep9 4 Name: mask, dtype: int64 Excluding reps ['rep6'] that has less than 2 samples per replicate.
# Display the remaining samples, now including the `mask` column, with the
# same reversed "coolwarm" background gradient.
bdata.samples.style.background_gradient(cmap="coolwarm_r")
bin | rep | gini_X | median_corr_X | median_lfc_corr.top_bot | median_editing_rate | mask | |
---|---|---|---|---|---|---|---|
rep5_top | top | rep5 | 0.493510 | 0.848909 | 0.186755 | 0.339147 | 1 |
rep5_high | high | rep5 | 0.488701 | 0.866966 | 0.186755 | 0.333333 | 1 |
rep5_bulk | bulk | rep5 | 0.486657 | 0.850945 | 0.186755 | 0.304582 | 1 |
rep5_low | low | rep5 | 0.486369 | 0.864541 | 0.186755 | 0.322271 | 1 |
rep5_bot | bot | rep5 | 0.489634 | 0.852060 | 0.186755 | 0.333333 | 1 |
rep9_top | top | rep9 | 0.584971 | 0.627554 | -0.034763 | 0.000000 | 0 |
rep9_high | high | rep9 | 0.493625 | 0.866789 | -0.034763 | 0.306250 | 1 |
rep9_bulk | bulk | rep9 | 0.489102 | 0.874705 | -0.034763 | 0.319297 | 1 |
rep9_low | low | rep9 | 0.499184 | 0.878257 | -0.034763 | 0.296296 | 1 |
rep9_bot | bot | rep9 | 0.498359 | 0.863626 | -0.034763 | 0.303030 | 1 |
rep10_top | top | rep10 | 0.538189 | 0.829731 | 0.196945 | 0.333333 | 1 |
rep10_high | high | rep10 | 0.532495 | 0.831738 | 0.196945 | 0.333333 | 1 |
rep10_bulk | bulk | rep10 | 0.511525 | 0.858363 | 0.196945 | 0.352381 | 1 |
rep10_low | low | rep10 | 0.494003 | 0.845543 | 0.196945 | 0.356194 | 1 |
rep10_bot | bot | rep10 | 0.502412 | 0.857996 | 0.196945 | 0.350669 | 1 |
rep11_top | top | rep11 | 0.564059 | 0.809491 | 0.243655 | 0.317779 | 1 |
rep11_high | high | rep11 | 0.530025 | 0.825476 | 0.243655 | 0.328125 | 1 |
rep11_bulk | bulk | rep11 | 0.525725 | 0.820250 | 0.243655 | 0.316832 | 1 |
rep11_low | low | rep11 | 0.547771 | 0.803748 | 0.243655 | 0.327273 | 1 |
rep11_bot | bot | rep11 | 0.533811 | 0.839518 | 0.243655 | 0.349020 | 1 |
rep12_top | top | rep12 | 0.547334 | 0.806682 | 0.207608 | 0.308124 | 1 |
rep12_high | high | rep12 | 0.550476 | 0.815312 | 0.207608 | 0.326087 | 1 |
rep12_bulk | bulk | rep12 | 0.549551 | 0.816621 | 0.207608 | 0.308304 | 1 |
rep12_low | low | rep12 | 0.549946 | 0.808375 | 0.207608 | 0.319383 | 1 |
rep12_bot | bot | rep12 | 0.525709 | 0.840556 | 0.207608 | 0.330357 | 1 |
rep13_top | top | rep13 | 0.553004 | 0.795911 | 0.163633 | 0.326295 | 0 |
rep13_high | high | rep13 | 0.570676 | 0.810649 | 0.163633 | 0.333333 | 1 |
rep13_bulk | bulk | rep13 | 0.538722 | 0.826259 | 0.163633 | 0.333333 | 1 |
rep13_low | low | rep13 | 0.546158 | 0.819364 | 0.163633 | 0.326309 | 1 |
rep13_bot | bot | rep13 | 0.531331 | 0.834095 | 0.163633 | 0.345865 | 1 |
rep14_top | top | rep14 | 0.550545 | 0.854848 | 0.196976 | 0.319149 | 1 |
rep14_high | high | rep14 | 0.519600 | 0.860293 | 0.196976 | 0.301894 | 1 |
rep14_bulk | bulk | rep14 | 0.524859 | 0.868545 | 0.196976 | 0.336406 | 1 |
rep14_low | low | rep14 | 0.524069 | 0.866804 | 0.196976 | 0.326157 | 1 |
rep14_bot | bot | rep14 | 0.534318 | 0.839321 | 0.196976 | 0.333333 | 1 |
rep15_top | top | rep15 | 0.528447 | 0.865505 | 0.246431 | 0.327418 | 1 |
rep15_high | high | rep15 | 0.545241 | 0.824235 | 0.246431 | 0.333333 | 1 |
rep15_bulk | bulk | rep15 | 0.526414 | 0.845968 | 0.246431 | 0.341711 | 1 |
rep15_low | low | rep15 | 0.547233 | 0.834643 | 0.246431 | 0.307692 | 1 |
rep15_bot | bot | rep15 | 0.514986 | 0.858354 | 0.246431 | 0.333333 | 1 |
# Collect outlier guides and the accompanying mask. The replicate column is
# passed as `replicate_label` rather than a hard-coded "rep" so this call uses
# the same column as the per-replicate filtering.
outlier_guides, mask = be.qc.get_outlier_guides_and_mask(
    bdata,
    condit_col=condition_label,
    replicate_col=replicate_label,
)
outlier_guides
name | sample | RPM | rep | |
---|---|---|---|---|
0 | CONTROL_1_g3 | rep9_top | 43769.902344 | rep9 |
1 | LDLR_SA_1_g1 | rep9_top | 25234.220703 | rep9 |
2 | LDLR_SA_2_g1 | rep9_top | 54505.914062 | rep9 |
3 | LDLR_SA_2_g2 | rep9_top | 14498.206055 | rep9 |
4 | LDLR_SA_3_g2 | rep9_top | 19820.332031 | rep9 |
5 | LDLR_SA_4_g1 | rep9_top | 33400.929688 | rep9 |
6 | LDLR_SA_4_g2 | rep9_top | 28629.369141 | rep9 |
7 | LDLR_SA_4_g4 | rep9_top | 39457.144531 | rep9 |
8 | LDLR_SA_5_g2 | rep9_top | 36429.035156 | rep9 |
9 | LDLR_SA_6_g2 | rep9_top | 13305.315430 | rep9 |
10 | LDLR_SA_8_g1 | rep9_top | 20921.460938 | rep9 |
11 | LDLR_SA_8_g4 | rep9_top | 26060.066406 | rep9 |
12 | LDLR_SD_10_g2 | rep9_top | 31657.474609 | rep9 |
13 | LDLR_SD_10_g4 | rep9_top | 10460.730469 | rep9 |
14 | LDLR_SD_11_g4 | rep9_top | 23031.960938 | rep9 |
15 | LDLR_SD_15_g1 | rep9_top | 16241.661133 | rep9 |
16 | LDLR_SD_15_g2 | rep9_top | 11194.817383 | rep9 |
17 | LDLR_SD_16_g1 | rep9_top | 23215.482422 | rep9 |
18 | LDLR_SD_9_g4 | rep9_top | 28078.804688 | rep9 |
19 | rs2301249_Maj_ABE_145_g1 | rep11_top | 214290.734375 | rep11 |
20 | rs34468565_Maj_ABE_170_g4 | rep11_top | 205705.796875 | rep11 |
21 | rs3750944_Min_ABE_471_g2 | rep14_top | 359395.593750 | rep14 |
22 | rs2301249_Maj_ABE_145_g1 | rep5_high | 42865.296875 | rep5 |
23 | rs34468565_Maj_ABE_170_g4 | rep5_high | 38354.488281 | rep5 |
24 | CONTROL_2_g5 | rep12_high | 194362.250000 | rep12 |
25 | CONTROL_2_g5 | rep13_high | 311175.718750 | rep13 |
26 | rs3750944_Min_ABE_471_g2 | rep13_high | 298567.593750 | rep13 |
27 | rs12610605_Min_ABE_395_g5 | rep15_high | 15965.768555 | rep15 |
28 | rs2301249_Maj_ABE_145_g1 | rep5_bulk | 18126.062500 | rep5 |
29 | rs34468565_Maj_ABE_170_g4 | rep5_bulk | 36164.839844 | rep5 |
30 | rs625219_Min_ABE_535_g5 | rep5_bulk | 11696.111328 | rep5 |
31 | CONTROL_2_g5 | rep12_bulk | 93601.929688 | rep12 |
32 | rs3750944_Min_ABE_471_g2 | rep12_bulk | 277782.375000 | rep12 |
33 | CONTROL_2_g5 | rep11_low | 73574.234375 | rep11 |
34 | rs2301249_Maj_ABE_145_g1 | rep5_bot | 27445.576172 | rep5 |
35 | rs34468565_Maj_ABE_170_g4 | rep5_bot | 23435.796875 | rep5 |
36 | rs3750944_Min_ABE_471_g2 | rep11_bot | 86463.968750 | rep11 |
37 | LDLR_SD_15_g2 | rep13_bot | 18775.865234 | rep13 |
# Count in how many samples each guide was reported as an outlier, and select
# the guides reported in more than 2 samples for removal.
outlier_guides_n_samples = outlier_guides["name"].value_counts()
is_recurrent_outlier = outlier_guides_n_samples > 2
guides_to_exclude = outlier_guides_n_samples[is_recurrent_outlier].index
guides_to_exclude
Index(['CONTROL_2_g5', 'rs3750944_Min_ABE_471_g2', 'rs34468565_Maj_ABE_170_g4', 'rs2301249_Maj_ABE_145_g1'], dtype='object')
# Store the mask on the screen object, then drop the guides selected for
# exclusion.
bdata.uns["repguide_mask"] = mask
keep_guide = ~bdata.guides.index.isin(guides_to_exclude)
bdata = bdata[keep_guide, :]
bdata
Genome Editing Screen comprised of n_guides x n_conditions = 3451 x 40 guides: 'Unnamed: 0', 'Target gene/variant', 'Target descriptor', 'Arbitrary number', 'gRNA position category', 'Target base position in gRNA', 'Target base position in reporter', 'BE', 'Group', 'sequence', 'Reporter', 'barcode', '5-nt PAM', 'offset', 'target', 'target_pos', 'Group2', 'masked_sequence', 'masked_barcode', 'edit_rate' samples: 'bin', 'rep', 'gini_X', 'median_corr_X', 'median_lfc_corr.top_bot', 'median_editing_rate', 'mask' samples_m: samples_p: layers: 'X_bcmatch', 'edits', 'lognorm_counts', 'edit_rate', 'X_RPM' uns: 'allele_counts', 'edit_counts', 'target_base_change', 'tiling', 'lfc', 'lfc_corr', 'repguide_mask'
# Sanity check: show the dimensions of the stored mask after the guide
# exclusion.
bdata.uns['repguide_mask'].shape
(3451, 8)
# Write the filtered and masked screen object to the configured output path.
bdata.write(out_bdata_path)
... storing 'bin' as categorical ... storing 'rep' as categorical