Gene Set Analysis Example#
This example demonstrates how to use UpSet plots to visualize gene set intersections, a common use case in bioinformatics. We’ll use simulated data representing genes involved in different biological pathways.
Setup#
First, let’s import our libraries and create some sample data:
import altair_upset as au
import pandas as pd
import numpy as np
# Simulate gene set data
np.random.seed(42)
n_genes = 2000
# Define pathways and their approximate sizes
pathways = {
'Cell_Cycle': 0.15, # 15% of genes
'DNA_Repair': 0.10,
'Apoptosis': 0.12,
'Immune_Response': 0.20,
'Metabolism': 0.25,
'Signal_Transduction': 0.30
}
# Create data with realistic overlaps
data = pd.DataFrame()
for pathway, prob in pathways.items():
# Add some correlation between related pathways
if pathway == 'Cell_Cycle':
data[pathway] = np.random.choice([0, 1], size=n_genes, p=[1-prob, prob])
elif pathway == 'DNA_Repair':
# DNA repair genes are more likely to be involved in cell cycle
p_repair = np.where(data['Cell_Cycle'] == 1, 0.3, 0.05)
p_repair = np.clip(p_repair, 0, 1) # Ensure probabilities are valid
data[pathway] = np.random.binomial(1, p_repair)
elif pathway == 'Apoptosis':
# Apoptosis genes might be involved in cell cycle and DNA repair
p_apoptosis = 0.05 + 0.15 * data['Cell_Cycle'] + 0.1 * data['DNA_Repair']
p_apoptosis = np.clip(p_apoptosis, 0, 1) # Ensure probabilities are valid
data[pathway] = np.random.binomial(1, p_apoptosis)
else:
data[pathway] = np.random.choice([0, 1], size=n_genes, p=[1-prob, prob])
Basic UpSet Plot#
Create a basic UpSet plot showing all pathway intersections:
au.UpSetAltair(
data=data,
sets=data.columns.tolist(),
sort_by="frequency",
sort_order="descending",
title="Gene Set Intersections",
subtitle="Distribution of genes across pathways",
glyph_size=100, # Ensure positive size
set_label_bg_size=500, # Ensure positive size
line_connection_size=2,
).chart
Focused DNA Repair Analysis#
Create a focused view of DNA repair pathways:
dna_repair_pathways = ['DNA_Repair', 'Cell_Cycle', 'Apoptosis']
au.UpSetAltair(
data=data[dna_repair_pathways],
sets=dna_repair_pathways,
sort_by="frequency",
sort_order="descending",
title="DNA Repair Pathway Intersections",
subtitle="Focused analysis of DNA repair mechanisms",
glyph_size=100, # Ensure positive size
set_label_bg_size=500, # Ensure positive size
line_connection_size=2,
).chart
Analysis Results#
Let’s analyze the pathway intersections in detail:
Single Pathway Analysis#
print("\nGenes unique to each pathway:")
for pathway in pathways:
unique_genes = data[data[pathway] == 1][
data.drop(columns=[pathway]).sum(axis=1) == 0
]
print(
f"{pathway}: {len(unique_genes)} genes ({len(unique_genes)/n_genes*100:.1f}%)"
)
Multi-Pathway Analysis#
# Multi-pathway genes
multi_pathway = data[data.sum(axis=1) > 1]
print(
f"\nGenes involved in multiple pathways: {len(multi_pathway)} ({len(multi_pathway)/n_genes*100:.1f}%)"
)
# Most common pathway combination
def get_pathway_combination(row):
return " & ".join(data.columns[row == 1])
most_common = (
data.groupby(data.columns.tolist()).size().sort_values(ascending=False).head(1)
)
combination = get_pathway_combination(
pd.Series(most_common.index[0], index=data.columns)
)
print(f"\nMost common pathway combination: {combination}")
print(
f"Number of genes: {most_common.values[0]} ({most_common.values[0]/n_genes*100:.1f}%)"
)
DNA Repair Pathway Analysis#
dna_repair_genes = data[data["DNA_Repair"] == 1]
print(f"\nDNA Repair Pathway Analysis:")
print(
f"Total DNA repair genes: {len(dna_repair_genes)} ({len(dna_repair_genes)/n_genes*100:.1f}%)"
)
print("Co-occurrence with other pathways:")
for pathway in pathways:
if pathway != "DNA_Repair":
co_occurrence = data[(data["DNA_Repair"] == 1) & (data[pathway] == 1)]
print(
f"{pathway}: {len(co_occurrence)} genes ({len(co_occurrence)/len(dna_repair_genes)*100:.1f}%)"
)