import scanpy as sc
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
import os, re
# ###################################################### #
# ################ I M P O R T A N T ################# #
# ###################################################### #
# the R conda envoirment to perform limma DEG analysis #
# ###################################################### #
r_conda_env = "condaEnv"adata_hvgs = sc.read_h5ad("../output/1.1-sce-after-contamination-removal-integration-clustering.h5ad")
adata_hvgs.var.index = adata_hvgs.var.index.str.split(".").str[-1]
adata_hvgsAnnData object with n_obs × n_vars = 21588 × 661
obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'selected'
uns: 'X_name', 'donor_colors', 'leiden', 'logcounts.scaled.pca.harmony.neighbors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res_colors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res_colors', 'sequencing_run_colors', 'tissue_colors', 'tissue_origin_colors'
obsm: 'logcounts.scaled.pca', 'logcounts.scaled.pca.harmony', 'logcounts.scaled.pca.harmony.neighbors.umap'
layers: 'logcounts', 'logcounts.scaled'
obsp: 'logcounts.scaled.pca.harmony.neighbors_connectivities', 'logcounts.scaled.pca.harmony.neighbors_distances'
#specifying the column containing the clustering information
clustering_label = "logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res"
adata_hvgs.obs[clustering_label]donor1_fat.AAAGAACCACATACGT-1 0
donor1_fat.AAAGGTAGTCCGGTCA-1 0
donor1_fat.AAAGGTATCTCTCTTC-1 1
donor1_fat.AACCACACAGTGGCTC-1 0
donor1_fat.AACGGGATCTGCTGAA-1 0
..
donor5_recovery.TTTGGTTTCGACCAAT-1 1
donor5_recovery.TTTGTTGGTCCTACGG-1 0
donor5_recovery.TTTGTTGGTCGCAGTC-1 5
donor5_recovery.TTTGTTGGTGGCGTAA-1 3
donor5_recovery.TTTGTTGTCGGCATTA-1 3
Name: logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res, Length: 21588, dtype: category
Categories (11, object): ['0', '1', '2', '3', ..., '7', '8', '9', '10']
# This requires a couple of minutes
# save the scaled log-norm conunts in a temp csv file
adata_hvgs.to_df(layer="logcounts.scaled").T.to_csv("_temp_count_file.csv", sep="\t")
# save the cluster factor in a temp file (csv as well)
adata_hvgs.obs[clustering_label].astype(str).to_csv("_temp_factor.csv", sep="\t")
# save the batch factor in a temp file (csv as well)
adata_hvgs.obs["donor"].astype(str).to_csv("_temp_batch.csv", sep="\t")
# create the limma_results folder
os.system("rm -r limma_results")
os.system("mkdir limma_results")
# run the R script
os.system("conda run -n %s Rscript limma_differential_expression.R"%r_conda_env)
# delete the temp files
os.system("rm _temp_count_file.csv")
os.system("rm _temp_factor.csv")
os.system("rm _temp_batch.csv")topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
0
# compute the average (or median, to you) cluster log-expression
cluster_expression = pd.concat([
pd.Series(
np.array(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].mean(axis=0)).ravel(), # decomment for mean expresssion
#np.median(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].toarray(), axis=0), # decomment for median expresssion
index=adata_hvgs.var.index, name=cluster
)
for cluster in adata_hvgs.obs[clustering_label].cat.categories
], axis=1).stack().rename("AveLogExpr")
cluster_expression.index.names = ["gene", "cluster"]
cluster_expressiongene cluster
ISG15 0 0.133747
1 0.144939
2 0.132254
3 0.200549
4 0.196444
...
RPL10 6 4.439668
7 4.670093
8 4.797745
9 4.825947
10 4.923193
Name: AveLogExpr, Length: 7271, dtype: float64
# reading the results from limma's lmFit:
# F: overall moderated F-statistic
# P.Value, adj.P.Val: from limma's lmFit
# entropy: the Shannon entropy computed, for each gene, on the clusters expresion
limma_overall_results = pd.read_csv("limma_results/linear_model.csv", sep=" ", index_col=0)[["F", "P.Value", "adj.P.Val"]]
limma_overall_results = limma_overall_results.join(
cluster_expression.unstack().apply(stats.entropy, axis=1).rename("entropy")
)
limma_overall_results.index.rename("gene")
limma_overall_results| F | P.Value | adj.P.Val | entropy | |
|---|---|---|---|---|
| RERE | 148.091436 | 0.000000 | 0.000000 | 2.339954 |
| ID3 | 379.143638 | 0.000000 | 0.000000 | 2.285471 |
| RPL11 | 986.400379 | 0.000000 | 0.000000 | 2.370063 |
| CD52 | 112.473669 | 0.000000 | 0.000000 | 0.094504 |
| RPS8 | 868.130539 | 0.000000 | 0.000000 | 2.376817 |
| ... | ... | ... | ... | ... |
| PJVK | 3.139309 | 0.000013 | 0.000013 | 1.939216 |
| ITIH3 | 2.731077 | 0.000151 | 0.000151 | 1.447063 |
| HAMP | 2.182759 | 0.003274 | 0.003284 | 1.763884 |
| CGA | 1.634906 | 0.047511 | 0.047583 | 1.423650 |
| TEX49 | 1.133994 | 0.312922 | 0.312922 | 0.615322 |
661 rows × 4 columns
# reading the results from limma's lmFit, specific to each cluster
# coef: coefficient from the linear model
# AveLogScaleExpr: mean(log(scaled(geneExpression))), (-inf, +inf), zero centered
# F: moderated F-statistic (cluster-specific)
# P.Value adj.P.Val: from lmFit test (cluster-speific)
# AveLogExpr: mean(log(geneExpression)), interval in (0, +inf), not zero-centered
limma_results = pd.concat(
[
pd.read_csv("limma_results/group%s.csv"%i, sep=" ", index_col=0).assign(cluster=i).set_index("cluster", append=True)\
.rename(columns = lambda colname: re.sub("group[0-9]+", "coef", colname)).rename(columns={"AveExpr":"AveLogScaleExpr"})
for i in adata_hvgs.obs[clustering_label].cat.categories.astype(str)
]
)
limma_results.index.names = ["gene", "cluster"]
limma_results = limma_results.join(cluster_expression) #AveLogExpr
limma_results| coef | AveLogScaleExpr | F | P.Value | adj.P.Val | AveLogExpr | ||
|---|---|---|---|---|---|---|---|
| gene | cluster | ||||||
| RPL11 | 0 | 0.621540 | -3.106327e-16 | 1.853735e+03 | 0.000000 | 0.000000 | 3.944744 |
| RPL32 | 0 | 0.577213 | 4.976558e-16 | 1.748737e+03 | 0.000000 | 0.000000 | 4.281347 |
| RPL29 | 0 | 0.644817 | -7.519618e-17 | 1.955046e+03 | 0.000000 | 0.000000 | 3.027835 |
| FNDC3B | 0 | -0.648354 | -9.928134e-18 | 1.635479e+03 | 0.000000 | 0.000000 | 0.698285 |
| RPS23 | 0 | 0.608984 | -1.416617e-16 | 1.894574e+03 | 0.000000 | 0.000000 | 4.055031 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| EGR1 | 10 | 0.005224 | 6.854649e-17 | 7.150909e-04 | 0.978666 | 0.984625 | 0.569705 |
| RELN | 10 | -0.004568 | 2.374638e-17 | 5.534042e-04 | 0.981232 | 0.985706 | 0.804559 |
| FAM25G | 10 | 0.004157 | -3.302949e-18 | 4.351351e-04 | 0.983358 | 0.986342 | 0.000000 |
| TNRC6A | 10 | 0.000611 | 7.032235e-17 | 8.713536e-06 | 0.997645 | 0.999156 | 0.471255 |
| PIEZO2 | 10 | 0.000072 | -7.431829e-17 | 1.348537e-07 | 0.999707 | 0.999707 | 0.425953 |
7271 rows × 6 columns
# compute the cross-entropy for each gene, for eache cluster
# the lower the cross-entropy, the more likely a gene is a marker of a specific cluster
def cross_entropy_loss(y_true, y_prob, epsilon=1e-12):
# Clip predicted probabilities to prevent log(0) and log(1) cases
y_prob = np.clip(y_prob, epsilon, 1.0 - epsilon)
# Compute cross-entropy loss
loss = -np.sum(y_true * np.log(y_prob))
return loss
cluster_cross_entropy = limma_results.AveLogExpr.unstack().apply(
lambda gene_series: pd.Series(
{cluster: cross_entropy_loss(np.eye(len(gene_series))[i], gene_series/gene_series.sum())
for i, cluster in enumerate(gene_series.index)}
), axis=1
).stack().rename("cross_enstropy_loss")
cluster_cross_entropy.index.names = ["gene", "cluster"]
cluster_cross_entropygene cluster
ABCC4 0 3.722040
1 1.881407
10 27.631021
2 2.926712
3 4.145379
...
ZSWIM6 5 1.643516
6 2.215055
7 2.664269
8 2.709228
9 2.296434
Name: cross_enstropy_loss, Length: 7271, dtype: float64
limma_results = limma_results.join(cluster_cross_entropy) #cross_enstropy_loss
limma_results| coef | AveLogScaleExpr | F | P.Value | adj.P.Val | AveLogExpr | cross_enstropy_loss | ||
|---|---|---|---|---|---|---|---|---|
| gene | cluster | |||||||
| RPL11 | 0 | 0.621540 | -3.106327e-16 | 1.853735e+03 | 0.000000 | 0.000000 | 3.944744 | 2.341838 |
| RPL32 | 0 | 0.577213 | 4.976558e-16 | 1.748737e+03 | 0.000000 | 0.000000 | 4.281347 | 2.349773 |
| RPL29 | 0 | 0.644817 | -7.519618e-17 | 1.955046e+03 | 0.000000 | 0.000000 | 3.027835 | 2.339591 |
| FNDC3B | 0 | -0.648354 | -9.928134e-18 | 1.635479e+03 | 0.000000 | 0.000000 | 0.698285 | 2.841968 |
| RPS23 | 0 | 0.608984 | -1.416617e-16 | 1.894574e+03 | 0.000000 | 0.000000 | 4.055031 | 2.339394 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| EGR1 | 10 | 0.005224 | 6.854649e-17 | 7.150909e-04 | 0.978666 | 0.984625 | 0.569705 | 2.462980 |
| RELN | 10 | -0.004568 | 2.374638e-17 | 5.534042e-04 | 0.981232 | 0.985706 | 0.804559 | 2.212195 |
| FAM25G | 10 | 0.004157 | -3.302949e-18 | 4.351351e-04 | 0.983358 | 0.986342 | 0.000000 | 27.631021 |
| TNRC6A | 10 | 0.000611 | 7.032235e-17 | 8.713536e-06 | 0.997645 | 0.999156 | 0.471255 | 2.479182 |
| PIEZO2 | 10 | 0.000072 | -7.431829e-17 | 1.348537e-07 | 0.999707 | 0.999707 | 0.425953 | 2.394570 |
7271 rows × 7 columns
limma_results.to_csv("../output/2.0-clusters_DEG_analysis.csv")cluster_names_dict = {
"0": "4 precoll2",
"1": "1 cap1",
"2": "3 precoll1",
"3": "6 valves",
"4": "2 cap2",
"5": "5 coll",
"7": "7 prolif",
}
reclustering_label = 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
valves_clusters_names_dict = {
"6":"valve 1",
"8":"valve 2"
}cell_types = adata_hvgs.obs.loc[adata_hvgs.obs[clustering_label].isin(cluster_names_dict.keys()), clustering_label].replace(cluster_names_dict)
cell_types = cell_types.to_frame().join(
adata_hvgs.obs.loc[
adata_hvgs.obs[reclustering_label].isin(valves_clusters_names_dict.keys()), reclustering_label
].replace(valves_clusters_names_dict)
)
# ensure that `{valve-1, valve-2}` is a partition of `6 valves`
cell_types.loc[cell_types[clustering_label]!="6 valves", reclustering_label] = np.nan
cell_types| logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res | logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res | |
|---|---|---|
| donor1_fat.AAAGAACCACATACGT-1 | 4 precoll2 | NaN |
| donor1_fat.AAAGGTAGTCCGGTCA-1 | 4 precoll2 | NaN |
| donor1_fat.AAAGGTATCTCTCTTC-1 | 1 cap1 | NaN |
| donor1_fat.AACCACACAGTGGCTC-1 | 4 precoll2 | NaN |
| donor1_fat.AACGGGATCTGCTGAA-1 | 4 precoll2 | NaN |
| ... | ... | ... |
| donor5_recovery.TTTGGTTTCGACCAAT-1 | 1 cap1 | NaN |
| donor5_recovery.TTTGTTGGTCCTACGG-1 | 4 precoll2 | NaN |
| donor5_recovery.TTTGTTGGTCGCAGTC-1 | 5 coll | NaN |
| donor5_recovery.TTTGTTGGTGGCGTAA-1 | 6 valves | valve 2 |
| donor5_recovery.TTTGTTGTCGGCATTA-1 | 6 valves | valve 1 |
21215 rows × 2 columns
adata = sc.read_h5ad("../output/0.2-sce-after-contamination-removal.h5ad")[cell_types.index] # read adata
adata.obs = adata.obs[['Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex']].astype(str)
sc.pp.calculate_qc_metrics(adata, inplace=True) # 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', etc...
adata.obs.loc[adata.obs.donor=="5.0", "sex"] = "M"
adata.obs.loc[adata.obs.donor=="7.0", "sex"] = "F"
adata.obs[["celltype", "valve_subtype"]] = cell_types
adataAnnData object with n_obs × n_vars = 21215 × 18627
obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'celltype', 'valve_subtype'
var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
uns: 'X_name'
layers: 'logcounts'
adata.write_h5ad("../output/2.1-sce-after-contamination-removal-celltypes.h5ad")