import scanpy as sc
import numpy as np
import pandas as pd
from scipy import stats

from matplotlib import pyplot as plt

import os, re

# ###################################################### #
# ################  I M P O R T A N T  ################# #
# ###################################################### #
# the R conda envoirment to perform limma DEG analysis   #
# ###################################################### #
r_conda_env = "condaEnv"
adata_hvgs = sc.read_h5ad("../output/1.1-sce-after-contamination-removal-integration-clustering.h5ad")
adata_hvgs.var.index = adata_hvgs.var.index.str.split(".").str[-1]
adata_hvgs
AnnData object with n_obs × n_vars = 21588 × 661
    obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
    var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'selected'
    uns: 'X_name', 'donor_colors', 'leiden', 'logcounts.scaled.pca.harmony.neighbors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res_colors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res_colors', 'sequencing_run_colors', 'tissue_colors', 'tissue_origin_colors'
    obsm: 'logcounts.scaled.pca', 'logcounts.scaled.pca.harmony', 'logcounts.scaled.pca.harmony.neighbors.umap'
    layers: 'logcounts', 'logcounts.scaled'
    obsp: 'logcounts.scaled.pca.harmony.neighbors_connectivities', 'logcounts.scaled.pca.harmony.neighbors_distances'
#specifying the column containing the clustering information

clustering_label = "logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res"
adata_hvgs.obs[clustering_label]
donor1_fat.AAAGAACCACATACGT-1         0
donor1_fat.AAAGGTAGTCCGGTCA-1         0
donor1_fat.AAAGGTATCTCTCTTC-1         1
donor1_fat.AACCACACAGTGGCTC-1         0
donor1_fat.AACGGGATCTGCTGAA-1         0
                                     ..
donor5_recovery.TTTGGTTTCGACCAAT-1    1
donor5_recovery.TTTGTTGGTCCTACGG-1    0
donor5_recovery.TTTGTTGGTCGCAGTC-1    5
donor5_recovery.TTTGTTGGTGGCGTAA-1    3
donor5_recovery.TTTGTTGTCGGCATTA-1    3
Name: logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res, Length: 21588, dtype: category
Categories (11, object): ['0', '1', '2', '3', ..., '7', '8', '9', '10']
# This requires a couple of minutes

# save the scaled log-norm conunts in a temp csv file
adata_hvgs.to_df(layer="logcounts.scaled").T.to_csv("_temp_count_file.csv", sep="\t")

# save the cluster factor in a temp file (csv as well)
adata_hvgs.obs[clustering_label].astype(str).to_csv("_temp_factor.csv", sep="\t")

# save the batch factor in a temp file (csv as well)
adata_hvgs.obs["donor"].astype(str).to_csv("_temp_batch.csv", sep="\t")

# create the limma_results folder
os.system("rm -r limma_results")
os.system("mkdir limma_results")

# run the R script
os.system("conda run -n %s Rscript limma_differential_expression.R"%r_conda_env)

# delete the temp files
os.system("rm _temp_count_file.csv")
os.system("rm _temp_factor.csv")
os.system("rm _temp_batch.csv")
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
0
# compute the average (or median, to you) cluster log-expression

cluster_expression = pd.concat([
    pd.Series(
        np.array(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].mean(axis=0)).ravel(), # decomment for mean expresssion
        #np.median(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].toarray(), axis=0), # decomment for median expresssion
        index=adata_hvgs.var.index, name=cluster
    )
    for cluster in adata_hvgs.obs[clustering_label].cat.categories
], axis=1).stack().rename("AveLogExpr")
cluster_expression.index.names = ["gene", "cluster"]
cluster_expression
gene   cluster
ISG15  0          0.133747
       1          0.144939
       2          0.132254
       3          0.200549
       4          0.196444
                    ...   
RPL10  6          4.439668
       7          4.670093
       8          4.797745
       9          4.825947
       10         4.923193
Name: AveLogExpr, Length: 7271, dtype: float64
# reading the results from limma's lmFit:
#   F: overall moderated F-statistic
#   P.Value, adj.P.Val: from limma's lmFit
#   entropy: the Shannon entropy computed, for each gene, on the clusters expresion

limma_overall_results = pd.read_csv("limma_results/linear_model.csv", sep=" ", index_col=0)[["F", "P.Value", "adj.P.Val"]]
limma_overall_results = limma_overall_results.join(
    cluster_expression.unstack().apply(stats.entropy, axis=1).rename("entropy")
)
limma_overall_results.index.rename("gene")
limma_overall_results
F P.Value adj.P.Val entropy
RERE 148.091436 0.000000 0.000000 2.339954
ID3 379.143638 0.000000 0.000000 2.285471
RPL11 986.400379 0.000000 0.000000 2.370063
CD52 112.473669 0.000000 0.000000 0.094504
RPS8 868.130539 0.000000 0.000000 2.376817
... ... ... ... ...
PJVK 3.139309 0.000013 0.000013 1.939216
ITIH3 2.731077 0.000151 0.000151 1.447063
HAMP 2.182759 0.003274 0.003284 1.763884
CGA 1.634906 0.047511 0.047583 1.423650
TEX49 1.133994 0.312922 0.312922 0.615322

661 rows × 4 columns

# reading the results from limma's lmFit, specific to each cluster
#   coef: coefficient from the linear model
#   AveLogScaleExpr: mean(log(scaled(geneExpression))), (-inf, +inf), zero centered
#   F: moderated F-statistic (cluster-specific)
#   P.Value adj.P.Val: from lmFit test (cluster-speific)
#   AveLogExpr: mean(log(geneExpression)), interval in (0, +inf), not zero-centered

limma_results = pd.concat(
    [
        pd.read_csv("limma_results/group%s.csv"%i, sep=" ", index_col=0).assign(cluster=i).set_index("cluster", append=True)\
            .rename(columns = lambda colname: re.sub("group[0-9]+", "coef", colname)).rename(columns={"AveExpr":"AveLogScaleExpr"})
        for i in adata_hvgs.obs[clustering_label].cat.categories.astype(str)
    ]
)
limma_results.index.names = ["gene", "cluster"]
limma_results = limma_results.join(cluster_expression) #AveLogExpr
limma_results
coef AveLogScaleExpr F P.Value adj.P.Val AveLogExpr
gene cluster
RPL11 0 0.621540 -3.106327e-16 1.853735e+03 0.000000 0.000000 3.944744
RPL32 0 0.577213 4.976558e-16 1.748737e+03 0.000000 0.000000 4.281347
RPL29 0 0.644817 -7.519618e-17 1.955046e+03 0.000000 0.000000 3.027835
FNDC3B 0 -0.648354 -9.928134e-18 1.635479e+03 0.000000 0.000000 0.698285
RPS23 0 0.608984 -1.416617e-16 1.894574e+03 0.000000 0.000000 4.055031
... ... ... ... ... ... ... ...
EGR1 10 0.005224 6.854649e-17 7.150909e-04 0.978666 0.984625 0.569705
RELN 10 -0.004568 2.374638e-17 5.534042e-04 0.981232 0.985706 0.804559
FAM25G 10 0.004157 -3.302949e-18 4.351351e-04 0.983358 0.986342 0.000000
TNRC6A 10 0.000611 7.032235e-17 8.713536e-06 0.997645 0.999156 0.471255
PIEZO2 10 0.000072 -7.431829e-17 1.348537e-07 0.999707 0.999707 0.425953

7271 rows × 6 columns

# compute the cross-entropy for each gene, for eache cluster
# the lower the cross-entropy, the more likely a gene is a marker of a specific cluster

def cross_entropy_loss(y_true, y_prob, epsilon=1e-12):
    
    # Clip predicted probabilities to prevent log(0) and log(1) cases
    y_prob = np.clip(y_prob, epsilon, 1.0 - epsilon)
    
    # Compute cross-entropy loss
    loss = -np.sum(y_true * np.log(y_prob))
    return loss

cluster_cross_entropy = limma_results.AveLogExpr.unstack().apply(
    lambda gene_series: pd.Series(
        {cluster: cross_entropy_loss(np.eye(len(gene_series))[i], gene_series/gene_series.sum())
         for i, cluster in enumerate(gene_series.index)}
    ), axis=1
).stack().rename("cross_enstropy_loss")
cluster_cross_entropy.index.names = ["gene", "cluster"]
cluster_cross_entropy
gene    cluster
ABCC4   0           3.722040
        1           1.881407
        10         27.631021
        2           2.926712
        3           4.145379
                     ...    
ZSWIM6  5           1.643516
        6           2.215055
        7           2.664269
        8           2.709228
        9           2.296434
Name: cross_enstropy_loss, Length: 7271, dtype: float64
limma_results = limma_results.join(cluster_cross_entropy) #cross_enstropy_loss
limma_results
coef AveLogScaleExpr F P.Value adj.P.Val AveLogExpr cross_enstropy_loss
gene cluster
RPL11 0 0.621540 -3.106327e-16 1.853735e+03 0.000000 0.000000 3.944744 2.341838
RPL32 0 0.577213 4.976558e-16 1.748737e+03 0.000000 0.000000 4.281347 2.349773
RPL29 0 0.644817 -7.519618e-17 1.955046e+03 0.000000 0.000000 3.027835 2.339591
FNDC3B 0 -0.648354 -9.928134e-18 1.635479e+03 0.000000 0.000000 0.698285 2.841968
RPS23 0 0.608984 -1.416617e-16 1.894574e+03 0.000000 0.000000 4.055031 2.339394
... ... ... ... ... ... ... ... ...
EGR1 10 0.005224 6.854649e-17 7.150909e-04 0.978666 0.984625 0.569705 2.462980
RELN 10 -0.004568 2.374638e-17 5.534042e-04 0.981232 0.985706 0.804559 2.212195
FAM25G 10 0.004157 -3.302949e-18 4.351351e-04 0.983358 0.986342 0.000000 27.631021
TNRC6A 10 0.000611 7.032235e-17 8.713536e-06 0.997645 0.999156 0.471255 2.479182
PIEZO2 10 0.000072 -7.431829e-17 1.348537e-07 0.999707 0.999707 0.425953 2.394570

7271 rows × 7 columns

limma_results.to_csv("../output/2.0-clusters_DEG_analysis.csv")
cluster_names_dict = {
    "0": "4 precoll2",
    "1": "1 cap1",
    "2": "3 precoll1",
    "3": "6 valves",
    "4": "2 cap2",
    "5": "5 coll",
    "7": "7 prolif",
}
reclustering_label = 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
valves_clusters_names_dict = {
    "6":"valve 1",
    "8":"valve 2"
}
cell_types = adata_hvgs.obs.loc[adata_hvgs.obs[clustering_label].isin(cluster_names_dict.keys()), clustering_label].replace(cluster_names_dict)

cell_types = cell_types.to_frame().join(
    adata_hvgs.obs.loc[
        adata_hvgs.obs[reclustering_label].isin(valves_clusters_names_dict.keys()), reclustering_label
    ].replace(valves_clusters_names_dict)
)

# ensure that `{valve-1, valve-2}` is a partition of `6 valves`
cell_types.loc[cell_types[clustering_label]!="6 valves", reclustering_label] = np.nan

cell_types
logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res
donor1_fat.AAAGAACCACATACGT-1 4 precoll2 NaN
donor1_fat.AAAGGTAGTCCGGTCA-1 4 precoll2 NaN
donor1_fat.AAAGGTATCTCTCTTC-1 1 cap1 NaN
donor1_fat.AACCACACAGTGGCTC-1 4 precoll2 NaN
donor1_fat.AACGGGATCTGCTGAA-1 4 precoll2 NaN
... ... ...
donor5_recovery.TTTGGTTTCGACCAAT-1 1 cap1 NaN
donor5_recovery.TTTGTTGGTCCTACGG-1 4 precoll2 NaN
donor5_recovery.TTTGTTGGTCGCAGTC-1 5 coll NaN
donor5_recovery.TTTGTTGGTGGCGTAA-1 6 valves valve 2
donor5_recovery.TTTGTTGTCGGCATTA-1 6 valves valve 1

21215 rows × 2 columns

adata = sc.read_h5ad("../output/0.2-sce-after-contamination-removal.h5ad")[cell_types.index] # read adata
adata.obs = adata.obs[['Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex']].astype(str)
sc.pp.calculate_qc_metrics(adata, inplace=True) # 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', etc...
adata.obs.loc[adata.obs.donor=="5.0", "sex"] = "M"
adata.obs.loc[adata.obs.donor=="7.0", "sex"] = "F"
adata.obs[["celltype", "valve_subtype"]] = cell_types
adata
AnnData object with n_obs × n_vars = 21215 × 18627
    obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'celltype', 'valve_subtype'
    var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'X_name'
    layers: 'logcounts'
adata.write_h5ad("../output/2.1-sce-after-contamination-removal-celltypes.h5ad")