lec_flow_push – clusters

import scanpy as sc
import numpy as np
import pandas as pd
from scipy import stats

from matplotlib import pyplot as plt

import os, re

# ###################################################### #
# ################  I M P O R T A N T  ################# #
# ###################################################### #
# the R conda envoirment to perform limma DEG analysis   #
# ###################################################### #
r_conda_env = "condaEnv"

adata_hvgs = sc.read_h5ad("../output/1.1-sce-after-contamination-removal-integration-clustering.h5ad")
adata_hvgs.var.index = adata_hvgs.var.index.str.split(".").str[-1]
adata_hvgs

AnnData object with n_obs × n_vars = 21588 × 661
    obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
    var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'selected'
    uns: 'X_name', 'donor_colors', 'leiden', 'logcounts.scaled.pca.harmony.neighbors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res_colors', 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res_colors', 'sequencing_run_colors', 'tissue_colors', 'tissue_origin_colors'
    obsm: 'logcounts.scaled.pca', 'logcounts.scaled.pca.harmony', 'logcounts.scaled.pca.harmony.neighbors.umap'
    layers: 'logcounts', 'logcounts.scaled'
    obsp: 'logcounts.scaled.pca.harmony.neighbors_connectivities', 'logcounts.scaled.pca.harmony.neighbors_distances'

#specifying the column containing the clustering information

clustering_label = "logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res"
adata_hvgs.obs[clustering_label]

donor1_fat.AAAGAACCACATACGT-1         0
donor1_fat.AAAGGTAGTCCGGTCA-1         0
donor1_fat.AAAGGTATCTCTCTTC-1         1
donor1_fat.AACCACACAGTGGCTC-1         0
donor1_fat.AACGGGATCTGCTGAA-1         0
                                     ..
donor5_recovery.TTTGGTTTCGACCAAT-1    1
donor5_recovery.TTTGTTGGTCCTACGG-1    0
donor5_recovery.TTTGTTGGTCGCAGTC-1    5
donor5_recovery.TTTGTTGGTGGCGTAA-1    3
donor5_recovery.TTTGTTGTCGGCATTA-1    3
Name: logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res, Length: 21588, dtype: category
Categories (11, object): ['0', '1', '2', '3', ..., '7', '8', '9', '10']

# This requires a couple of minutes

# save the scaled log-norm conunts in a temp csv file
adata_hvgs.to_df(layer="logcounts.scaled").T.to_csv("_temp_count_file.csv", sep="\t")

# save the cluster factor in a temp file (csv as well)
adata_hvgs.obs[clustering_label].astype(str).to_csv("_temp_factor.csv", sep="\t")

# save the batch factor in a temp file (csv as well)
adata_hvgs.obs["donor"].astype(str).to_csv("_temp_batch.csv", sep="\t")

# create the limma_results folder
os.system("rm -r limma_results")
os.system("mkdir limma_results")

# run the R script
os.system("conda run -n %s Rscript limma_differential_expression.R"%r_conda_env)

# delete the temp files
os.system("rm _temp_count_file.csv")
os.system("rm _temp_factor.csv")
os.system("rm _temp_batch.csv")

topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.
topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

# compute the average (or median, to you) cluster log-expression

cluster_expression = pd.concat([
    pd.Series(
        np.array(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].mean(axis=0)).ravel(), # decomment for mean expresssion
        #np.median(adata_hvgs[adata_hvgs.obs[clustering_label]==cluster].layers["logcounts"].toarray(), axis=0), # decomment for median expresssion
        index=adata_hvgs.var.index, name=cluster
    )
    for cluster in adata_hvgs.obs[clustering_label].cat.categories
], axis=1).stack().rename("AveLogExpr")
cluster_expression.index.names = ["gene", "cluster"]
cluster_expression

gene   cluster
ISG15  0          0.133747
       1          0.144939
       2          0.132254
       3          0.200549
       4          0.196444
                    ...   
RPL10  6          4.439668
       7          4.670093
       8          4.797745
       9          4.825947
       10         4.923193
Name: AveLogExpr, Length: 7271, dtype: float64

# reading the results from limma's lmFit:
#   F: overall moderated F-statistic
#   P.Value, adj.P.Val: from limma's lmFit
#   entropy: the Shannon entropy computed, for each gene, on the clusters expresion

limma_overall_results = pd.read_csv("limma_results/linear_model.csv", sep=" ", index_col=0)[["F", "P.Value", "adj.P.Val"]]
limma_overall_results = limma_overall_results.join(
    cluster_expression.unstack().apply(stats.entropy, axis=1).rename("entropy")
)
limma_overall_results.index.rename("gene")
limma_overall_results

	F	P.Value	adj.P.Val	entropy
RERE	148.091436	0.000000	0.000000	2.339954
ID3	379.143638	0.000000	0.000000	2.285471
RPL11	986.400379	0.000000	0.000000	2.370063
CD52	112.473669	0.000000	0.000000	0.094504
RPS8	868.130539	0.000000	0.000000	2.376817
...	...	...	...	...
PJVK	3.139309	0.000013	0.000013	1.939216
ITIH3	2.731077	0.000151	0.000151	1.447063
HAMP	2.182759	0.003274	0.003284	1.763884
CGA	1.634906	0.047511	0.047583	1.423650
TEX49	1.133994	0.312922	0.312922	0.615322

661 rows × 4 columns

# reading the results from limma's lmFit, specific to each cluster
#   coef: coefficient from the linear model
#   AveLogScaleExpr: mean(log(scaled(geneExpression))), (-inf, +inf), zero centered
#   F: moderated F-statistic (cluster-specific)
#   P.Value adj.P.Val: from lmFit test (cluster-speific)
#   AveLogExpr: mean(log(geneExpression)), interval in (0, +inf), not zero-centered

limma_results = pd.concat(
    [
        pd.read_csv("limma_results/group%s.csv"%i, sep=" ", index_col=0).assign(cluster=i).set_index("cluster", append=True)\
            .rename(columns = lambda colname: re.sub("group[0-9]+", "coef", colname)).rename(columns={"AveExpr":"AveLogScaleExpr"})
        for i in adata_hvgs.obs[clustering_label].cat.categories.astype(str)
    ]
)
limma_results.index.names = ["gene", "cluster"]
limma_results = limma_results.join(cluster_expression) #AveLogExpr
limma_results

		coef	AveLogScaleExpr	F	P.Value	adj.P.Val	AveLogExpr
gene	cluster
RPL11	0	0.621540	-3.106327e-16	1.853735e+03	0.000000	0.000000	3.944744
RPL32	0	0.577213	4.976558e-16	1.748737e+03	0.000000	0.000000	4.281347
RPL29	0	0.644817	-7.519618e-17	1.955046e+03	0.000000	0.000000	3.027835
FNDC3B	0	-0.648354	-9.928134e-18	1.635479e+03	0.000000	0.000000	0.698285
RPS23	0	0.608984	-1.416617e-16	1.894574e+03	0.000000	0.000000	4.055031
...	...	...	...	...	...	...	...
EGR1	10	0.005224	6.854649e-17	7.150909e-04	0.978666	0.984625	0.569705
RELN	10	-0.004568	2.374638e-17	5.534042e-04	0.981232	0.985706	0.804559
FAM25G	10	0.004157	-3.302949e-18	4.351351e-04	0.983358	0.986342	0.000000
TNRC6A	10	0.000611	7.032235e-17	8.713536e-06	0.997645	0.999156	0.471255
PIEZO2	10	0.000072	-7.431829e-17	1.348537e-07	0.999707	0.999707	0.425953

7271 rows × 6 columns

# compute the cross-entropy for each gene, for eache cluster
# the lower the cross-entropy, the more likely a gene is a marker of a specific cluster

def cross_entropy_loss(y_true, y_prob, epsilon=1e-12):
    
    # Clip predicted probabilities to prevent log(0) and log(1) cases
    y_prob = np.clip(y_prob, epsilon, 1.0 - epsilon)
    
    # Compute cross-entropy loss
    loss = -np.sum(y_true * np.log(y_prob))
    return loss

cluster_cross_entropy = limma_results.AveLogExpr.unstack().apply(
    lambda gene_series: pd.Series(
        {cluster: cross_entropy_loss(np.eye(len(gene_series))[i], gene_series/gene_series.sum())
         for i, cluster in enumerate(gene_series.index)}
    ), axis=1
).stack().rename("cross_enstropy_loss")
cluster_cross_entropy.index.names = ["gene", "cluster"]
cluster_cross_entropy

gene    cluster
ABCC4   0           3.722040
        1           1.881407
        10         27.631021
        2           2.926712
        3           4.145379
                     ...    
ZSWIM6  5           1.643516
        6           2.215055
        7           2.664269
        8           2.709228
        9           2.296434
Name: cross_enstropy_loss, Length: 7271, dtype: float64

limma_results = limma_results.join(cluster_cross_entropy) #cross_enstropy_loss
limma_results

		coef	AveLogScaleExpr	F	P.Value	adj.P.Val	AveLogExpr	cross_enstropy_loss
gene	cluster
RPL11	0	0.621540	-3.106327e-16	1.853735e+03	0.000000	0.000000	3.944744	2.341838
RPL32	0	0.577213	4.976558e-16	1.748737e+03	0.000000	0.000000	4.281347	2.349773
RPL29	0	0.644817	-7.519618e-17	1.955046e+03	0.000000	0.000000	3.027835	2.339591
FNDC3B	0	-0.648354	-9.928134e-18	1.635479e+03	0.000000	0.000000	0.698285	2.841968
RPS23	0	0.608984	-1.416617e-16	1.894574e+03	0.000000	0.000000	4.055031	2.339394
...	...	...	...	...	...	...	...	...
EGR1	10	0.005224	6.854649e-17	7.150909e-04	0.978666	0.984625	0.569705	2.462980
RELN	10	-0.004568	2.374638e-17	5.534042e-04	0.981232	0.985706	0.804559	2.212195
FAM25G	10	0.004157	-3.302949e-18	4.351351e-04	0.983358	0.986342	0.000000	27.631021
TNRC6A	10	0.000611	7.032235e-17	8.713536e-06	0.997645	0.999156	0.471255	2.479182
PIEZO2	10	0.000072	-7.431829e-17	1.348537e-07	0.999707	0.999707	0.425953	2.394570

7271 rows × 7 columns

limma_results.to_csv("../output/2.0-clusters_DEG_analysis.csv")

cluster_names_dict = {
    "0": "4 precoll2",
    "1": "1 cap1",
    "2": "3 precoll1",
    "3": "6 valves",
    "4": "2 cap2",
    "5": "5 coll",
    "7": "7 prolif",
}
reclustering_label = 'logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res'
valves_clusters_names_dict = {
    "6":"valve 1",
    "8":"valve 2"
}

cell_types = adata_hvgs.obs.loc[adata_hvgs.obs[clustering_label].isin(cluster_names_dict.keys()), clustering_label].replace(cluster_names_dict)

cell_types = cell_types.to_frame().join(
    adata_hvgs.obs.loc[
        adata_hvgs.obs[reclustering_label].isin(valves_clusters_names_dict.keys()), reclustering_label
    ].replace(valves_clusters_names_dict)
)

# ensure that `{valve-1, valve-2}` is a partition of `6 valves`
cell_types.loc[cell_types[clustering_label]!="6 valves", reclustering_label] = np.nan

cell_types

	logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_05res	logcounts.scaled.pca.harmony.neighbors_connectivities.leiden_075res
donor1_fat.AAAGAACCACATACGT-1	4 precoll2	NaN
donor1_fat.AAAGGTAGTCCGGTCA-1	4 precoll2	NaN
donor1_fat.AAAGGTATCTCTCTTC-1	1 cap1	NaN
donor1_fat.AACCACACAGTGGCTC-1	4 precoll2	NaN
donor1_fat.AACGGGATCTGCTGAA-1	4 precoll2	NaN
...	...	...
donor5_recovery.TTTGGTTTCGACCAAT-1	1 cap1	NaN
donor5_recovery.TTTGTTGGTCCTACGG-1	4 precoll2	NaN
donor5_recovery.TTTGTTGGTCGCAGTC-1	5 coll	NaN
donor5_recovery.TTTGTTGGTGGCGTAA-1	6 valves	valve 2
donor5_recovery.TTTGTTGTCGGCATTA-1	6 valves	valve 1

21215 rows × 2 columns

adata = sc.read_h5ad("../output/0.2-sce-after-contamination-removal.h5ad")[cell_types.index] # read adata
adata.obs = adata.obs[['Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex']].astype(str)
sc.pp.calculate_qc_metrics(adata, inplace=True) # 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', etc...
adata.obs.loc[adata.obs.donor=="5.0", "sex"] = "M"
adata.obs.loc[adata.obs.donor=="7.0", "sex"] = "F"
adata.obs[["celltype", "valve_subtype"]] = cell_types
adata

AnnData object with n_obs × n_vars = 21215 × 18627
    obs: 'Sample', 'Barcode', 'donor', 'tissue', 'tissue_origin', 'sequencing_run', 'sex', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'celltype', 'valve_subtype'
    var: 'ID', 'Symbol', 'Type', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'X_name'
    layers: 'logcounts'

adata.write_h5ad("../output/2.1-sce-after-contamination-removal-celltypes.h5ad")