This dataset has been prepared by Roche. It contains PBMCs from 4 different samples. The samples are derived from the same patient, have been processed in the same way, and have been sequenced together. Experimental differences are due to different storage conditions: one sample was sequenced from fresh cells, while the others were stored in different media and frozen for a week.
suppressPackageStartupMessages({
library(plotly)
library(readr)
library(stringr)
library(edgeR)
library(pheatmap)
library(purrr)
library(scater)
library(dplyr)
library(reshape2)
library(ggplot2)
library(cowplot)
library(Matrix)
library(scran)
library(Seurat)
library(sctransform)
library(readxl)
library(DropletUtils)
library(LSD)
library(CellMixS)
library(tibble)
library(here)
library(scDblFinder)
library(BiocParallel)
library(plyr)
})
seed <- 1000
Load data
Raw reads were mapped with Cell Ranger v2 against the Ensembl hg38 reference.
out_path <- here::here("out")
data_path <- "/home/Shared_taupo/data/seq/calini_scrnaseq/PBMC_technical_effects/"
fastqdirs <- c("CR038", "CR039", "CR040", "CR041")
sce <- DropletUtils::read10xCounts(samples=paste0(data_path, "PBMC_data/", fastqdirs))
# Add metadata
meta <- read_excel(paste0(data_path, "EXP_CR005 Samples ID.xlsx"))
sce$batch <- gsub(".*data/","", sce$Sample)
sce$Sample <- gsub(".*data/","", sce$Sample)
sce$mean_reads_per_cell_cr <- meta$`Mean reads/cell`[match(sce$Sample, meta$`Sample ID`)]
sce$info <- meta$`Characteristics`[match(sce$Sample, meta$`Sample ID`)]
colnames(sce) <- paste0(sce$batch, ".", sce$Barcode)
rownames(sce) <- paste0(rowData(sce)$ID, ".", rowData(sce)$Symbol)
sce$batch <- factor(sce$batch)
sce$batch <- mapvalues(sce$batch, from = c("CR038", "CR039", "CR040", "CR041"), to = c("fresh", "DMSO", "CS10", "PSC"))
table(sce$batch)
##
## fresh DMSO CS10 PSC
## 3696 3612 2347 1994
dim(sce)
## [1] 19883 11649
dim(colData(sce))
## [1] 11649 5
dim(rowData(sce))
## [1] 19883 2
# remove genes without any counts
keep_features <- rowSums(counts(sce) > 0) > 0
sce <- sce[keep_features, ]
dim(sce)
## [1] 15269 11649
# Mitochondrial genes
is.mito <- grepl("MT-", rownames(sce))
summary(is.mito)
## Mode FALSE TRUE
## logical 15256 13
mito <- rownames(sce)[is.mito]
sce <- calculateQCMetrics(sce, feature_controls = list(Mt = mito))
## Warning: 'calculateQCMetrics' is deprecated.
## Use 'perCellQCMetrics' or 'perFeatureQCMetrics' instead.
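The same per-cell metrics can also be computed with the replacement interface the warning points to; a minimal sketch, assuming a recent scater release (not run here, and note that the column names differ from `calculateQCMetrics`, e.g. `sum`, `detected`, `subsets_Mt_percent`):
# sce <- scater::addPerCellQC(sce, subsets = list(Mt = is.mito))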
# Plot filters: histogram of a QC metric, split by batch, with vertical lines at
# median +/- nmads MADs (computed by the .tmads helper below)
plotFilters <- function( sce, var="log10_total_counts", split_by="batch", nrow=NULL,
nmads=c(2,3,5), lt=c("dashed","dotted","dotdash"), xscale="free" ){
CD <- as.data.frame(colData(sce))
if(!(var %in% colnames(CD))) stop(paste("`var`",var,"is not in `colData(sce)`!"))
if(!is.null(split_by) && !(split_by %in% colnames(CD))){
stop(paste("`split_by`",split_by,"is not in `colData(sce)`!"))
}
library(ggplot2)
library(cowplot)
d <- CD[,var,drop=F]
if(!is.null(split_by)) d$dataset <- CD[[split_by]]
p <- ggplot(d, aes_string(x=var)) + geom_histogram(color="darkblue", bins=30)
if(xscale!="free"){
if(xscale!="fixed"){
if(xscale>1 && xscale%%1==0){
xq <- .tmads(d[[var]], xscale)
xr <- range(d[[var]],na.rm=T)
xq <- c(max(xq[1],xr[1]), min(xq[2],xr[2]))
}else{
if(xscale<=1 & xscale>0){
xscale <- (1-xscale)/2
xq <- quantile(d[[var]], probs=c(xscale,1-xscale), na.rm=T)
}else{
stop("Wrong `xscale` value!")
}
}
p <- p + xlim(xq[1], xq[2])
}
}
if(!is.null(split_by)){
if(is.null(nrow)) nrow <- ceiling(length(unique(d$dataset))/3)
p <- p + facet_wrap(~dataset, scales=ifelse(xscale=="free","free","free_y"), nrow=nrow)
for(ds in unique(d$dataset)){
for(i in 1:length(nmads)){
ma <- .tmads(d[which(d$dataset==ds),var], nmads[i])
df2 <- data.frame(xint=as.numeric(ma), dataset=rep(ds,2))
p <- p + geom_vline(data=df2, aes(xintercept=xint), linetype=lt[i])
}
}
}else{
for(i in 1:length(nmads)){
df2 <- data.frame(xint=as.numeric(.tmads(d[[var]], nmads[i])))
p <- p + geom_vline(data=df2, aes(xintercept=xint), linetype=lt[i])
}
}
p
}
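# helper: median(x) +/- nbmads * (unscaled) median absolute deviation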
.tmads <- function(x, nbmads=2.5){
x2 <- nbmads*median(abs(x-median(x)))
median(x)+c(-x2,x2)
}
plotFilters(sce)
plotFilters(sce, "log10_total_features_by_counts")
plotFilters(sce, "pct_counts_Mt", xscale=0.98)
## Warning: Removed 234 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_vline). (repeated 8 times)
# Find outliers: histogram of a QC metric, with cells flagged by the corresponding
# *_drop column drawn at reduced alpha
outlierPlot <- function(cd, feature, aph=NULL, logScale=FALSE, show.legend=TRUE){
if(is.null(aph)) aph <- paste0(feature, "_drop")
if(!(aph %in% colnames(cd))) aph <- NULL
p <- ggplot(as.data.frame(cd), aes_string(x = feature, alpha = aph)) +
geom_histogram(show.legend=show.legend)
if(!is.null(aph)) p <- p + scale_alpha_manual(values = c("TRUE" = 0.4, "FALSE" = 1))
if(logScale) p <- p + scale_x_log10()
p
}
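# per-batch grid of QC histograms (total counts, detected features, % mitochondrial),
# highlighting cells flagged by the *_drop filters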
plQCplot <- function(cd, show.legend=TRUE){
ps <- lapply(split(cd,cd$batch), sl=show.legend, FUN=function(x,sl){
list( outlierPlot( x, "total_counts", logScale=T, show.legend=sl),
outlierPlot( x, "total_features_by_counts", "total_features_drop",
logScale=T, show.legend=sl),
outlierPlot( x, "pct_counts_Mt", "mito_drop", show.legend=sl)
)
})
plot_grid( plotlist = do.call(c, ps),
labels=rep(basename(names(ps)), each=length(ps[[1]])),
ncol=length(ps[[1]]),
label_x=0.5 )
}
# Filtering
sce$total_counts_drop <- isOutlier(sce$total_counts, nmads = 2.5,
type = "both", log = TRUE, batch=sce$batch)
sce$total_features_drop <- isOutlier(sce$total_features_by_counts, nmads = 2.5,
type = "both", log = TRUE, batch=sce$batch)
sce$mito_drop <- sce$pct_counts_Mt > 5 &
isOutlier(sce$pct_counts_Mt, nmads = 2.5, type = "higher", batch=sce$batch)
sce$isOutlier <- sce$total_counts_drop | sce$total_features_drop | sce$mito_drop
# quality plot
plQCplot(colData(sce), show.legend=FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. (repeated for each of the 12 panels)
ggplot(colData(sce) %>% as.data.frame, aes(x=total_features_by_counts, y=total_counts, colour=pct_counts_Mt)) + geom_point() + facet_wrap(~Sample) +geom_density_2d(col="white") + scale_x_sqrt() + scale_y_sqrt()
ggplot(colData(sce) %>% as.data.frame, aes(x=total_features_by_counts, y=pct_counts_Mt)) + geom_point() + facet_wrap(~Sample) +geom_density_2d(col="white")
# Check overlap between the outlier filters
mets <- c("total_counts_drop","total_features_drop","mito_drop")
sapply(mets, FUN=function(x){ sapply(mets, y=x, function(x,y){ sum(sce[[x]] & sce[[y]]) }) })
## total_counts_drop total_features_drop mito_drop
## total_counts_drop 416 386 96
## total_features_drop 386 975 120
## mito_drop 96 120 437
nbcells <- cbind(table(sce$batch),table(sce$batch[!sce$isOutlier]))
colnames(nbcells) <- c("cells total","cells after filtering")
nbcells
## cells total cells after filtering
## fresh 3696 3289
## DMSO 3612 3180
## CS10 2347 2081
## PSC 1994 1788
layout(matrix(1:2,nrow=1))
LSD::heatscatter( sce$total_counts, sce$total_features_by_counts, xlab="Total counts", ylab="Non-zero features", main="",log="xy")
w <- which(!sce$isOutlier)
LSD::heatscatter( sce$total_counts[w], sce$total_features_by_counts[w], xlab="Total counts", ylab="Non-zero features", main="Filtered cells",log="xy")
# summary of cells kept
cct <- table(sce$isOutlier, sce$batch)
row.names(cct) <- c("Kept", "Filtered out")
cct
##
## fresh DMSO CS10 PSC
## Kept 3289 3180 2081 1788
## Filtered out 407 432 266 206
# drop outlier cells
sce <- sce[,!sce$isOutlier]
# require count > 1 in at least 20 cells
sce <- sce[which(rowSums(counts(sce)>1)>=20),]
dim(sce)
## [1] 4756 10338
plQCplot(colData(sce), show.legend=FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. (repeated for each of the 12 panels)
table(sce$batch)
##
## fresh DMSO CS10 PSC
## 3289 3180 2081 1788
sce <- scDblFinder(sce, samples="batch", BPPARAM=MulticoreParam(2))
table(sce$scDblFinder.class)
##
## doublet singlet
## 242 10096
sce <- sce[,!sce$scDblFinder.class %in% "doublet"]
# Normalization (scran size factors + scater log-normalization)
set.seed(1000)
clusters <- quickCluster(sce, use.ranks=FALSE)
table(clusters)
## clusters
## 1 2 3 4 5 6 7 8 9 10 11 12
## 879 748 1083 584 631 849 1594 2147 207 1078 138 158
sce <- computeSumFactors(sce, min.mean=0.1, cluster=clusters) ##cluster information added
sce <- scater::normalize(sce)
## Warning: 'normalizeSCE' is deprecated.
## Use 'logNormCounts' instead.
## See help("Deprecated")
## Warning: 'centreSizeFactors' is deprecated.
## See help("Deprecated")
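As the warning notes, `normalize` has been superseded by `logNormCounts`; a minimal sketch of the equivalent call with a current scater release (an assumption about the newer API, not what was run here):
# sce <- scater::logNormCounts(sce)  # uses the deconvolution size factors computed above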
# create SeuratObject
seurat <- as.Seurat(sce)
# per batch: scale the data and find variable genes
sl <- lapply(unique(as.character(seurat@meta.data$batch)), FUN=function(x){
x <- subset(seurat, cells=which(seurat@meta.data$batch==x))
x <- ScaleData(x)
x <- FindVariableFeatures(x, verbose=F)
# use non-standardized variance
v <- x@assays$RNA@meta.features[["vst.variance"]]
VariableFeatures(x) <- row.names(x@assays$RNA@meta.features)[order(v, decreasing=TRUE)[1:500]]
x
})
## Centering and scaling data matrix
## Centering and scaling data matrix
## Centering and scaling data matrix
## Centering and scaling data matrix
# find anchors & integrate
anchors <- FindIntegrationAnchors(sl, verbose = FALSE)
seurat <- IntegrateData(anchorset = anchors, dims = seq_len(30),
features.to.integrate = rownames(sce),
verbose = FALSE)
## Warning: Adding a command log without an assay associated with it
# scale integrated data
DefaultAssay(object=seurat) <- "integrated"
seurat <- ScaleData(seurat, verbose=F)
seurat <- RunPCA(object = seurat, npcs = 30, verbose = FALSE)
seurat <- RunTSNE(object = seurat, perplexity = 30,reduction = "pca", dims = seq_len(20),
seed.use = seed, do.fast = TRUE, verbose = FALSE)
seurat <- RunUMAP(object = seurat, reduction = "pca", dims = seq_len(20),
seed.use = seed, verbose = FALSE, n.neighbors = 30, min.dist = 0.5)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
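If the Python implementation is preferred, the warning above gives the required arguments; a hypothetical alternative call (assuming umap-learn is available through reticulate):
# seurat <- RunUMAP(seurat, reduction = "pca", dims = seq_len(20), seed.use = seed,
#                   umap.method = "umap-learn", metric = "correlation")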
seurat <- FindNeighbors(object = seurat, reduction = "pca", dims = seq_len(20), verbose = FALSE)
for (res in c(0.1, 0.2, 0.4, 0.8, 1, 1.2, 2))
seurat <- FindClusters(object = seurat, resolution = res, random.seed = seed, verbose = FALSE)
seurat <- SetIdent(seurat, value="integrated_snn_res.0.2")
seurat@meta.data$cluster <- seurat$integrated_snn_res.0.2
DimPlot(seurat, reduction = "umap")
DimPlot(seurat, reduction = "tsne")
DimPlot(seurat, reduction = "tsne", group.by = "batch")
sce <- sce[, colnames(seurat)]
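# rebuild the SCE from the Seurat object: raw counts and log-normalized values from the
# RNA assay, the Seurat metadata, and the pca/tsne/umap embeddings as reducedDims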
sce <- SingleCellExperiment(
assays=list(
counts=seurat@assays$RNA@counts,
logcounts=seurat@assays$RNA@data),
colData=seurat@meta.data,
reducedDims=lapply(seurat@reductions, FUN=function(x) x@cell.embeddings)
)
label <- as.factor(colData(sce)$batch)
batch_label <- list()
names_list <- c()
seed <- 1234
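# for i = 10,...,90: randomly re-draw the batch label of i% of the cells
# (batch0 keeps the true labels; batch100, created below, re-draws all of them)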
for(i in c(10,20,30,40,50,60,70,80,90)){
label_new <- label
lab_pc <- sample_frac(as.data.frame(label), i/100)
label_new[as.numeric(rownames(lab_pc))] <- sample(label, length(rownames(lab_pc)), replace = TRUE)
batch_label[[i/10]] <- label_new
names_list[i/10] <- paste0("batch", i)
}
names(batch_label) <- names_list
colData(sce)$batch0 <- label
colData(sce)$batch90 <- batch_label$batch90
colData(sce)$batch80 <- batch_label$batch80
colData(sce)$batch70 <- batch_label$batch70
colData(sce)$batch60 <- batch_label$batch60
colData(sce)$batch50 <- batch_label$batch50
colData(sce)$batch40 <- batch_label$batch40
colData(sce)$batch30 <- batch_label$batch30
colData(sce)$batch20 <- batch_label$batch20
colData(sce)$batch10 <- batch_label$batch10
colData(sce)$batch100 <- sample(label, length(label), replace = TRUE)
## Visualize randomization
batch_list <- str_subset(names(colData(sce)), "batch")
lapply(batch_list, function(feature_name){
visGroup(sce, feature_name, dim_red= "tsne")
})
## (12 tSNE plots, one per batch-label column in batch_list)
# Save data
saveRDS(sce, file = paste0(out_path, "/sce_pbmc_roche.rds"))