CellBench

CellBench uses three human lung adenocarcinoma cell lines (HCC827, H1975 and H2228), which were cultured separately and then processed in three different ways. Firstly, single cells from each cell line were mixed in equal proportions, with libraries generated using three different protocols: CEL-seq2, Drop-seq (with Dolomite equipment) and 10X Chromium. Secondly, single cells were sorted from the three cell lines into 384-well plates, with an equal number of cells per well in different combinations (generally 9 cells, but with some 90-cell population controls). Only the single-cell data sets from the first design (10X Chromium, CEL-seq2 and Drop-seq) are loaded and combined below. See the CellBench package documentation for further details.

suppressPackageStartupMessages({
    library(scran)
    library(magrittr)
    library(dplyr)
    library(CellBench)
    library(plyr)
    library(EnsDb.Hsapiens.v86)
    library(AnnotationDbi)
})

# Script-level settings. `seed` is defined here but not consumed anywhere in
# this script; `out_path` is the directory the final object is written to.
seed <- 1000
out_path <- here::here("out")

# Load the CellBench single-cell data sets: a named list with one
# SingleCellExperiment per protocol.
sc_data <- load_sc_data()
## Using temporary cache /tmp/RtmpoU7gQB/BiocFileCache
## Downloading data file from https://github.com/Shians/scBenchData/raw/master/single_cell_data.RData
stopifnot(length(sc_data) >= 1L)

# Tag every cell with the name of the data set it came from, so the origin
# survives the merge below.
for (i in seq_along(sc_data)) {
  colData(sc_data[[i]])$protocol <- rep(names(sc_data)[i], ncol(sc_data[[i]]))
}

# Merge all data sets column-wise into one object. Before each cbind both
# sides are restricted to the genes and colData columns they have in common.
sce <- sc_data[[1]]
# seq_along(...)[-1] is empty for a single data set, whereas 2:length(...)
# would iterate over c(2, 1) and fail.
for (i in seq_along(sc_data)[-1]) {
  gene_overlap <- intersect(rownames(sce), rownames(sc_data[[i]]))
  coldata_overlap <- intersect(
    names(colData(sce)),
    names(colData(sc_data[[i]]))
  )

  # drop = FALSE keeps a DataFrame even if only one column is shared.
  sc_data[[i]] <- sc_data[[i]][gene_overlap, ]
  colData(sc_data[[i]]) <-
    colData(sc_data[[i]])[, coldata_overlap, drop = FALSE]

  sce <- sce[gene_overlap, ]
  colData(sce) <- colData(sce)[, coldata_overlap, drop = FALSE]

  sce <- cbind(sce, sc_data[[i]])
}

# Make cell names unique across data sets by appending the (original)
# protocol label.
colnames(sce) <- paste0(colnames(sce), "_", sce$protocol)
dim(sce)
## [1] 13575  1401

# Replace the data set names with shorter protocol labels. The function is
# namespace-qualified because both dplyr and plyr are attached.
sce$protocol <- plyr::mapvalues(
  sce$protocol,
  from = c("sc_10x", "sc_celseq", "sc_dropseq"),
  to = c("tenx", "celseq", "dropseq")
)

# Gene annotation: look up a gene symbol for every row name (used as
# Ensembl GENEID keys) and store it in rowData(sce)$symbol. With
# multiVals = "first" one symbol is kept per ID.
edb <- EnsDb.Hsapiens.v86
rowData(sce)$symbol <- mapIds(
  edb,
  keys = rownames(sce),
  column = "SYMBOL",
  keytype = "GENEID",
  multiVals = "first"
)
## Warning: Unable to map 2 of 13575 requested IDs.

# Rename rows to "<gene id>.<symbol>". IDs that could not be mapped have an
# NA symbol; pasting would give names ending in the literal ".NA", so those
# rows keep the bare gene ID instead.
gene_ids <- rownames(sce)
gene_symbols <- rowData(sce)$symbol
has_symbol <- !is.na(gene_symbols)
annotated_names <- gene_ids
annotated_names[has_symbol] <- paste0(
  gene_ids[has_symbol], ".", gene_symbols[has_symbol]
)
rownames(sce) <- annotated_names

## Filter out genes that are not expressed in any cell
# A gene is kept when at least one cell has a count above zero.
is_expressed <- rowSums(counts(sce) > 0) > 0
sce <- sce[is_expressed, ]
dim(sce)
## [1] 13575  1401

# saveRDS() does not create missing directories, so make sure the output
# directory exists before writing.
if (!dir.exists(out_path)) {
  dir.create(out_path, recursive = TRUE)
}
saveRDS(sce, file = file.path(out_path, "sce_cellBench.rds"))

# Record the R and package versions used for this run.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /home/aluetg/R/lib/R/lib/libRblas.so
## LAPACK: /home/aluetg/R/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] EnsDb.Hsapiens.v86_2.99.0   ensembldb_2.10.2           
##  [3] AnnotationFilter_1.10.0     GenomicFeatures_1.38.2     
##  [5] AnnotationDbi_1.48.0        plyr_1.8.6                 
##  [7] CellBench_1.2.0             tibble_2.1.3               
##  [9] dplyr_0.8.5                 magrittr_1.5               
## [11] scran_1.14.6                SingleCellExperiment_1.8.0 
## [13] SummarizedExperiment_1.16.1 DelayedArray_0.12.2        
## [15] BiocParallel_1.20.1         matrixStats_0.55.0         
## [17] Biobase_2.46.0              GenomicRanges_1.38.0       
## [19] GenomeInfoDb_1.22.0         IRanges_2.20.2             
## [21] S4Vectors_0.24.3            BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##  [1] ProtGenerics_1.18.0      bitops_1.0-6             lubridate_1.7.4         
##  [4] bit64_0.9-7              progress_1.2.2           httr_1.4.1              
##  [7] rprojroot_1.3-2          backports_1.1.5          tools_3.6.1             
## [10] R6_2.4.1                 irlba_2.3.3              vipor_0.4.5             
## [13] lazyeval_0.2.2           DBI_1.1.0                colorspace_1.4-1        
## [16] prettyunits_1.1.1        tidyselect_1.0.0         gridExtra_2.3           
## [19] bit_1.1-15.2             curl_4.3                 compiler_3.6.1          
## [22] BiocNeighbors_1.4.2      rtracklayer_1.46.0       scales_1.1.0            
## [25] askpass_1.1              rappdirs_0.3.1           Rsamtools_2.2.3         
## [28] stringr_1.4.0            digest_0.6.25            rmarkdown_2.1           
## [31] XVector_0.26.0           scater_1.14.6            pkgconfig_2.0.3         
## [34] htmltools_0.4.0          dbplyr_1.4.2             limma_3.42.2            
## [37] rlang_0.4.5              RSQLite_2.2.0            DelayedMatrixStats_1.8.0
## [40] RCurl_1.98-1.1           BiocSingular_1.2.2       GenomeInfoDbData_1.2.2  
## [43] Matrix_1.2-18            Rcpp_1.0.3               ggbeeswarm_0.6.0        
## [46] munsell_0.5.0            viridis_0.5.1            lifecycle_0.2.0         
## [49] stringi_1.4.6            yaml_2.2.1               edgeR_3.28.1            
## [52] zlibbioc_1.32.0          BiocFileCache_1.10.2     grid_3.6.1              
## [55] blob_1.2.1               dqrng_0.2.1              crayon_1.3.4            
## [58] lattice_0.20-40          Biostrings_2.54.0        hms_0.5.3               
## [61] locfit_1.5-9.1           knitr_1.28               pillar_1.4.3            
## [64] igraph_1.2.4.2           biomaRt_2.42.0           XML_3.99-0.3            
## [67] glue_1.3.1               evaluate_0.14            vctrs_0.2.4             
## [70] openssl_1.4.1            gtable_0.3.0             purrr_0.3.3             
## [73] assertthat_0.2.1         ggplot2_3.3.0            xfun_0.12               
## [76] rsvd_1.0.3               viridisLite_0.3.0        GenomicAlignments_1.22.1
## [79] beeswarm_0.2.3           memoise_1.1.0            statmod_1.4.34          
## [82] here_0.1