Pancreas dataset

A human pancreatic islet cell datasets produced across four technologies, CelSeq (GSE81076) CelSeq2 (GSE85241), Fluidigm C1 (GSE86469), and SMART-Seq2 (E-MTAB-5061). Count data are provided by https://satijalab.org, where they are used to demonstrate seurat integration. Further details can be assessed via https://satijalab.org/seurat/v3.0/integration.html.

suppressPackageStartupMessages({
  library(CellBench)
  library(cowplot)
  library(ggplot2)
  library(scater)
  library(jcolors)
  library(CellMixS)
  library(gridExtra)
  library(purrr)
  library(jcolors)
  library(here)
  library(tidyr)
  library(dplyr)
  library(stringr)
  library(variancePartition)
  library(stringr)
  library(Seurat)
})

seed <- 1000

Data

Load data

out_path <- here::here("out")
data_path <- here::here("data")

pancreas.data <- readRDS(paste0(data_path,"/pancreas_expression_matrix.rds"))

# load metadata
metadata <- readRDS(paste0(data_path,"/pancreas_metadata.rds"))

# create SeuratObject
pancreas <- CreateSeuratObject(pancreas.data, meta.data = metadata)

Integration

pancreas.list <- SplitObject(pancreas, split.by = "tech")

for (i in 1:length(pancreas.list)) {
  pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE)
  pancreas.list[[i]] <- FindVariableFeatures(pancreas.list[[i]], 
                                             selection.method = "vst", 
                                             nfeatures = 2000, 
                                             verbose = FALSE)
}
reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")]

# find anchors & integrate
pancreas.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
## Computing 2000 integration features
## Scaling features for provided objects
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 3510 anchors
## Filtering anchors
##  Retained 2759 anchors
## Extracting within-dataset neighbors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 3501 anchors
## Filtering anchors
##  Retained 2729 anchors
## Extracting within-dataset neighbors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
##  Found 6176 anchors
## Filtering anchors
##  Retained 4561 anchors
## Extracting within-dataset neighbors
pancreas.integrated <- IntegrateData(anchorset = pancreas.anchors, dims = 1:30)
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 3 into 2 1
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Warning: Adding a command log without an assay associated with it
# scale integrated data
DefaultAssay(pancreas.integrated) <- "integrated"
pancreas.integrated <- ScaleData(pancreas.integrated, verbose = FALSE)

Dimension reduction

pancreas.integrated <- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE)
pancreas.integrated <- RunTSNE(pancreas.integrated, reduction = "pca", dims = 1:30)
pancreas.integrated <- RunUMAP(pancreas.integrated, reduction = "pca", dims = seq_len(30),
                               seed.use = seed, verbose = FALSE,n.neighbors = 30, min.dist = 0.5)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
p1 <- DimPlot(pancreas.integrated, reduction = "tsne", group.by = "tech")
p2 <- DimPlot(pancreas.integrated, reduction = "tsne", group.by = "celltype", 
              label = TRUE, repel = TRUE) + NoLegend()
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
plot_grid(p1, p2)

Convert seurat to sce

seurat <- pancreas.integrated
sce <- SingleCellExperiment(
    assays=list(
        counts=seurat@assays$RNA@counts,
        logcounts=seurat@assays$RNA@data),
    colData=seurat@meta.data,
    reducedDims=lapply(seurat@reductions, FUN=function(x) x@cell.embeddings)
)

# Save data
saveRDS(sce, file = paste0(out_path, "/sce_pancreas.rds"))
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /home/aluetg/R/lib/R/lib/libRblas.so
## LAPACK: /home/aluetg/R/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] Seurat_3.1.4                variancePartition_1.16.1   
##  [3] scales_1.1.0                foreach_1.4.8              
##  [5] limma_3.42.2                stringr_1.4.0              
##  [7] dplyr_0.8.4                 tidyr_1.0.2                
##  [9] here_0.1                    purrr_0.3.3                
## [11] gridExtra_2.3               CellMixS_1.2.3             
## [13] kSamples_1.2-9              SuppDists_1.1-9.5          
## [15] jcolors_0.0.4               scater_1.14.6              
## [17] ggplot2_3.2.1               cowplot_1.0.0              
## [19] CellBench_1.2.0             tibble_2.1.3               
## [21] magrittr_1.5                SingleCellExperiment_1.8.0 
## [23] SummarizedExperiment_1.16.1 DelayedArray_0.12.2        
## [25] BiocParallel_1.20.1         matrixStats_0.55.0         
## [27] Biobase_2.46.0              GenomicRanges_1.38.0       
## [29] GenomeInfoDb_1.22.0         IRanges_2.20.2             
## [31] S4Vectors_0.24.3            BiocGenerics_0.32.0        
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.5          BiocFileCache_1.10.2     sn_1.5-5                
##   [4] plyr_1.8.5               igraph_1.2.4.2           lazyeval_0.2.2          
##   [7] splines_3.6.1            listenv_0.8.0            TH.data_1.0-10          
##  [10] digest_0.6.25            htmltools_0.4.0          viridis_0.5.1           
##  [13] gdata_2.18.0             memoise_1.1.0            cluster_2.1.0           
##  [16] doParallel_1.0.15        ROCR_1.0-7               globals_0.12.5          
##  [19] RcppParallel_4.4.4       sandwich_2.5-1           prettyunits_1.1.1       
##  [22] colorspace_1.4-1         blob_1.2.1               rappdirs_0.3.1          
##  [25] ggrepel_0.8.1            xfun_0.12                crayon_1.3.4            
##  [28] RCurl_1.98-1.1           jsonlite_1.6.1           lme4_1.1-21             
##  [31] survival_3.1-8           zoo_1.8-7                iterators_1.0.12        
##  [34] ape_5.3                  glue_1.3.1               gtable_0.3.0            
##  [37] zlibbioc_1.32.0          XVector_0.26.0           listarrays_0.3.0        
##  [40] leiden_0.3.3             BiocSingular_1.2.2       future.apply_1.4.0      
##  [43] mvtnorm_1.1-0            DBI_1.1.0                bibtex_0.4.2.2          
##  [46] Rcpp_1.0.3               plotrix_3.7-7            metap_1.3               
##  [49] viridisLite_0.3.0        progress_1.2.2           reticulate_1.14         
##  [52] bit_1.1-15.2             rsvd_1.0.3               tsne_0.1-3              
##  [55] htmlwidgets_1.5.1        httr_1.4.1               gplots_3.0.3            
##  [58] RColorBrewer_1.1-2       TFisher_0.2.0            ica_1.0-2               
##  [61] farver_2.0.3             pkgconfig_2.0.3          uwot_0.1.5              
##  [64] dbplyr_1.4.2             labeling_0.3             tidyselect_1.0.0        
##  [67] rlang_0.4.5              reshape2_1.4.3           munsell_0.5.0           
##  [70] tools_3.6.1              RSQLite_2.2.0            ggridges_0.5.2          
##  [73] evaluate_0.14            yaml_2.2.1               npsurv_0.4-0            
##  [76] knitr_1.28               bit64_0.9-7              fitdistrplus_1.0-14     
##  [79] caTools_1.18.0           RANN_2.6.1               pbapply_1.4-2           
##  [82] future_1.16.0            nlme_3.1-144             compiler_3.6.1          
##  [85] pbkrtest_0.4-8.6         png_0.1-7                plotly_4.9.2            
##  [88] beeswarm_0.2.3           curl_4.3                 lsei_1.2-0              
##  [91] stringi_1.4.6            lattice_0.20-40          Matrix_1.2-18           
##  [94] nloptr_1.2.2             multtest_2.42.0          vctrs_0.2.3             
##  [97] mutoss_0.1-12            pillar_1.4.3             lifecycle_0.1.0         
## [100] Rdpack_0.11-1            lmtest_0.9-37            RcppAnnoy_0.0.15        
## [103] BiocNeighbors_1.4.2      data.table_1.12.8        bitops_1.0-6            
## [106] irlba_2.3.3              gbRd_0.4-11              patchwork_1.0.0         
## [109] colorRamps_2.3           R6_2.4.1                 KernSmooth_2.23-16      
## [112] vipor_0.4.5              codetools_0.2-16         boot_1.3-24             
## [115] MASS_7.3-51.5            gtools_3.8.1             assertthat_0.2.1        
## [118] rprojroot_1.3-2          withr_2.1.2              sctransform_0.2.1       
## [121] mnormt_1.5-6             multcomp_1.4-12          GenomeInfoDbData_1.2.2  
## [124] hms_0.5.3                grid_3.6.1               minqa_1.2.4             
## [127] rmarkdown_2.1            DelayedMatrixStats_1.8.0 Rtsne_0.15              
## [130] numDeriv_2016.8-1.1      lubridate_1.7.4          ggbeeswarm_0.6.0