In this tutorial, we are going to perform a quick introduction to GEDI. We will be using the PBMC data from this publication. This is a collection of immune cell types profiled from peripherial blood from two human donors across six single-cell RNA-seq technologies (10x Chromium v2 and v3, CEL-seq2, Drop-seq, inDrops, Seq-Well and Smart-seq2).
library(scuttle)
library(scran)
library(scater)
library(uwot)
library(SingleCellExperiment)
library(ggplot2)
library(HDF5Array)
library(SeuratData)
library(GEDI)
set.seed(43)
The raw count matrices and metadata are available through the SeuratData package:
InstallData("pbmcsca") # Install pbmcsca object
data("pbmcsca") # Load the Seurat Object
We will generate a Single Cell Experiment (SCE) Object.
meta<- pbmcsca[[]] # Obtaining metadata
meta$Barcode<- rownames(meta) # Naming Barcode
sce<- SingleCellExperiment(list(counts=Seurat::GetAssayData(object = pbmcsca, slot = "counts")), colData=meta) # create SCE
sce
## class: SingleCellExperiment
## dim: 33694 31021
## metadata(0):
## assays(1): counts
## rownames(33694): TSPAN6 TNMD ... RP11-107E5.4 RP11-299P2.2
## rowData names(0):
## colnames(31021): pbmc1_SM2_Cell_108 pbmc1_SM2_Cell_115 ...
## pbmc2_inDrops_1_TGAATCCT.TTATGCGA.CATCTCCC
## pbmc2_inDrops_1_TGAGCACA.GAGCCTTA.CGAGTCTG
## colData names(11): orig.ident nCount_RNA ... Method Barcode
## reducedDimNames(0):
## altExpNames(0):
We will perform basic QC steps using the scuttle library.
sce<- addPerCellQC(sce, subsets=list(Mito=grep("MT-", rownames(sce) )) )
sce$log10_total_counts<- log10(sce$sum)
sce$log10_total_features_by_counts<- log10(sce$detected)
Visualize the quality of the data
ggplot(data.frame(colData(sce)), aes(log10_total_counts, log10_total_features_by_counts,color=subsets_Mito_percent ) )+
geom_point(size=0.5) +
viridis::scale_color_viridis() +
facet_wrap(~sce$Method)
Filtering low-quality cells
filter<- sce$log10_total_counts > 2.5 &
sce$log10_total_features_by_counts > 2 &
sce$subsets_Mito_percent <= 20
sce<- sce[,filter]
We will select two samples and then select 200 cells from each sample randomly.
cells_donor1_10x<- sample(colnames(sce)[sce$Method == "10x Chromium (v2) A" & sce$orig.ident == "pbmc1" ], 200)
cells_donor1_dropseq<- sample(colnames(sce)[sce$Method == "Drop-seq" & sce$orig.ident == "pbmc1" ], 200)
sce<- sce[, c(cells_donor1_10x, cells_donor1_dropseq)]
Filtering low-expressed genes
# We kept genes that had more than 3 counts in more than 3 cells across the entire dataset
sum_genes<- rowSums(assay(sce, "counts")>3)
genes_use<- sum_genes>3
table(genes_use)
## genes_use
## FALSE TRUE
## 33126 568
genes_use<- names(which(genes_use))
# Filtering SCE
sce<- sce[genes_use,]
sce
## class: SingleCellExperiment
## dim: 568 400
## metadata(0):
## assays(1): counts
## rownames(568): CD99 SLC25A5 ... AC090498.1 AP000769.1
## rowData names(0):
## colnames(400): pbmc1_10x_v2_A_ACACTGACAGCCACCA
## pbmc1_10x_v2_A_GAATAAGTCCAAACTG ... pbmc1_Drop_AATTACGCGATG
## pbmc1_Drop_TATCAGATGGTC
## colData names(19): orig.ident nCount_RNA ... log10_total_counts
## log10_total_features_by_counts
## reducedDimNames(0):
## altExpNames(0):
After pre-processing, we are ready to run GEDI. First, we need to indicate a vector of samples. We will create a vector that indicates each sample, which in this case is a combination of the donor and single-cell technology used.
sce$donor_sample<- paste0(sce$orig.ident, ".", sce$Method)
meta<- data.frame(colData(sce))
IMPORTANT: The vector of samples (and potentially the metadata) needs to be in the same order as the gene expression matrix.
We will run GEDI using the raw counts, which need to be passed as a sparse Matrix.
# Accesing the raw counts in the SCE object and convert to a sparse Matrix format
raw_counts<- as(as.matrix(assay(sce, "counts")), "dgCMatrix")
Now, we are ready to run GEDI. These are the main arguments:
Optional arguments:
For this example, we are going to use GEDI with the raw counts and the Bsphere mode. This step should take a couple of minutes with this dataset.
## Set up GEDI model
model <- new("GEDI") # Initialize GEDI object
model$setup(Samples = sce$donor_sample, # Vector indicating which sample belongs to each cell
colData=meta, # Metadata (optional)
M = raw_counts, # Expression data
K = 5, # Number of latent variables to use
mode = "Bsphere", # Modes to use: Either Bsphere (hyperellipsoid) or Bl2 (hyperplane)
oi_shrinkage = 0.001 # Shrinkage multiplier for oi. In here we use 0.001, to better accommodated the mean abundance differences that exist between multiple scRNA-seq technologies.
)
## Setting up the GEDI model...
model$initialize.LVs(randomSeed = 1) # initialize LVs
## Initializing LVs...
## Initializing oi...
## Performing initial decompsition of Y...
model$optimize(iterations=50) # run model with 50 iterations
## Performing block coordinate descent optimization...
## Iteration 1/50 (total 1)...
## 1.135132
## 1
## Mean(o): -0.7547346 ; Var(o): 1.065657 ; Var(si): 0.008446325
## Iteration 2/50 (total 2)...
## 0.8738663
## 1
## Mean(o): -0.7545425 ; Var(o): 1.376523 ; Var(si): 0.03537189
## Iteration 3/50 (total 3)...
## 0.749041
## 1
## Mean(o): -0.7544743 ; Var(o): 1.598822 ; Var(si): 0.06243741
## Iteration 4/50 (total 4)...
## 0.6616295
## 1
## Mean(o): -0.7544268 ; Var(o): 1.803197 ; Var(si): 0.07344039
## Iteration 5/50 (total 5)...
## 0.590758
## 1
## Mean(o): -0.7543819 ; Var(o): 2.012816 ; Var(si): 0.07317494
## Iteration 6/50 (total 6)...
## 0.5317217
## 1
## Mean(o): -0.7543347 ; Var(o): 2.229724 ; Var(si): 0.06837794
## Iteration 7/50 (total 7)...
## 0.4827452
## 1
## Mean(o): -0.7542839 ; Var(o): 2.449492 ; Var(si): 0.06283484
## Iteration 8/50 (total 8)...
## 0.4424246
## 1
## Mean(o): -0.7542296 ; Var(o): 2.666398 ; Var(si): 0.05785041
## Iteration 9/50 (total 9)...
## 0.4093384
## 1
## Mean(o): -0.7541732 ; Var(o): 2.875695 ; Var(si): 0.05358362
## Iteration 10/50 (total 10)...
## 0.3821151
## 1
## Mean(o): -0.7541157 ; Var(o): 3.074439 ; Var(si): 0.04988358
## Iteration 11/50 (total 11)...
## 0.3595567
## 1
## Mean(o): -0.7540585 ; Var(o): 3.261278 ; Var(si): 0.04659177
## Iteration 12/50 (total 12)...
## 0.3406975
## 1
## Mean(o): -0.7540026 ; Var(o): 3.435828 ; Var(si): 0.04358846
## Iteration 13/50 (total 13)...
## 0.3247835
## 1
## Mean(o): -0.7539488 ; Var(o): 3.598269 ; Var(si): 0.04077346
## Iteration 14/50 (total 14)...
## 0.3112258
## 1
## Mean(o): -0.753898 ; Var(o): 3.749168 ; Var(si): 0.03808883
## Iteration 15/50 (total 15)...
## 0.2995702
## 1
## Mean(o): -0.753851 ; Var(o): 3.889245 ; Var(si): 0.03552969
## Iteration 16/50 (total 16)...
## 0.2894696
## 1
## Mean(o): -0.7538084 ; Var(o): 4.019208 ; Var(si): 0.0331123
## Iteration 17/50 (total 17)...
## 0.280656
## 1
## Mean(o): -0.753771 ; Var(o): 4.139725 ; Var(si): 0.03084993
## Iteration 18/50 (total 18)...
## 0.2729185
## 1
## Mean(o): -0.7537398 ; Var(o): 4.25143 ; Var(si): 0.02874577
## Iteration 19/50 (total 19)...
## 0.2660887
## 1
## Mean(o): -0.7537155 ; Var(o): 4.35493 ; Var(si): 0.02679375
## Iteration 20/50 (total 20)...
## 0.2600304
## 1
## Mean(o): -0.7536991 ; Var(o): 4.450803 ; Var(si): 0.02498213
## Iteration 21/50 (total 21)...
## 0.2546327
## 1
## Mean(o): -0.7536913 ; Var(o): 4.539592 ; Var(si): 0.02329677
## Iteration 22/50 (total 22)...
## 0.249805
## 1
## Mean(o): -0.7536929 ; Var(o): 4.621797 ; Var(si): 0.02172341
## Iteration 23/50 (total 23)...
## 0.2454723
## 1
## Mean(o): -0.7537046 ; Var(o): 4.697875 ; Var(si): 0.02024894
## Iteration 24/50 (total 24)...
## 0.241572
## 1
## Mean(o): -0.7537272 ; Var(o): 4.768251 ; Var(si): 0.01886197
## Iteration 25/50 (total 25)...
## 0.2380516
## 1
## Mean(o): -0.7537612 ; Var(o): 4.83332 ; Var(si): 0.01755271
## Iteration 26/50 (total 26)...
## 0.2348662
## 1
## Mean(o): -0.7538072 ; Var(o): 4.89345 ; Var(si): 0.0163126
## Iteration 27/50 (total 27)...
## 0.2319776
## 1
## Mean(o): -0.7538657 ; Var(o): 4.948985 ; Var(si): 0.015134
## Iteration 28/50 (total 28)...
## 0.2293529
## 1
## Mean(o): -0.7539371 ; Var(o): 5.000245 ; Var(si): 0.01400989
## Iteration 29/50 (total 29)...
## 0.2269639
## 1
## Mean(o): -0.7540217 ; Var(o): 5.047525 ; Var(si): 0.01293376
## Iteration 30/50 (total 30)...
## 0.2247859
## 1
## Mean(o): -0.7541198 ; Var(o): 5.091099 ; Var(si): 0.01189943
## Iteration 31/50 (total 31)...
## 0.2227976
## 1
## Mean(o): -0.7542316 ; Var(o): 5.131221 ; Var(si): 0.01090105
## Iteration 32/50 (total 32)...
## 0.2209802
## 1
## Mean(o): -0.7543572 ; Var(o): 5.168128 ; Var(si): 0.009932974
## Iteration 33/50 (total 33)...
## 0.2193175
## 1
## Mean(o): -0.7544966 ; Var(o): 5.202036 ; Var(si): 0.008989768
## Iteration 34/50 (total 34)...
## 0.217795
## 1
## Mean(o): -0.7546499 ; Var(o): 5.233147 ; Var(si): 0.008066133
## Iteration 35/50 (total 35)...
## 0.2164
## 1
## Mean(o): -0.7548172 ; Var(o): 5.261647 ; Var(si): 0.00715693
## Iteration 36/50 (total 36)...
## 0.2151214
## 1
## Mean(o): -0.7549984 ; Var(o): 5.287704 ; Var(si): 0.006257253
## Iteration 37/50 (total 37)...
## 0.2139493
## 1
## Mean(o): -0.7551935 ; Var(o): 5.311471 ; Var(si): 0.005362668
## Iteration 38/50 (total 38)...
## 0.2128753
## 1
## Mean(o): -0.7554028 ; Var(o): 5.333084 ; Var(si): 0.004469814
## Iteration 39/50 (total 39)...
## 0.2118924
## 1
## Mean(o): -0.7556264 ; Var(o): 5.352657 ; Var(si): 0.003577789
## Iteration 40/50 (total 40)...
## 0.2109948
## 1
## Mean(o): -0.7558646 ; Var(o): 5.370278 ; Var(si): 0.002691327
## Iteration 41/50 (total 41)...
## 0.2101786
## 1
## Mean(o): -0.7561175 ; Var(o): 5.385997 ; Var(si): 0.001827955
## Iteration 42/50 (total 42)...
## 0.2094423
## 1
## Mean(o): -0.7563837 ; Var(o): 5.399811 ; Var(si): 0.001033165
## Iteration 43/50 (total 43)...
## 0.2087873
## 1
## Mean(o): -0.7566566 ; Var(o): 5.411649 ; Var(si): 0.0004036312
## Iteration 44/50 (total 44)...
## 0.2082143
## 1
## Mean(o): -0.7569149 ; Var(o): 5.421453 ; Var(si): 6.889453e-05
## Iteration 45/50 (total 45)...
## 0.2077119
## 1
## Mean(o): -0.7571237 ; Var(o): 5.42949 ; Var(si): 1.748201e-06
## Iteration 46/50 (total 46)...
## 0.2072601
## 1
## Mean(o): -0.7572901 ; Var(o): 5.436268 ; Var(si): 8.727945e-10
## Iteration 47/50 (total 47)...
## 0.2068483
## 1
## Mean(o): -0.7574492 ; Var(o): 5.442043 ; Var(si): 2.067687e-16
## Iteration 48/50 (total 48)...
## 0.2064717
## 1
## Mean(o): -0.7576087 ; Var(o): 5.446938 ; Var(si): 1.168144e-29
## Iteration 49/50 (total 49)...
## 0.2061264
## 1
## Mean(o): -0.7577688 ; Var(o): 5.451042 ; Var(si): 2.592221e-33
## Iteration 50/50 (total 50)...
## 0.2058095
## 1
## Mean(o): -0.7579293 ; Var(o): 5.454428 ; Var(si): 5.620549e-33
We check convergence of the GEDI model.
model$plotTracking()
## $ZAo
##
## $Bi
##
## $Qi
##
## $oi
##
## $si
##
## $Rk
## NULL
##
## $sigma2
These plots show the convergence of the different components of the model (o, Z, Bi, Qi, oi and si). The last plot shows the convergence of the mean squared error of the model.
IMPORTANT: GEDI reorders the cells by sample, so we need to reorder the original metadata.
# Reorder meta
meta<- meta[model$aux$cellIDs,]
First, let's analyze the effect of sample without using GEDI:
sce<- logNormCounts(sce) # Obtain log-normalized counts
# PCA on the log normalized counts (without integration)
pca_nointeg<- rsvd::rpca(t(assay(sce, "logcounts")), k=5)
# Doing UMAP of 2 dimensions
umap_nointeg <- umap(pca_nointeg$x,
n_components = 2,
min_dist=0.01,
metric="euclidean",
verbose = TRUE)
## 11:54:24 UMAP embedding parameters a = 1.896 b = 0.8006
## 11:54:24 Read 400 rows and found 5 numeric columns
## 11:54:24 Using FNN for neighbor search, n_neighbors = 15
## 11:54:24 Commencing smooth kNN distance calibration using 16 threads
## 11:54:26 Initializing from normalized Laplacian + noise
## 11:54:26 Commencing optimization for 500 epochs, with 7308 positive edges
## 11:54:27 Optimization finished
colnames(umap_nointeg) <- paste0("umap", 1:2)
rownames(umap_nointeg) <- colnames(sce)
## Plot embeddings
GEDI::plot_embedding(umap_nointeg, meta$donor_sample, size_point=2) + labs(x="umap1", y="umap2", title="UMAP (no integration)", color="Sample")
GEDI::plot_embedding(umap_nointeg, meta$CellType, size_point=2) + labs(x="umap1", y="umap2", title="UMAP (no integration)", color="Cell Type")
We observe that the cells cluster by the sequencing technology. Now, let's compare the results to the integrated space from GEDI. After running the model, we can recover the integrated reference space.
# Generating svd
svd_res <- svd.gedi( model )
embedding_res_svd<- svd_res$v %*% diag(svd_res$d)
colnames(embedding_res_svd)<- paste0("embedding", 1:ncol(embedding_res_svd))
# Generating umap of 2 dimensions
umap_2_res <- umap(embedding_res_svd, min_dist=0.01, metric="euclidean")
colnames(umap_2_res)<- paste0("umap", 1:2)
rownames(umap_2_res)<- model$aux$cellIDs
umap_2_res<- data.frame(umap_2_res)
## Plot embeddings
GEDI::plot_embedding( umap_2_res, meta$donor_sample, size_point=2) + labs(x="umap1", y="umap2", title="UMAP (integration with GEDI)", color="Sample")
GEDI::plot_embedding( umap_2_res, meta$CellType, size_point=2) + labs(x="umap1", y="umap2", title="UMAP (integration with GEDI)", color="Cell Type")
We observe alignment between the two sequencing technologies, and that overall, cells cluster by their original cell types.
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2020/Core/imkl/2020.1.217/compilers_and_libraries_2020.1.217/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] GEDI_0.0.0.9000 pbmcsca.SeuratData_3.0.0
## [3] pancreasref.SeuratData_1.0.0 panc8.SeuratData_3.0.2
## [5] kidneyref.SeuratData_1.0.1 ifnb.SeuratData_3.1.0
## [7] hcabm40k.SeuratData_3.0.0 SeuratData_0.2.2
## [9] HDF5Array_1.18.1 rhdf5_2.34.0
## [11] DelayedArray_0.16.3 uwot_0.1.10
## [13] Matrix_1.3-3 scater_1.18.6
## [15] ggplot2_3.4.2 scran_1.18.6
## [17] scuttle_1.0.4 SingleCellExperiment_1.12.0
## [19] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [21] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [23] IRanges_2.24.1 S4Vectors_0.28.1
## [25] BiocGenerics_0.36.0 MatrixGenerics_1.2.1
## [27] matrixStats_0.58.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 reticulate_1.18
## [3] tidyselect_1.2.0 htmlwidgets_1.5.3
## [5] grid_4.0.0 BiocParallel_1.24.1
## [7] Rtsne_0.15 munsell_0.5.0
## [9] codetools_0.2-16 ica_1.0-2
## [11] statmod_1.4.35 future_1.33.0
## [13] miniUI_0.1.1.1 withr_2.5.0
## [15] colorspace_2.0-0 progressr_0.10.1
## [17] highr_0.8 knitr_1.31
## [19] Seurat_4.1.1 ROCR_1.0-11
## [21] tensor_1.5 listenv_0.8.0
## [23] labeling_0.4.2 GenomeInfoDbData_1.2.4
## [25] polyclip_1.10-0 farver_2.1.0
## [27] parallelly_1.36.0 vctrs_0.6.1
## [29] generics_0.1.3 xfun_0.22
## [31] R6_2.5.0 ggbeeswarm_0.6.0
## [33] rsvd_1.0.5 RcppEigen_0.3.3.9.1
## [35] locfit_1.5-9.4 bitops_1.0-6
## [37] rhdf5filters_1.2.0 spatstat.utils_2.1-0
## [39] cachem_1.0.4 promises_1.2.0.1
## [41] scales_1.2.1 rgeos_0.5-9
## [43] beeswarm_0.3.1 gtable_0.3.0
## [45] beachmat_2.6.4 globals_0.16.2
## [47] goftest_1.2-2 rlang_1.1.1
## [49] splines_4.0.0 lazyeval_0.2.2
## [51] spatstat.geom_2.1-0 yaml_2.2.1
## [53] reshape2_1.4.4 abind_1.4-5
## [55] httpuv_1.5.5 tools_4.0.0
## [57] ellipsis_0.3.2 spatstat.core_2.1-2
## [59] jquerylib_0.1.3 RColorBrewer_1.1-2
## [61] ggridges_0.5.3 Rcpp_1.0.8.3
## [63] plyr_1.8.6 sparseMatrixStats_1.2.1
## [65] zlibbioc_1.36.0 purrr_1.0.1
## [67] RCurl_1.98-1.3 rpart_4.1-15
## [69] deldir_0.2-10 pbapply_1.4-3
## [71] viridis_0.6.4 cowplot_1.1.1
## [73] zoo_1.8-9 SeuratObject_4.1.0
## [75] ggrepel_0.9.2 cluster_2.1.0
## [77] magrittr_2.0.3 RSpectra_0.16-0
## [79] data.table_1.14.0 scattermore_0.7
## [81] lmtest_0.9-38 RANN_2.6.1
## [83] fitdistrplus_1.1-3 patchwork_1.1.1
## [85] mime_0.10 evaluate_0.22
## [87] xtable_1.8-4 gridExtra_2.3
## [89] compiler_4.0.0 tibble_3.2.1
## [91] KernSmooth_2.23-16 crayon_1.5.2
## [93] htmltools_0.5.6 mgcv_1.8-31
## [95] later_1.1.0.1 tidyr_1.3.0
## [97] DBI_1.1.1 MASS_7.3-51.5
## [99] rappdirs_0.3.3 cli_3.6.1
## [101] igraph_1.3.4 pkgconfig_2.0.3
## [103] sp_1.4-5 plotly_4.9.3
## [105] spatstat.sparse_2.0-0 vipor_0.4.5
## [107] bslib_0.5.1 dqrng_0.2.1
## [109] XVector_0.30.0 stringr_1.5.0
## [111] digest_0.6.33 sctransform_0.3.3
## [113] RcppAnnoy_0.0.18 spatstat.data_2.1-0
## [115] rmarkdown_2.7 leiden_0.3.8
## [117] edgeR_3.32.1 DelayedMatrixStats_1.12.3
## [119] shiny_1.6.0 lifecycle_1.0.3
## [121] nlme_3.1-147 jsonlite_1.8.7
## [123] Rhdf5lib_1.12.1 BiocNeighbors_1.8.2
## [125] viridisLite_0.4.2 limma_3.46.0
## [127] fansi_1.0.4 pillar_1.9.0
## [129] lattice_0.20-41 fastmap_1.1.1
## [131] httr_1.4.5 survival_3.2-11
## [133] glue_1.6.2 FNN_1.1.3
## [135] png_0.1-7 bluster_1.0.0
## [137] stringi_1.5.3 sass_0.4.7
## [139] BiocSingular_1.6.0 dplyr_1.1.1
## [141] irlba_2.3.3 future.apply_1.7.0