library(Rcpp) # required for scConvexNMF
library(Matrix) # required for scConvexNMF
library(quadprog) # required for scConvexNMF
library(ComplexHeatmap)
library(circlize)
library(ggplot2)
library(Seurat) # Required for data
library(SeuratData) # Required for data
library(scConvexNMF)
set.seed(43)
This tutorial will use data from the SeuratData package.
Only run for the first time.
InstallData("pbmc3k")
We’ll use data from a collection of PBMCs profiled across multiple technologies.
data("pbmcsca", message=F)
## Warning in data("pbmcsca", message = F): data set 'F' not found
pbmcsca<- UpdateSeuratObject(pbmcsca)
## Validating object structure
## Updating object slots
## Ensuring keys are in the proper structure
## Warning: Assay RNA changing from Assay to Assay
## Ensuring keys are in the proper structure
## Ensuring feature names don't have underscores or pipes
## Updating slots in RNA
## Validating object structure for Assay 'RNA'
## Object representation is consistent with the most current Seurat version
pbmcsca
## An object of class Seurat
## 33694 features across 31021 samples within 1 assay
## Active assay: RNA (33694 features, 0 variable features)
## 2 layers present: counts, data
meta<- pbmcsca[[]]
rc<- pbmcsca[["RNA"]]$counts
class(meta)
## [1] "data.frame"
class(rc)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(meta)
## [1] 31021 10
dim(rc)
## [1] 33694 31021
As an example, we’ll run scConvexNMF on a small selection of cells. We’ll identify gene programs in CD4+ T cells from different technologies.
# Filter cells
table(meta$CellType)
##
## B cell CD14+ monocyte
## 5020 5550
## CD16+ monocyte CD4+ T cell
## 804 7391
## Cytotoxic T cell Dendritic cell
## 9071 433
## Megakaryocyte Natural killer cell
## 977 1565
## Plasmacytoid dendritic cell Unassigned
## 164 46
meta$Sample<- paste0(meta$orig.ident, "_", meta$Method)
meta<- meta[meta$CellType == "Cytotoxic T cell",]
cells_donor1_10x<- sample(rownames(meta)[meta$Sample == "pbmc1_10x Chromium (v2) A"], 150)
cells_donor1_dropseq<- sample(rownames(meta)[meta$Sample == "pbmc1_Drop-seq"], 150)
cells_donor1_sw<- sample(rownames(meta)[meta$Sample == "pbmc1_Seq-Well"], 150)
meta<- meta[c(cells_donor1_10x, cells_donor1_dropseq, cells_donor1_sw),]
table(meta$Sample)
##
## pbmc1_10x Chromium (v2) A pbmc1_Drop-seq pbmc1_Seq-Well
## 150 150 150
# Filter genes
rc<- rc[,rownames(meta)]
sum_genes<- rowSums(rc>1)
genes_use<- sum_genes>3
table(genes_use)
## genes_use
## FALSE TRUE
## 31074 2620
rc<- rc[genes_use,]
model <- new("scConvexNMF")
model$setup(M=rc, # Input - raw counts
K=15, # Number of programs
Groups=meta$Sample, # Group-specific effects
maskingRatio=0.01 # Whether to mask some data to perform cross-validation
)
## Setting up the scConvexNMF model...
## 0.2400343
model$initialize_LVs()
## Initializing LVs ...
model$optimize(30) # Number of iterations
## Iteration 1/30 (total 1)...
## sigma2: 0.2371
## Iteration 10/30 (total 10)...
## sigma2: 0.1648
## Iteration 20/30 (total 20)...
## sigma2: 0.1276
## Iteration 30/30 (total 30)...
## sigma2: 0.1088
WGH <- getWGH_scCNMF(model)
W<- WGH$W[rownames(meta),]
G<- WGH$G
Let’s inspect correlation between the gene programs
draw(Heatmap(cor(W))
)
First let’s retrieve the KEGG LEGACY genesets from msigdbr.
library(msigdbr) # Required for msigdb annotations
geneset <- data.frame(msigdbr(species = "human", collection = "C2", subcollection="CP:KEGG_LEGACY"))[,c("gene_symbol", "gs_name")]
path<- table(geneset$gene_symbol, geneset$gs_name)
path <- (path > 0) * 1L
wJ_G <- weightedJaccard_G(model,path)
## Warning in stats::pgamma(wJ, shape = null.E^2/null.var, rate = null.E/null.var,
## : NaNs produced
## Warning in sqrt(null.E2 - null.E^2): NaNs produced
wJ_G_plot <- visualize_jaccard(wJ_G)
draw(wJ_G_plot$heatmap)
We observe association between the GP1 and GP9 with cytotoxic pathways, while GP10 is associated with actin cytoskeleton. We can visualize the imputed expression of these gene programs.
vec_lvs<- c("LV1", "LV9", "LV10")
n_genes<- 10
meta_heat<- meta[,c("Experiment", "Sample")]
ht<- visualize_Yp(model, vec_lvs=vec_lvs, n_genes=10, meta_heat=meta_heat, wj= wJ_G, path=path)
## Gene LGALS1 is a member of: LV1,LV10
## Only the highest association is kept, which is for LV10
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht)
saveRDS(model, file="model.rds")
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] msigdbr_26.1.0 scConvexNMF_0.0.0.9000 pbmcsca.SeuratData_3.0.0
## [4] SeuratData_0.2.2.9002 Seurat_5.1.0 SeuratObject_5.0.2
## [7] sp_2.1-4 ggplot2_3.5.1 circlize_0.4.17
## [10] ComplexHeatmap_2.22.0 quadprog_1.5-8 Matrix_1.7-2
## [13] Rcpp_1.0.13
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 jsonlite_1.8.9 shape_1.4.6.1
## [4] magrittr_2.0.3 spatstat.utils_3.1-1 farver_2.1.2
## [7] rmarkdown_2.28 GlobalOptions_0.1.3 vctrs_0.6.5
## [10] ROCR_1.0-11 Cairo_1.6-5 spatstat.explore_3.3-3
## [13] htmltools_0.5.8.1 curl_5.2.3 sass_0.4.9
## [16] sctransform_0.4.1 parallelly_1.40.1 KernSmooth_2.23-26
## [19] bslib_0.8.0 htmlwidgets_1.6.4 ica_1.0-3
## [22] plyr_1.8.9 plotly_4.10.4 zoo_1.8-12
## [25] cachem_1.1.0 igraph_2.1.1 mime_0.12
## [28] lifecycle_1.0.4 iterators_1.0.14 pkgconfig_2.0.3
## [31] R6_2.5.1 fastmap_1.2.0 fitdistrplus_1.2-1
## [34] future_1.34.0 shiny_1.9.1 clue_0.3-68
## [37] digest_0.6.37 colorspace_2.1-1 patchwork_1.3.0
## [40] S4Vectors_0.44.0 tensor_1.5 RSpectra_0.16-2
## [43] irlba_2.3.5.1 progressr_0.15.1 fansi_1.0.6
## [46] spatstat.sparse_3.1-0 httr_1.4.7 polyclip_1.10-7
## [49] abind_1.4-8 compiler_4.4.3 withr_3.0.1
## [52] doParallel_1.0.17 fastDummies_1.7.4 highr_0.11
## [55] MASS_7.3-64 rappdirs_0.3.3 rjson_0.2.23
## [58] tools_4.4.3 lmtest_0.9-40 httpuv_1.6.15
## [61] future.apply_1.11.3 goftest_1.2-3 glue_1.8.0
## [64] nlme_3.1-167 promises_1.3.0 Rtsne_0.17
## [67] cluster_2.1.8 reshape2_1.4.4 generics_0.1.3
## [70] gtable_0.3.6 spatstat.data_3.1-4 tidyr_1.3.1
## [73] data.table_1.15.4 utf8_1.2.4 BiocGenerics_0.52.0
## [76] spatstat.geom_3.3-4 RcppAnnoy_0.0.22 ggrepel_0.9.6
## [79] RANN_2.6.2 foreach_1.5.2 pillar_1.9.0
## [82] stringr_1.5.1 babelgene_22.9 spam_2.11-0
## [85] RcppHNSW_0.6.0 later_1.3.2 splines_4.4.3
## [88] dplyr_1.1.4 lattice_0.22-5 deldir_2.0-4
## [91] survival_3.8-3 tidyselect_1.2.1 miniUI_0.1.1.1
## [94] pbapply_1.7-2 knitr_1.48 gridExtra_2.3
## [97] IRanges_2.40.0 scattermore_1.2 stats4_4.4.3
## [100] xfun_0.48 matrixStats_1.4.1 stringi_1.8.4
## [103] lazyeval_0.2.2 yaml_2.3.10 evaluate_1.0.1
## [106] codetools_0.2-19 tibble_3.2.1 cli_3.6.3
## [109] uwot_0.2.2 xtable_1.8-4 reticulate_1.40.0
## [112] munsell_0.5.1 jquerylib_0.1.4 spatstat.random_3.3-2
## [115] globals_0.16.3 png_0.1-8 spatstat.univar_3.1-1
## [118] parallel_4.4.3 assertthat_0.2.1 dotCall64_1.2
## [121] listenv_0.9.1 viridisLite_0.4.2 scales_1.3.0
## [124] ggridges_0.5.6 leiden_0.4.3.1 purrr_1.0.2
## [127] crayon_1.5.3 GetoptLong_1.1.0 rlang_1.1.4
## [130] cowplot_1.1.3