Loading Libraries

library(Rcpp) # required for scConvexNMF
library(Matrix) # required for scConvexNMF
library(quadprog) # required for scConvexNMF
library(ComplexHeatmap)
library(circlize)
library(ggplot2)
library(Seurat) # Required for data 
library(SeuratData) # Required for data
library(scConvexNMF)
set.seed(43)

Load data

This tutorial will use data from the SeuratData package.

Only run for the first time.

InstallData("pbmc3k")

We’ll use data from a collection of PBMCs profiled across multiple technologies.

data("pbmcsca", message=F)
## Warning in data("pbmcsca", message = F): data set 'F' not found
pbmcsca<- UpdateSeuratObject(pbmcsca)
## Validating object structure
## Updating object slots
## Ensuring keys are in the proper structure
## Warning: Assay RNA changing from Assay to Assay
## Ensuring keys are in the proper structure
## Ensuring feature names don't have underscores or pipes
## Updating slots in RNA
## Validating object structure for Assay 'RNA'
## Object representation is consistent with the most current Seurat version
pbmcsca
## An object of class Seurat 
## 33694 features across 31021 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
##  2 layers present: counts, data
meta<- pbmcsca[[]]
rc<- pbmcsca[["RNA"]]$counts
class(meta)
## [1] "data.frame"
class(rc)
## [1] "dgCMatrix"
## attr(,"package")
## [1] "Matrix"
dim(meta)
## [1] 31021    10
dim(rc)
## [1] 33694 31021

As an example, we’ll run scConvexNMF on a small selection of cells. We’ll identify gene programs in CD4+ T cells from different technologies.

# Filter cells
table(meta$CellType)
## 
##                      B cell              CD14+ monocyte 
##                        5020                        5550 
##              CD16+ monocyte                 CD4+ T cell 
##                         804                        7391 
##            Cytotoxic T cell              Dendritic cell 
##                        9071                         433 
##               Megakaryocyte         Natural killer cell 
##                         977                        1565 
## Plasmacytoid dendritic cell                  Unassigned 
##                         164                          46
meta$Sample<- paste0(meta$orig.ident, "_", meta$Method)
meta<- meta[meta$CellType == "Cytotoxic T cell",]
cells_donor1_10x<- sample(rownames(meta)[meta$Sample == "pbmc1_10x Chromium (v2) A"], 150)
cells_donor1_dropseq<- sample(rownames(meta)[meta$Sample == "pbmc1_Drop-seq"], 150)
cells_donor1_sw<- sample(rownames(meta)[meta$Sample == "pbmc1_Seq-Well"], 150)
meta<- meta[c(cells_donor1_10x, cells_donor1_dropseq, cells_donor1_sw),]
table(meta$Sample)
## 
## pbmc1_10x Chromium (v2) A            pbmc1_Drop-seq            pbmc1_Seq-Well 
##                       150                       150                       150
# Filter genes
rc<- rc[,rownames(meta)]
sum_genes<- rowSums(rc>1)
genes_use<- sum_genes>3
table(genes_use)
## genes_use
## FALSE  TRUE 
## 31074  2620
rc<- rc[genes_use,]

Set up and run the scConvexNMF model

model <- new("scConvexNMF")
model$setup(M=rc, # Input - raw counts
            K=15, # Number of programs
            Groups=meta$Sample, # Group-specific effects
            maskingRatio=0.01 # Whether to mask some data to perform cross-validation
            )
## Setting up the scConvexNMF model...
## 0.2400343
model$initialize_LVs()
## Initializing LVs ...
model$optimize(30) # Number of iterations
##   Iteration 1/30 (total 1)...
## sigma2: 0.2371
##   Iteration 10/30 (total 10)...
## sigma2: 0.1648
##   Iteration 20/30 (total 20)...
## sigma2: 0.1276
##   Iteration 30/30 (total 30)...
## sigma2: 0.1088

Inspect output of the model

  • W: Activity of each gene program across cells
  • G: Contribution of each gene to gene program
  • H: Each entry in row k, column j, tells us how much meta-gene k contributes to the reconstruction of the expression of gene j (usually G and H are closely related: if a gene contributes a lot to a meta-gene, that meta-gene also contributes a lot to the reconstruction of the expression of that gene).
WGH <- getWGH_scCNMF(model)
W<- WGH$W[rownames(meta),]
G<- WGH$G

Gene program activities

Let’s inspect correlation between the gene programs

draw(Heatmap(cor(W))
    )

Overlap of inferred gene programs with pathways

First let’s retrieve the KEGG LEGACY genesets from msigdbr.

library(msigdbr) # Required for msigdb annotations
geneset <- data.frame(msigdbr(species = "human", collection = "C2", subcollection="CP:KEGG_LEGACY"))[,c("gene_symbol", "gs_name")]
path<- table(geneset$gene_symbol, geneset$gs_name)
path <- (path > 0) * 1L 
wJ_G <- weightedJaccard_G(model,path)
## Warning in stats::pgamma(wJ, shape = null.E^2/null.var, rate = null.E/null.var,
## : NaNs produced
## Warning in sqrt(null.E2 - null.E^2): NaNs produced
wJ_G_plot <- visualize_jaccard(wJ_G)
draw(wJ_G_plot$heatmap)

We observe association between the GP1 and GP9 with cytotoxic pathways, while GP10 is associated with actin cytoskeleton. We can visualize the imputed expression of these gene programs.

vec_lvs<- c("LV1", "LV9", "LV10")
n_genes<- 10
meta_heat<- meta[,c("Experiment", "Sample")]
ht<- visualize_Yp(model, vec_lvs=vec_lvs, n_genes=10, meta_heat=meta_heat, wj= wJ_G, path=path)
## Gene LGALS1 is a member of: LV1,LV10
## Only the highest association is kept, which is for LV10
## 'magick' package is suggested to install to give better rasterization.
## 
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht)

Save objects

saveRDS(model, file="model.rds")
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] msigdbr_26.1.0           scConvexNMF_0.0.0.9000   pbmcsca.SeuratData_3.0.0
##  [4] SeuratData_0.2.2.9002    Seurat_5.1.0             SeuratObject_5.0.2      
##  [7] sp_2.1-4                 ggplot2_3.5.1            circlize_0.4.17         
## [10] ComplexHeatmap_2.22.0    quadprog_1.5-8           Matrix_1.7-2            
## [13] Rcpp_1.0.13             
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     jsonlite_1.8.9         shape_1.4.6.1         
##   [4] magrittr_2.0.3         spatstat.utils_3.1-1   farver_2.1.2          
##   [7] rmarkdown_2.28         GlobalOptions_0.1.3    vctrs_0.6.5           
##  [10] ROCR_1.0-11            Cairo_1.6-5            spatstat.explore_3.3-3
##  [13] htmltools_0.5.8.1      curl_5.2.3             sass_0.4.9            
##  [16] sctransform_0.4.1      parallelly_1.40.1      KernSmooth_2.23-26    
##  [19] bslib_0.8.0            htmlwidgets_1.6.4      ica_1.0-3             
##  [22] plyr_1.8.9             plotly_4.10.4          zoo_1.8-12            
##  [25] cachem_1.1.0           igraph_2.1.1           mime_0.12             
##  [28] lifecycle_1.0.4        iterators_1.0.14       pkgconfig_2.0.3       
##  [31] R6_2.5.1               fastmap_1.2.0          fitdistrplus_1.2-1    
##  [34] future_1.34.0          shiny_1.9.1            clue_0.3-68           
##  [37] digest_0.6.37          colorspace_2.1-1       patchwork_1.3.0       
##  [40] S4Vectors_0.44.0       tensor_1.5             RSpectra_0.16-2       
##  [43] irlba_2.3.5.1          progressr_0.15.1       fansi_1.0.6           
##  [46] spatstat.sparse_3.1-0  httr_1.4.7             polyclip_1.10-7       
##  [49] abind_1.4-8            compiler_4.4.3         withr_3.0.1           
##  [52] doParallel_1.0.17      fastDummies_1.7.4      highr_0.11            
##  [55] MASS_7.3-64            rappdirs_0.3.3         rjson_0.2.23          
##  [58] tools_4.4.3            lmtest_0.9-40          httpuv_1.6.15         
##  [61] future.apply_1.11.3    goftest_1.2-3          glue_1.8.0            
##  [64] nlme_3.1-167           promises_1.3.0         Rtsne_0.17            
##  [67] cluster_2.1.8          reshape2_1.4.4         generics_0.1.3        
##  [70] gtable_0.3.6           spatstat.data_3.1-4    tidyr_1.3.1           
##  [73] data.table_1.15.4      utf8_1.2.4             BiocGenerics_0.52.0   
##  [76] spatstat.geom_3.3-4    RcppAnnoy_0.0.22       ggrepel_0.9.6         
##  [79] RANN_2.6.2             foreach_1.5.2          pillar_1.9.0          
##  [82] stringr_1.5.1          babelgene_22.9         spam_2.11-0           
##  [85] RcppHNSW_0.6.0         later_1.3.2            splines_4.4.3         
##  [88] dplyr_1.1.4            lattice_0.22-5         deldir_2.0-4          
##  [91] survival_3.8-3         tidyselect_1.2.1       miniUI_0.1.1.1        
##  [94] pbapply_1.7-2          knitr_1.48             gridExtra_2.3         
##  [97] IRanges_2.40.0         scattermore_1.2        stats4_4.4.3          
## [100] xfun_0.48              matrixStats_1.4.1      stringi_1.8.4         
## [103] lazyeval_0.2.2         yaml_2.3.10            evaluate_1.0.1        
## [106] codetools_0.2-19       tibble_3.2.1           cli_3.6.3             
## [109] uwot_0.2.2             xtable_1.8-4           reticulate_1.40.0     
## [112] munsell_0.5.1          jquerylib_0.1.4        spatstat.random_3.3-2 
## [115] globals_0.16.3         png_0.1-8              spatstat.univar_3.1-1 
## [118] parallel_4.4.3         assertthat_0.2.1       dotCall64_1.2         
## [121] listenv_0.9.1          viridisLite_0.4.2      scales_1.3.0          
## [124] ggridges_0.5.6         leiden_0.4.3.1         purrr_1.0.2           
## [127] crayon_1.5.3           GetoptLong_1.1.0       rlang_1.1.4           
## [130] cowplot_1.1.3