library(SingleCellExperiment)
library(ggplot2)
library(Matrix)
library(pheatmap)
library(RColorBrewer)
library(uwot)
library(batchelor)
library(GEDI)
set.seed(43)
#' Function to return object for ggplot2 plotting
#' @param size_text Size of the text to use for plotting
#' @param leg_pos Position of legend
#' @return ggplot theme object
theme_plot <- function(size_text=15,
leg_pos="right",
size_text_titles=NULL,
classic=FALSE,
line_size=0.85,
line_size_color="black",
axis_ticks_length=0.25,
axis_ticks_color="black",
rotate_x_axis=TRUE
){
require(ggplot2)
if( is.null(size_text_titles) ){
size_text_titles<- size_text + 2
}
if( classic ){
gg.plots<- theme_classic()
}else{
gg.plots<- theme_bw()
}
gg.plots <- gg.plots +
theme(axis.line=element_line(linewidth=line_size, colour=line_size_color),
axis.ticks=element_line(colour=axis_ticks_color, linewidth=line_size),
axis.ticks.length=unit(axis_ticks_length, "cm"),
axis.title.x=element_text(size=size_text_titles),
legend.position=leg_pos,
plot.title=element_text(hjust=0.5, size=size_text),
axis.text.y=element_text(size=size_text, color="black"),
axis.title.y=element_text(size=size_text_titles) )
if( rotate_x_axis) {
gg.plots<- gg.plots +
theme(axis.text.x=element_text(size=size_text, angle=45, hjust=1, color=line_size_color) )
}else{
gg.plots<- gg.plots +
theme(axis.text.x = element_text(size=size_text, angle = 90, vjust = 0.5, hjust=1, color=line_size_color))
}
}
dotplot<- function(vec_genes, var_de, meta, norm_counts){
# Ensure same ordering between counts and metadata
groups <- factor(meta[, var_de])
ngroups <- length(levels(groups))
# ---- Mean expression ----
# Sum per group
sum_dat <- rowsum(t(norm_counts[vec_genes, ,drop=F]), group = groups) # genes x groups (transposed once)
# Divide by group sizes
mean_dat <- t(sum_dat / as.vector(table(groups)))
colnames(mean_dat) <- levels(groups)
# ---- % expressed cells ----
is_expressed <- (norm_counts[vec_genes,,drop=F ] > 0) * 1
expr_sum <- rowsum(t(is_expressed), group = groups)
per_dat <- t(expr_sum / as.vector(table(groups)) * 100)
colnames(per_dat) <- levels(groups)
df_mean<- reshape2::melt(as.matrix(mean_dat))
df_per<- reshape2::melt(as.matrix(per_dat))
colnames(df_mean)<- c("Gene", "Celltype", "Mean")
colnames(df_per)<- c("Gene", "Celltype", "Percentage")
df_plot<- cbind(df_mean, df_per[,3,drop=F])
# Dot plot
dot.scale<- 5
scale.by<- 'size'
scale.func <- switch(
EXPR = scale.by,
'size' = scale_size,
'radius' = scale_radius,
stop("'scale.by' must be either 'size' or 'radius'")
)
scale_dot_min<- 5
scale_dot_max<- NA
# Setting up breaks and limits
my_breaks<- seq(0, ceiling(max(df_plot$Mean)), length.out = 7)
# Gene order
df_plot$Gene<- as.character( df_plot$Gene )
df_plot$Gene<-factor(df_plot$Gene, levels=rev(vec_genes))
# Cell order
df_plot$Celltype<- factor(df_plot$Celltype, levels=rev(levels(groups)))
# ggplot2
ggp <- ggplot(data = df_plot, mapping = aes(x = Celltype, y = Gene)) +
geom_point(mapping = aes(size = Percentage, color = Mean)) +
scale_color_gradientn(colours=c("light grey", "yellow", "red", "#b00b1e"), breaks=my_breaks)+
scale.func(range = c(0, dot.scale), limits=c(scale_dot_min, scale_dot_max)) +
coord_flip()+
labs(color="Log2(Mean)")+
theme_plot(10, leg_pos="right", classic=T,rotate_x_axis=F)
# List of results
list_results<- list(mean_dat=mean_dat, per_dat=per_dat, ggp=ggp)
return(list_results)
}
The gene expression data can be found at the GEO GSE328575.
# Reading SCE object
sce<- readRDS("sce.rds")
meta<- colData(sce)
We will also load the GEDI model for the integrated tumor data, which can be accessed in here.
model<- readRDS("gedi_model_tumor.rds")
dim(meta)
## [1] 85563 33
length(model$aux$cellIDs)
## [1] 62739
length(intersect(model$aux$cellIDs, rownames(meta)))
## [1] 62739
# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]
sce<- sce[,rownames(meta)]
# Check H matrix
t(model$aux$inputH)
## (Intercept) tumor_sitemetastasis
## PID003_TP1_T1_1_1 1 0
## PID005_TM1_T1_1_1 1 1
## PID008_TP1_T1_1_1 1 0
## PID019_TP1_T1_1_1 1 0
## PID020_TP1_T1_1_1 1 0
## PID021_TM1_T1_1_1 1 1
## PID022_TM1_T1_1_1 1 1
## PID023_TP1_T1_1_1 1 0
## PID024_TP1_T1_1_1 1 0
## PID025_TM1_T1_1_1 1 1
## PID026_TP1_T1_1_1 1 0
## PID028_TP1_T1_1_1 1 0
## PID030_TM1_T1_1_1 1 1
## PID031_TM1_T1_1_1 1 1
## PID027_TP1_T1_1_1 1 0
## PID029_TP1_T1_1_1 1 0
# Project the vector field on the SVD space
vectorField <- svd.vectorField.gedi(
model, start.cond = c(1,0), end.cond = c(1,1) )
# Euclidean distance
set.seed(43)
umap_vectorField <- umap(
vectorField$v %*% diag(vectorField$d),
a=10, b=2,
#min_dist=0.01,
metric="euclidean")
ggp<- plot_embedding(umap_vectorField[vectorField$embedding_indices,],
meta$cell_type) +
theme_void() +
theme(legend.position="right")
print(ggp)
Get normalized counts
sce<- multiBatchNorm(sce, batch=sce$Sample)
norm_counts<- as.matrix( assay(sce, "logcounts") )
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 14.2 GiB
# Adjust name
rownames(norm_counts)[rownames(norm_counts) == "RGS5_ENSG00000143248" ]<- "RGS5"
# Filter and order names
meta_plot<- meta[!meta$cell_type %in% c("Malignant", "Normal","Unclassified"),]
celltype_order<- c("T_helper", "T_reg", "T_cytotoxic", "NK", "B", "B_plasma", "pDC", "Mast" ,"DC", "Mac","Mono.Mac","Mono","Myelin", "Endothelial", "Pericyte", "Fibroblast" )
meta_plot$cell_type<- factor(meta_plot$cell_type, levels=rev(celltype_order ))
norm_counts<- norm_counts[,rownames(meta_plot)]
# Vector of genes
vec_genes<- c('PTPRC', # General Immune
'CD3D', 'CD3E', # General T cells
'IL7R', # CD4 T helper
'FOXP3', 'CTLA4', # CD4 T reg
'CD8A', 'CD8B', # CD8 T cells
'GZMK', 'NKG7', # Cytotoxic markers
'KLRD1', 'GNLY', # NK
'MS4A1', 'BANK1', 'TNFRSF13C', # B cell markers
'CD79A', # All B lineage
'FCRL5', 'MZB1', 'DERL3', 'JCHAIN', # B plasma
'SCT', 'IL3RA', 'LILRA4', # pDC
'KIT','TPSB2','CPA3', # Mast cells
'FCER1A', 'CD1C', # DC
#'FCGR3A',# CD16 (non classical monocytes)
'C1QA', 'C1QB', 'C1QC', # Macrophages
'CD14', 'FCN1','S100A9', # Monocytes
'LYZ', # pan-myeloid
'MOG', 'MAG', 'CLDN11', # Myelin cells
'CDH5', 'PTPRB', 'CD34','VWF','FLT1', 'PLVAP', # Endothelial cells
'RGS5', 'PDGFRB','ACTA2', 'NOTCH3', # Pericytes
'COL1A2', 'PDGFRA', 'LUM' # Fibroblast
)
res_dotplot<- dotplot(vec_genes, var_de="cell_type", meta_plot, norm_counts)
res_dotplot$ggp
## Warning: Removed 545 rows containing missing values or values outside the scale
## range (`geom_point()`).
# Let's delete previous model and reload objects
rm(model)
# Reading SCE object
sce<- readRDS("sce.rds")
meta<- colData(sce)
We will also load the GEDI model for the integrated malignant RCC data, which can be accessed in here.
model<- readRDS("gedi_model_malignant.rds")
dim(meta)
## [1] 85563 33
length(model$aux$cellIDs)
## [1] 44739
length(intersect(model$aux$cellIDs, rownames(meta)))
## [1] 44739
# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]
sce<- sce[,rownames(meta)]
# Generating svd
svd_res <- svd.gedi( model )
embedding_res_svd<- svd_res$v %*% diag(svd_res$d)
colnames(embedding_res_svd)<- paste0("embedding", 1:ncol(embedding_res_svd))
# Generating umap of 2 dimensions
umap_2_res <- umap(embedding_res_svd, min_dist=0.01, metric="euclidean")
colnames(umap_2_res)<- paste0("umap", 1:2)
rownames(umap_2_res)<- model$aux$cellIDs
umap_2_res<- data.frame(umap_2_res)
## Plot embeddings
plot_embedding( umap_2_res, meta$Patient) + labs(x="umap1", y="umap2", color="Patient") +
theme_void() +
theme(legend.position="right")
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: AlmaLinux 9.6 (Sage Margay)
##
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS IMKL; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_CA.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_CA.UTF-8 LC_COLLATE=en_CA.UTF-8
## [5] LC_MONETARY=en_CA.UTF-8 LC_MESSAGES=en_CA.UTF-8
## [7] LC_PAPER=en_CA.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Canada/Eastern
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] GEDI_1.0.2 batchelor_1.18.1
## [3] uwot_0.1.16 RColorBrewer_1.1-3
## [5] pheatmap_1.0.12 Matrix_1.6-4
## [7] ggplot2_3.5.2 SingleCellExperiment_1.24.0
## [9] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [11] GenomicRanges_1.54.1 GenomeInfoDb_1.38.5
## [13] IRanges_2.36.0 S4Vectors_0.40.2
## [15] BiocGenerics_0.48.1 MatrixGenerics_1.14.0
## [17] matrixStats_1.2.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.4 xfun_0.41
## [3] bslib_0.6.1 lattice_0.22-5
## [5] vctrs_0.6.5 tools_4.3.1
## [7] bitops_1.0-7 generics_0.1.3
## [9] parallel_4.3.1 tibble_3.2.1
## [11] fansi_1.0.6 highr_0.10
## [13] pkgconfig_2.0.3 BiocNeighbors_1.20.2
## [15] sparseMatrixStats_1.14.0 lifecycle_1.0.4
## [17] GenomeInfoDbData_1.2.11 stringr_1.5.1
## [19] farver_2.1.1 compiler_4.3.1
## [21] munsell_0.5.0 codetools_0.2-19
## [23] htmltools_0.5.7 sass_0.4.8
## [25] RCurl_1.98-1.14 yaml_2.3.8
## [27] pillar_1.9.0 crayon_1.5.2
## [29] jquerylib_0.1.4 BiocParallel_1.36.0
## [31] DelayedArray_0.28.0 cachem_1.0.8
## [33] abind_1.4-5 rsvd_1.0.5
## [35] tidyselect_1.2.0 digest_0.6.33
## [37] stringi_1.8.3 reshape2_1.4.4
## [39] BiocSingular_1.18.0 dplyr_1.1.4
## [41] labeling_0.4.3 fastmap_1.1.1
## [43] grid_4.3.1 colorspace_2.1-0
## [45] cli_3.6.2 SparseArray_1.2.3
## [47] magrittr_2.0.3 S4Arrays_1.2.0
## [49] utf8_1.2.4 withr_2.5.2
## [51] DelayedMatrixStats_1.24.0 scales_1.3.0
## [53] rmarkdown_2.29 XVector_0.42.0
## [55] igraph_2.1.4 ResidualMatrix_1.12.0
## [57] RcppEigen_0.3.3.9.4 beachmat_2.18.0
## [59] ScaledMatrix_1.10.0 evaluate_0.23
## [61] knitr_1.45 RcppAnnoy_0.0.21
## [63] irlba_2.3.5.1 rlang_1.1.6
## [65] Rcpp_1.0.12 scuttle_1.12.0
## [67] glue_1.7.0 jsonlite_1.8.8
## [69] plyr_1.8.9 R6_2.5.1
## [71] zlibbioc_1.48.0