Loading libraries

library(SingleCellExperiment)
library(ggplot2)
library(Matrix)
library(pheatmap)
library(RColorBrewer)
library(uwot)
library(batchelor)
library(GEDI)
set.seed(43)

Functions

#' Function to return object for ggplot2 plotting
#' @param size_text Size of the text to use for plotting
#' @param leg_pos Position of legend
#' @return ggplot theme object
theme_plot <- function(size_text=15,
                           leg_pos="right",
                           size_text_titles=NULL,
                           classic=FALSE,
                           line_size=0.85,
                           line_size_color="black",
                           axis_ticks_length=0.25,
                           axis_ticks_color="black",
                           rotate_x_axis=TRUE
                           ){
    require(ggplot2)
    if( is.null(size_text_titles) ){
        size_text_titles<- size_text + 2
    }
    if( classic ){
        gg.plots<- theme_classic()
    }else{
        gg.plots<- theme_bw()
    }
    gg.plots <- gg.plots +
        theme(axis.line=element_line(linewidth=line_size, colour=line_size_color),
              axis.ticks=element_line(colour=axis_ticks_color, linewidth=line_size),
              axis.ticks.length=unit(axis_ticks_length, "cm"),
              axis.title.x=element_text(size=size_text_titles),
              legend.position=leg_pos,
              plot.title=element_text(hjust=0.5, size=size_text),
              axis.text.y=element_text(size=size_text, color="black"),
              axis.title.y=element_text(size=size_text_titles) )
    if( rotate_x_axis) {
        gg.plots<- gg.plots +
            theme(axis.text.x=element_text(size=size_text, angle=45, hjust=1, color=line_size_color) )
    }else{
        gg.plots<- gg.plots +
            theme(axis.text.x = element_text(size=size_text, angle = 90, vjust = 0.5, hjust=1, color=line_size_color))
    }
}

dotplot<- function(vec_genes, var_de, meta, norm_counts){
 
    # Ensure same ordering between counts and metadata
    groups <- factor(meta[, var_de])
    ngroups <- length(levels(groups))
    
    # ---- Mean expression ----
    # Sum per group
    sum_dat <- rowsum(t(norm_counts[vec_genes, ,drop=F]), group = groups)   # genes x groups (transposed once)
    # Divide by group sizes
    mean_dat <- t(sum_dat / as.vector(table(groups)))
    colnames(mean_dat) <- levels(groups)

    # ---- % expressed cells ----
    is_expressed <- (norm_counts[vec_genes,,drop=F ] > 0) * 1
    expr_sum <- rowsum(t(is_expressed), group = groups)
    per_dat <- t(expr_sum / as.vector(table(groups)) * 100)
    colnames(per_dat) <- levels(groups)

    df_mean<- reshape2::melt(as.matrix(mean_dat))
    df_per<- reshape2::melt(as.matrix(per_dat))

    colnames(df_mean)<- c("Gene", "Celltype", "Mean")
    colnames(df_per)<- c("Gene", "Celltype", "Percentage")

    df_plot<- cbind(df_mean, df_per[,3,drop=F])

    # Dot plot
    dot.scale<- 5
    scale.by<- 'size'
    scale.func <- switch(
        EXPR = scale.by,
        'size' = scale_size,
        'radius' = scale_radius,
        stop("'scale.by' must be either 'size' or 'radius'")
    )
    scale_dot_min<- 5
    scale_dot_max<- NA
    
    # Setting up breaks and limits
    my_breaks<- seq(0, ceiling(max(df_plot$Mean)), length.out = 7)

    # Gene order
    df_plot$Gene<- as.character( df_plot$Gene )
    df_plot$Gene<-factor(df_plot$Gene, levels=rev(vec_genes))

    # Cell order
    df_plot$Celltype<- factor(df_plot$Celltype, levels=rev(levels(groups)))

    # ggplot2
    ggp <- ggplot(data = df_plot, mapping = aes(x = Celltype, y = Gene)) +
        geom_point(mapping = aes(size = Percentage, color = Mean)) +
        scale_color_gradientn(colours=c("light grey", "yellow", "red", "#b00b1e"), breaks=my_breaks)+
        scale.func(range = c(0, dot.scale), limits=c(scale_dot_min, scale_dot_max)) +
        coord_flip()+
        labs(color="Log2(Mean)")+
        theme_plot(10, leg_pos="right", classic=T,rotate_x_axis=F)

    # List of results
    list_results<- list(mean_dat=mean_dat, per_dat=per_dat, ggp=ggp)
    return(list_results)
}

Loading data

The gene expression data can be found at the GEO GSE328575.

# Reading SCE object
sce<- readRDS("sce.rds")
meta<- colData(sce)

UMAP of tumor cells

We will also load the GEDI model for the integrated tumor data, which can be accessed in here.

model<- readRDS("gedi_model_tumor.rds")

dim(meta)

## [1] 85563    33

length(model$aux$cellIDs)

## [1] 62739

length(intersect(model$aux$cellIDs, rownames(meta)))

## [1] 62739

# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]
sce<- sce[,rownames(meta)]
# Check H matrix
t(model$aux$inputH)

##                   (Intercept) tumor_sitemetastasis
## PID003_TP1_T1_1_1           1                    0
## PID005_TM1_T1_1_1           1                    1
## PID008_TP1_T1_1_1           1                    0
## PID019_TP1_T1_1_1           1                    0
## PID020_TP1_T1_1_1           1                    0
## PID021_TM1_T1_1_1           1                    1
## PID022_TM1_T1_1_1           1                    1
## PID023_TP1_T1_1_1           1                    0
## PID024_TP1_T1_1_1           1                    0
## PID025_TM1_T1_1_1           1                    1
## PID026_TP1_T1_1_1           1                    0
## PID028_TP1_T1_1_1           1                    0
## PID030_TM1_T1_1_1           1                    1
## PID031_TM1_T1_1_1           1                    1
## PID027_TP1_T1_1_1           1                    0
## PID029_TP1_T1_1_1           1                    0

# Project the vector field on the SVD space
vectorField <- svd.vectorField.gedi(
    model, start.cond = c(1,0), end.cond = c(1,1) )

# Euclidean distance
set.seed(43)
umap_vectorField <- umap(
    vectorField$v %*% diag(vectorField$d),
    a=10, b=2,
    #min_dist=0.01,
    metric="euclidean")

Plot embedding

ggp<- plot_embedding(umap_vectorField[vectorField$embedding_indices,],
                     meta$cell_type) +
    theme_void() +
    theme(legend.position="right")
print(ggp)

Dotplot

Get normalized counts

sce<- multiBatchNorm(sce, batch=sce$Sample)
norm_counts<- as.matrix( assay(sce, "logcounts") )

## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 14.2 GiB

# Adjust name
rownames(norm_counts)[rownames(norm_counts) == "RGS5_ENSG00000143248" ]<- "RGS5"

# Filter and order names
meta_plot<- meta[!meta$cell_type %in% c("Malignant", "Normal","Unclassified"),]
celltype_order<- c("T_helper", "T_reg", "T_cytotoxic", "NK", "B", "B_plasma", "pDC", "Mast" ,"DC", "Mac","Mono.Mac","Mono","Myelin", "Endothelial", "Pericyte", "Fibroblast" )
meta_plot$cell_type<- factor(meta_plot$cell_type, levels=rev(celltype_order ))
norm_counts<- norm_counts[,rownames(meta_plot)]

# Vector of genes
vec_genes<- c('PTPRC', # General Immune              
              'CD3D', 'CD3E', # General T cells
              'IL7R', # CD4 T helper
              'FOXP3', 'CTLA4', # CD4 T reg              
              'CD8A', 'CD8B', # CD8 T cells
              'GZMK', 'NKG7', # Cytotoxic markers
              'KLRD1', 'GNLY', # NK
              'MS4A1', 'BANK1', 'TNFRSF13C', # B cell markers
              'CD79A', # All B lineage
              'FCRL5', 'MZB1', 'DERL3', 'JCHAIN', # B plasma
              'SCT', 'IL3RA', 'LILRA4', # pDC                
              'KIT','TPSB2','CPA3', # Mast cells
              'FCER1A', 'CD1C', # DC              
              #'FCGR3A',# CD16 (non classical monocytes)
              'C1QA', 'C1QB', 'C1QC', # Macrophages
              'CD14', 'FCN1','S100A9', # Monocytes
              'LYZ', # pan-myeloid 
              'MOG', 'MAG', 'CLDN11', # Myelin cells
              'CDH5', 'PTPRB', 'CD34','VWF','FLT1', 'PLVAP', # Endothelial cells
              'RGS5', 'PDGFRB','ACTA2', 'NOTCH3', # Pericytes
              'COL1A2',  'PDGFRA', 'LUM' # Fibroblast
              )

res_dotplot<- dotplot(vec_genes, var_de="cell_type", meta_plot, norm_counts)
res_dotplot$ggp

## Warning: Removed 545 rows containing missing values or values outside the scale
## range (`geom_point()`).

UMAP of malignant cells

# Let's delete previous model and reload objects
rm(model)
# Reading SCE object
sce<- readRDS("sce.rds")
meta<- colData(sce)

We will also load the GEDI model for the integrated malignant RCC data, which can be accessed in here.

model<- readRDS("gedi_model_malignant.rds")

dim(meta)

## [1] 85563    33

length(model$aux$cellIDs)

## [1] 44739

length(intersect(model$aux$cellIDs, rownames(meta)))

## [1] 44739

# reorder meta based on GEDI order
meta<- meta[model$aux$cellIDs,]
sce<- sce[,rownames(meta)]

# Generating svd
svd_res <- svd.gedi( model )
embedding_res_svd<- svd_res$v %*% diag(svd_res$d)
colnames(embedding_res_svd)<- paste0("embedding", 1:ncol(embedding_res_svd))

# Generating umap of 2 dimensions
umap_2_res <- umap(embedding_res_svd, min_dist=0.01, metric="euclidean")
colnames(umap_2_res)<- paste0("umap", 1:2)
rownames(umap_2_res)<- model$aux$cellIDs
umap_2_res<- data.frame(umap_2_res)

## Plot embeddings
plot_embedding( umap_2_res, meta$Patient) + labs(x="umap1", y="umap2", color="Patient") +
    theme_void() +
    theme(legend.position="right")

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: AlmaLinux 9.6 (Sage Margay)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS IMKL;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Canada/Eastern
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] GEDI_1.0.2                  batchelor_1.18.1           
##  [3] uwot_0.1.16                 RColorBrewer_1.1-3         
##  [5] pheatmap_1.0.12             Matrix_1.6-4               
##  [7] ggplot2_3.5.2               SingleCellExperiment_1.24.0
##  [9] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [11] GenomicRanges_1.54.1        GenomeInfoDb_1.38.5        
## [13] IRanges_2.36.0              S4Vectors_0.40.2           
## [15] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
## [17] matrixStats_1.2.0          
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4              xfun_0.41                
##  [3] bslib_0.6.1               lattice_0.22-5           
##  [5] vctrs_0.6.5               tools_4.3.1              
##  [7] bitops_1.0-7              generics_0.1.3           
##  [9] parallel_4.3.1            tibble_3.2.1             
## [11] fansi_1.0.6               highr_0.10               
## [13] pkgconfig_2.0.3           BiocNeighbors_1.20.2     
## [15] sparseMatrixStats_1.14.0  lifecycle_1.0.4          
## [17] GenomeInfoDbData_1.2.11   stringr_1.5.1            
## [19] farver_2.1.1              compiler_4.3.1           
## [21] munsell_0.5.0             codetools_0.2-19         
## [23] htmltools_0.5.7           sass_0.4.8               
## [25] RCurl_1.98-1.14           yaml_2.3.8               
## [27] pillar_1.9.0              crayon_1.5.2             
## [29] jquerylib_0.1.4           BiocParallel_1.36.0      
## [31] DelayedArray_0.28.0       cachem_1.0.8             
## [33] abind_1.4-5               rsvd_1.0.5               
## [35] tidyselect_1.2.0          digest_0.6.33            
## [37] stringi_1.8.3             reshape2_1.4.4           
## [39] BiocSingular_1.18.0       dplyr_1.1.4              
## [41] labeling_0.4.3            fastmap_1.1.1            
## [43] grid_4.3.1                colorspace_2.1-0         
## [45] cli_3.6.2                 SparseArray_1.2.3        
## [47] magrittr_2.0.3            S4Arrays_1.2.0           
## [49] utf8_1.2.4                withr_2.5.2              
## [51] DelayedMatrixStats_1.24.0 scales_1.3.0             
## [53] rmarkdown_2.29            XVector_0.42.0           
## [55] igraph_2.1.4              ResidualMatrix_1.12.0    
## [57] RcppEigen_0.3.3.9.4       beachmat_2.18.0          
## [59] ScaledMatrix_1.10.0       evaluate_0.23            
## [61] knitr_1.45                RcppAnnoy_0.0.21         
## [63] irlba_2.3.5.1             rlang_1.1.6              
## [65] Rcpp_1.0.12               scuttle_1.12.0           
## [67] glue_1.7.0                jsonlite_1.8.8           
## [69] plyr_1.8.9                R6_2.5.1                 
## [71] zlibbioc_1.48.0