Loading libraries

library(SingleCellExperiment)
library(ggplot2)
library(Matrix)
library(pheatmap)
library(RColorBrewer)
library(uwot)
library(lsa)
library(GEDI)
library(batchelor)
set.seed(43)

Functions

#' Function to return object for ggplot2 plotting
#' @param size_text Size of the text to use for plotting
#' @param leg_pos Position of legend
#' @return ggplot theme object
theme_plot <- function(size_text=15,
                           leg_pos="right",
                           size_text_titles=NULL,
                           classic=FALSE,
                           line_size=0.85,
                           line_size_color="black",
                           axis_ticks_length=0.25,
                           axis_ticks_color="black",
                           rotate_x_axis=TRUE
                           ){
    require(ggplot2)
    if( is.null(size_text_titles) ){
        size_text_titles<- size_text + 2
    }
    if( classic ){
        gg.plots<- theme_classic()
    }else{
        gg.plots<- theme_bw()
    }
    gg.plots <- gg.plots +
        theme(axis.line=element_line(linewidth=line_size, colour=line_size_color),
              axis.ticks=element_line(colour=axis_ticks_color, linewidth=line_size),
              axis.ticks.length=unit(axis_ticks_length, "cm"),
              axis.title.x=element_text(size=size_text_titles),
              legend.position=leg_pos,
              plot.title=element_text(hjust=0.5, size=size_text),
              axis.text.y=element_text(size=size_text, color="black"),
              axis.title.y=element_text(size=size_text_titles) )
    if( rotate_x_axis) {
        gg.plots<- gg.plots +
            theme(axis.text.x=element_text(size=size_text, angle=45, hjust=1, color=line_size_color) )
    }else{
        gg.plots<- gg.plots +
            theme(axis.text.x = element_text(size=size_text, angle = 90, vjust = 0.5, hjust=1, color=line_size_color))
    }
}

Loading data

The gene expression data can be found at the GEO GSE328575.

# Reading SCE object
sce<- readRDS("sce.rds")
meta_sc<- colData(sce)

Read TCGA data The TCGA data can be found at Zenodo.

# Reading SCE object
lis_tcga<- readRDS("lis_tcga.rds")

meta_tc<- lis_tcga$meta

ccpRCT signature

x<- meta_tc$ccpRCT_sig
meta_tc$scale_ccpRCT_sig<- (x - min(x) ) / ( max(x) - min(x) )
ggplot(meta_tc, aes(Molecular_clusters, scale_ccpRCT_sig, fill=Molecular_clusters) ) +
    geom_violin(trim=T,scale="width") +
    theme_plot(rotate_x_axis=F, classic=T)

Cosine similarity

We will also load the GEDI model for the integrated tumor data, which can be accessed in here.

model<- readRDS("gedi_model_rcc_tcga.rds")
tcga_indices <- 1:model$aux$Ni[[1]]
RCC_indices <- (model$aux$Ni[[1]]+1):(sum(model$aux$Ni))

# Reorder metadatas
meta_tc<- meta_tc[model$aux$cellIDs[tcga_indices],]
meta_sc<- meta_sc[model$aux$cellIDs[RCC_indices],]

Calculate the mean embedding vector per TCGA cluster, followed by cosine similarity of each single cell to that vector.

target<- 22
snn_clusters <- meta_tc$RNA_snn_res.4
names(snn_clusters)<- rownames(meta_tc)

reference_vector <- rowMeans(model$params$Bi[[1]][,which(snn_clusters==target)])

predicted_scores <- apply(model$params$Bi[[2]],2,cosine,y=reference_vector)

meta_sc$subtype<- factor(meta_sc$subtype, levels=c("ccRCC", "pRCC", "uRCC", "chRCC", "ccpRCT") )
ggplot(data.frame(x=meta_sc$subtype, y=predicted_scores), aes(x=x,y=y, fill=x)) +
    geom_violin(trim=T,scale="width") +
    ggtitle(target) +
    theme_plot(rotate_x_axis=F, classic=T)

Inversely, define the reference vector based on scRNA-seq data, and calculate cosine similarity with TCGA samples

predicted_scores <- apply(model$params$Bi[[1]],2,cosine,y=reference_vector)

# Violin plot
ggdat<- data.frame(x=snn_clusters, y=predicted_scores, subtype=meta_tc$Molecular_clusters)
ggplot(ggdat, aes(x=subtype,y=y, fill=subtype)) +
    geom_violin(trim=T,scale="width") +
    theme_plot(rotate_x_axis=F, classic=T)

Expression sc

sce<- multiBatchNorm(sce, batch=sce$Sample)
norm_counts<- as.matrix( assay(sce, "logcounts") )
## Warning in asMethod(object): sparse->dense coercion: allocating vector of size
## 19.4 GiB
meta_sc<- data.frame(colData(sce))

# Filter for only tumor
meta_sc<- meta_sc[meta_sc$class == "patient",]

meta_sc$cell_subtype<- meta_sc$cell_type
meta_sc$cell_subtype[meta_sc$cell_type == "Malignant" & meta_sc$subtype == "ccpRCT"]<- "Malignant.ccpRCT"
meta_sc$cell_subtype[meta_sc$cell_type == "Malignant" & meta_sc$subtype == "ccRCC"]<- "Malignant.ccRCC"
meta_sc$cell_subtype[meta_sc$cell_type == "Malignant" & meta_sc$subtype == "chRCC"]<- "Malignant.chRCC"
meta_sc$cell_subtype[meta_sc$cell_type == "Malignant" & meta_sc$subtype == "pRCC"]<- "Malignant.pRCC"
meta_sc$cell_subtype[meta_sc$cell_type == "Malignant" & meta_sc$subtype == "uRCC"]<- "Malignant.uRCC"

norm_counts<- norm_counts[,rownames(meta_sc)]

df_plot<- data.frame(expr=norm_counts["CASP14",],
                     cell_subtype=meta_sc$cell_subtype)

ggplot(df_plot, aes(cell_subtype, expr, fill=cell_subtype) ) +
    geom_violin(trim=T,scale="width") +
    theme_plot(rotate_x_axis=F, classic=T)

Expression TCGA

meta_tc<- lis_tcga$meta
norm_data<- lis_tcga$norm_data

df_plot<- data.frame(expr=norm_data["CASP14",],
                     Molecular_subtype=meta_tc$Molecular_clusters)

ggplot(df_plot, aes(Molecular_subtype, expr, fill=Molecular_subtype) ) +
    geom_violin(trim=T,scale="width") +
    theme_plot(rotate_x_axis=F, classic=T)

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: AlmaLinux 9.6 (Sage Margay)
## 
## Matrix products: default
## BLAS/LAPACK: FlexiBLAS IMKL;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Canada/Eastern
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] batchelor_1.18.1            GEDI_1.0.2                 
##  [3] lsa_0.73.3                  SnowballC_0.7.1            
##  [5] uwot_0.1.16                 RColorBrewer_1.1-3         
##  [7] pheatmap_1.0.12             Matrix_1.6-4               
##  [9] ggplot2_3.5.2               SingleCellExperiment_1.24.0
## [11] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [13] GenomicRanges_1.54.1        GenomeInfoDb_1.38.5        
## [15] IRanges_2.36.0              S4Vectors_0.40.2           
## [17] BiocGenerics_0.48.1         MatrixGenerics_1.14.0      
## [19] matrixStats_1.2.0          
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.4              xfun_0.41                
##  [3] bslib_0.6.1               lattice_0.22-5           
##  [5] vctrs_0.6.5               tools_4.3.1              
##  [7] bitops_1.0-7              generics_0.1.3           
##  [9] parallel_4.3.1            tibble_3.2.1             
## [11] fansi_1.0.6               highr_0.10               
## [13] pkgconfig_2.0.3           BiocNeighbors_1.20.2     
## [15] sparseMatrixStats_1.14.0  lifecycle_1.0.4          
## [17] GenomeInfoDbData_1.2.11   farver_2.1.1             
## [19] compiler_4.3.1            munsell_0.5.0            
## [21] codetools_0.2-19          htmltools_0.5.7          
## [23] sass_0.4.8                RCurl_1.98-1.14          
## [25] yaml_2.3.8                pillar_1.9.0             
## [27] crayon_1.5.2              jquerylib_0.1.4          
## [29] BiocParallel_1.36.0       DelayedArray_0.28.0      
## [31] cachem_1.0.8              abind_1.4-5              
## [33] rsvd_1.0.5                tidyselect_1.2.0         
## [35] digest_0.6.33             BiocSingular_1.18.0      
## [37] dplyr_1.1.4               labeling_0.4.3           
## [39] fastmap_1.1.1             grid_4.3.1               
## [41] colorspace_2.1-0          cli_3.6.2                
## [43] SparseArray_1.2.3         magrittr_2.0.3           
## [45] S4Arrays_1.2.0            utf8_1.2.4               
## [47] withr_2.5.2               DelayedMatrixStats_1.24.0
## [49] scales_1.3.0              rmarkdown_2.29           
## [51] XVector_0.42.0            igraph_2.1.4             
## [53] ResidualMatrix_1.12.0     RcppEigen_0.3.3.9.4      
## [55] beachmat_2.18.0           ScaledMatrix_1.10.0      
## [57] evaluate_0.23             knitr_1.45               
## [59] irlba_2.3.5.1             rlang_1.1.6              
## [61] Rcpp_1.0.12               scuttle_1.12.0           
## [63] glue_1.7.0                jsonlite_1.8.8           
## [65] R6_2.5.1                  zlibbioc_1.48.0