In this tutorial, we are going to perform an analysis of sample-to-sample variability using the PBMC data from this publication. This is a collection of immune cell types profiled from peripheral blood from two human donors across six single-cell RNA-seq technologies (10x Chromium v2 and v3, CEL-seq2, Drop-seq, inDrops, Seq-Well and Smart-seq2).

01 - Pre-processing

Load libraries

library(scuttle)
library(scran)
library(scater)
library(uwot)
library(SingleCellExperiment)
library(ggplot2)
library(HDF5Array)
library(GEDI)
set.seed(43)

Downloading data

The raw count matrices and metadata are available through the SeuratData package:

# SeuratData provides the pbmcsca dataset used in this tutorial
library(SeuratData)
InstallData("pbmcsca") # Install the pbmcsca dataset
data("pbmcsca") # Load the pbmcsca Seurat object into the session

We will generate a Single Cell Experiment (SCE) Object.

# Per-cell metadata stored in the Seurat object; the cell barcodes (the row
# names) are additionally kept as an explicit column.
meta <- pbmcsca[[]]
meta[["Barcode"]] <- rownames(meta)

# Wrap the raw count matrix and the metadata in a SingleCellExperiment.
sce <- SingleCellExperiment(
  list(
    counts = Seurat::GetAssayData(object = pbmcsca, slot = "counts")
  ),
  colData = meta
)
sce

Quality control steps

We will perform basic QC steps using the scuttle library.

# Per-cell QC metrics, with mitochondrial genes as a named feature subset.
# The pattern is anchored with "^" so that only gene names that START with
# "MT-" are selected; an unanchored "MT-" would also match any gene that
# merely contains that substring.
sce <- addPerCellQC(
  sce,
  subsets = list(Mito = grep("^MT-", rownames(sce)))
)
# log10 of the total counts and of the number of detected features per cell
sce$log10_total_counts <- log10(sce$sum)
sce$log10_total_features_by_counts <- log10(sce$detected)

In our case, we already have a SingleCellExperiment object saved on disk, which we will load. Instructions on how to download the data can be found here.

# Load the previously saved experiment from the ./my_h5_se directory
sce<- loadHDF5SummarizedExperiment(dir="./my_h5_se")
sce$subsets_Mito_percent<- sce$pct_counts_mito # Copy under the column name that addPerCellQC would produce

Visualize the quality of the data

# QC scatter plot: total counts vs. detected features (both log10), colored
# by the mitochondrial percentage, with one panel per technology.
ggplot(
  data.frame(colData(sce)),
  aes(
    log10_total_counts,
    log10_total_features_by_counts,
    color = subsets_Mito_percent
  )
) +
  geom_point(size = 0.5) +
  viridis::scale_color_viridis() +
  # Facet on the Method column of the plotted data frame instead of the
  # external vector sce$Method, so the panels always follow the plotted rows.
  facet_wrap(~Method)

Filtering low-quality cells

# A cell is kept only if it passes all three QC thresholds:
# mitochondrial percentage <= 20, log10(detected features) > 2 and
# log10(total counts) > 2.5.
filter <- (sce$subsets_Mito_percent <= 20) &
  (sce$log10_total_features_by_counts > 2) &
  (sce$log10_total_counts > 2.5)
sce <- sce[, filter]

Filtering low-expressed genes

# Keep genes with more than 5 counts in more than 3 cells of the dataset:
# first count, for every gene, the cells exceeding 5 counts, then threshold
# that per-gene tally.
sum_genes <- rowSums(assay(sce, "counts") > 5)
genes_use <- (sum_genes > 3)
table(genes_use)

## genes_use
## FALSE  TRUE 
## 19288 14406

# Turn the logical mask into the names of the genes that passed the filter
genes_use <- names(genes_use)[which(genes_use)]

# Restrict the SCE to the retained genes and print its summary
sce <- sce[genes_use, ]
sce

## class: SingleCellExperiment 
## dim: 14406 28035 
## metadata(0):
## assays(2): counts logcounts
## rownames(14406): DPM1 SCYL3 ... RP11-107E5.4 RP11-299P2.2
## rowData names(8): is_feature_control is_feature_control_mito ...
##   total_counts log10_total_counts
## colnames(28035): pbmc1_SM2_Cell_108 pbmc1_SM2_Cell_115 ...
##   pbmc2_inDrops_1_TGAATCCT.GAGCCTTA.CCCAAGCA
##   pbmc2_inDrops_1_TGAATCCT.TTATGCGA.CATCTCCC
## colData names(51): orig.ident nCount_RNA ... sizeFactor
##   subsets_Mito_percent
## reducedDimNames(0):
## altExpNames(0):

02 - Run GEDI

After pre-processing, we are ready to run GEDI. First, we need to indicate a vector of samples. We will create a vector that indicates each sample, which in this case is a combination of the donor and single-cell technology used.

# Sample label = donor and technology joined by a dot
sce$donor_sample <- paste(sce$orig.ident, sce$Method, sep = ".")
# Plain data.frame copy of the per-cell metadata (now including donor_sample)
meta <- data.frame(colData(sce))

IMPORTANT: The vector of samples (and potentially the metadata) needs to be in the same order as the gene expression matrix.

We will run GEDI using the raw counts, which need to be passed as a sparse Matrix.

# Access the raw counts in the SCE object and convert them to a sparse Matrix.
# as.matrix() first turns the counts into an ordinary dense matrix, which is
# then coerced to the sparse "dgCMatrix" class.
raw_counts<- as(as.matrix(assay(sce, "counts")), "dgCMatrix")

Now, we are ready to run GEDI. These are the main arguments:

Samples: Batch variable to use. It should be a character vector indicating which sample belongs to each cell.
Expression matrix: Could be one of the following:
- Y: The log-transformed (and possibly normalized) gene expression matrix.
- M: The raw read count matrix, which needs to be in the sparse format. It can also be a list of two matrices, in which case they are considered as paired observations whose log-ratio must be modelled.
K: The number of latent variables.
mode: Two values are allowed:
- Bsphere: L2 norms of B columns are fixed. Interpretation is that we’re projecting the data on a hyperellipsoid of dimension K.
- Bl2: L2 norm of the entire B matrix is fixed. Interpretation is that we’re projecting the data on a lower-dimensional hyperplane with dimension K.
itelim: Number of iterations for optimization.

Optional arguments:

C: The gene-level biological prior. If NULL, it means that there is no prior information for Z.
H: Sample-level prior for sources of variation. If NULL, there will be no prior for Qi.
oi_shrinkage: Shrinkage multiplier for oi (offset vector per sample i).

For this example, we are going to use GEDI with the raw counts and the Bsphere mode.

NOTE: With this complete dataset (~15K genes and ~28K cells), this step might take several hours, so it is recommended to run it on a high-performance computing cluster.

## Set up GEDI model, then initialize, optimize and save it
model <- new("GEDI") # Initialize GEDI object
model$setup(Samples = sce$donor_sample, # Vector indicating which sample each cell belongs to
            colData=meta, # Metadata (optional)
            M = raw_counts, # Raw count matrix in sparse format
            K = 40, # Number of latent variables to use
            mode = "Bsphere", # Mode to use: either Bsphere (hyperellipsoid) or Bl2 (hyperplane)
            oi_shrinkage = 0.001 # Shrinkage multiplier for oi. Here we use 0.001 to better accommodate the mean abundance differences that exist between multiple scRNA-seq technologies.
            ) 
model$initialize.LVs(randomSeed = 1) # Initialize the latent variables with a fixed seed
model$optimize(iterations=500) # Run the model for 500 iterations
saveRDS( model, file="pbmc_gedi_model_bothDonors.rds") # Save the fitted model to disk

The output GEDI model for this tutorial can also be accessed here.

We check convergence of the GEDI model.

# Plot the convergence tracking of the model components
model$plotTracking()

## $ZAo

## 
## $Bi

## 
## $Qi

## 
## $oi

## 
## $si

## 
## $Rk
## NULL
## 
## $sigma2

These plots show the convergence of the different components of the model (o, Z, Bi, Qi, oi and si). The last plot shows the convergence of the mean squared error of the model.

IMPORTANT: GEDI reorders the cells by sample, so we need to reorder the original metadata.

# Reorder meta so that its rows follow the cell order stored in the model
# (model$aux$cellIDs). Indexing a data frame by a row name that does not
# exist silently returns an all-NA row, so stop early if any cell ID is
# missing from the metadata.
stopifnot(all(model$aux$cellIDs %in% rownames(meta)))
meta <- meta[model$aux$cellIDs, ]

03 - Visualize integration results

After running the model, we can recover the integrated reference space.

# SVD of the GEDI model; the embedding is the columns of svd_res$v scaled by
# the corresponding entries of svd_res$d.
svd_res <- svd.gedi(model)
# diag() is given an explicit size: diag(x) with a length-one x builds an
# identity matrix of that size rather than a 1 x 1 matrix holding x.
embedding_res_svd <- svd_res$v %*% diag(svd_res$d, nrow = length(svd_res$d))
colnames(embedding_res_svd) <- paste0(
  "embedding",
  seq_len(ncol(embedding_res_svd))
)

# Two-dimensional UMAP of the embedding, labelled with the model's cell IDs
umap_2_res <- umap(embedding_res_svd, min_dist = 0.01, metric = "euclidean")
colnames(umap_2_res) <- paste0("umap", 1:2)
rownames(umap_2_res) <- model$aux$cellIDs
umap_2_res <- data.frame(umap_2_res)

## UMAP of the integrated space, colored by sample
GEDI::plot_embedding(umap_2_res, meta$donor_sample) +
  labs(
    x = "umap1",
    y = "umap2",
    title = "UMAP (integration with GEDI)",
    color = "Sample"
  )

## Same UMAP, colored by annotated cell type
GEDI::plot_embedding(umap_2_res, meta$CellType) +
  labs(
    x = "umap1",
    y = "umap2",
    title = "UMAP (integration with GEDI)",
    color = "Cell Type"
  )

We observe clear separation between the cell types without any obvious separation by the single-cell technology.

04 - Analysis of sample-to-sample variability

We first create a sample-level metadata matrix.

# Create a sample-level metadata matrix: one row per unique combination of
# the four columns below, labelled by the sample name.
# cbind() turns a factor into its integer codes, so every column is passed
# as character (sub() already returns character).
H <- unique(
  cbind(
    as.character(meta$donor_sample), # Vector of samples
    as.character(meta$orig.ident), # Donor
    sub("[[:space:]]A$|[[:space:]]B$", "", meta$Method), # Technology (removing the A and B 10x versions)
    as.character(meta$Method) # Technology
  )
)
rownames(H) <- H[, 1]
head(H)

##                           [,1]                        [,2]   
## pbmc1.Smart-seq2          "pbmc1.Smart-seq2"          "pbmc1"
## pbmc1.CEL-Seq2            "pbmc1.CEL-Seq2"            "pbmc1"
## pbmc1.10x Chromium (v2) A "pbmc1.10x Chromium (v2) A" "pbmc1"
## pbmc1.10x Chromium (v2) B "pbmc1.10x Chromium (v2) B" "pbmc1"
## pbmc1.10x Chromium (v3)   "pbmc1.10x Chromium (v3)"   "pbmc1"
## pbmc1.Drop-seq            "pbmc1.Drop-seq"            "pbmc1"
##                           [,3]                [,4]                 
## pbmc1.Smart-seq2          "Smart-seq2"        "Smart-seq2"         
## pbmc1.CEL-Seq2            "CEL-Seq2"          "CEL-Seq2"           
## pbmc1.10x Chromium (v2) A "10x Chromium (v2)" "10x Chromium (v2) A"
## pbmc1.10x Chromium (v2) B "10x Chromium (v2)" "10x Chromium (v2) B"
## pbmc1.10x Chromium (v3)   "10x Chromium (v3)" "10x Chromium (v3)"  
## pbmc1.Drop-seq            "Drop-seq"          "Drop-seq"

We then gather the set of sample-specific manifold parameters learned by GEDI (excluding sample-specific translation vectors).

# Gather the sample-specific manifold parameters stored in model$params$Qi:
# each sample's Qi is flattened with c() into one column of Q. The columns
# are built with lapply() and bound once, instead of growing Q with cbind()
# inside a loop (which copies the whole matrix on every iteration).
Q <- do.call(
  cbind,
  lapply(
    seq_len(model$aux$numSamples),
    function(i) c(model$params$Qi[[i]])
  )
)
colnames(Q) <- model$aux$Samples
# Row labels "<gene>.Q<k>": the gene IDs are repeated K times and each k is
# repeated J times, matching c()'s column-by-column flattening
# (presumably each Qi is a J x K matrix - TODO confirm).
rownames(Q) <- paste0(
  rep(model$aux$geneIDs, model$aux$K),
  ".Q",
  rep(seq_len(model$aux$K), each = model$aux$J)
)

dim(Q)

## [1] 576240     14

# Preview the first rows of Q
head(Q)

##             pbmc1.Smart-seq2 pbmc1.CEL-Seq2 pbmc1.10x Chromium (v2) A
## DPM1.Q1            3.0753703       2.692895                0.21061525
## SCYL3.Q1          -4.2046855      -4.627479                0.05325036
## C1orf112.Q1       -0.7212641       3.042990                0.22357886
## FGR.Q1           -14.1739946       1.932431                1.21535994
## CFH.Q1            -0.7778644       2.618229               -0.15009692
## FUCA2.Q1          -3.5036795      -0.467825                0.30754981
##             pbmc1.10x Chromium (v2) B pbmc1.10x Chromium (v3) pbmc1.Drop-seq
## DPM1.Q1                    -0.5415665              0.19947351    0.104934041
## SCYL3.Q1                   -0.2912763             -0.01943017   -0.474411299
## C1orf112.Q1                 0.1631571              0.15508654   -0.007578783
## FGR.Q1                     -0.9745976              1.95666129    0.532549804
## CFH.Q1                     -0.3318041             -0.30000274   -0.038752417
## FUCA2.Q1                   -0.6211875              0.72702544    0.007465043
##             pbmc1.Seq-Well pbmc1.inDrops pbmc2.Smart-seq2 pbmc2.CEL-Seq2
## DPM1.Q1        0.345641165   0.160595259         1.333081      0.8746625
## SCYL3.Q1      -0.125962741  -0.262768503        13.112629     -1.1901022
## C1orf112.Q1    0.002837026   0.051449644        14.820585      0.4944724
## FGR.Q1         0.662727919  -0.790564370       -35.728184    -10.4607422
## CFH.Q1        -0.103552952   0.007647797         6.307930      0.4886349
## FUCA2.Q1      -0.473190989  -0.593790753        -1.184934     -1.9937301
##             pbmc2.10x Chromium (v2) pbmc2.Drop-seq pbmc2.Seq-Well pbmc2.inDrops
## DPM1.Q1                 -0.03246163     0.48979497     -0.8657543    0.32980807
## SCYL3.Q1                 0.58696486     0.06990959     -0.4370313   -0.08295852
## C1orf112.Q1             -0.25416075     0.05541931     -0.8547718    0.13041050
## FGR.Q1                  -5.06902935    -3.92230960     -2.3776228   -2.37889642
## CFH.Q1                  -0.05641977    -0.35516364      0.1054431   -0.14496198
## FUCA2.Q1                -0.61754370    -0.48753373     -0.8113449   -0.06690240

The matrix of the sample-specific manifold parameters (Q) has dimensions:

rows: number of genes * number of latent variables
columns: number of samples

Regress out effect of technology

We then regress out the effect of technology from the Q matrix.

# Count samples per technology (H column 3, rows) and donor (H column 2, columns)
tab<- table( H[,3], H[,2])
tab

##                    
##                     pbmc1 pbmc2
##   10x Chromium (v2)     2     1
##   10x Chromium (v3)     1     0
##   CEL-Seq2              1     1
##   Drop-seq              1     1
##   inDrops               1     1
##   Seq-Well              1     1
##   Smart-seq2            1     1

# We only consider technologies with more than one sample
techs_check <- rownames(tab)[rowSums(tab) > 1]

lis_H <- list()
lis_Q <- list()

# Regress out effect of technology: within each technology, subtract the
# per-parameter mean across that technology's samples.
# NOTE: the columns of Q are selected with a mask computed on the rows of H,
# so this relies on H rows and Q columns being in the same sample order.
for (i in techs_check) {
  print(i)
  # drop = FALSE keeps matrices even if only one sample were to match
  H_sample <- H[H[, 3] == i, , drop = FALSE]
  Q_sample <- Q[, H[, 3] == i, drop = FALSE]
  # rowMeans() replaces the much slower apply(Q_sample, 1, mean)
  Q_sample <- Q_sample - rowMeans(Q_sample)
  lis_H[[i]] <- H_sample
  lis_Q[[i]] <- Q_sample
}

## [1] "10x Chromium (v2)"
## [1] "CEL-Seq2"
## [1] "Drop-seq"
## [1] "inDrops"
## [1] "Seq-Well"
## [1] "Smart-seq2"

# Put the per-group pieces back together: sample metadata stacked by rows,
# mean-centered Q matrices bound by columns
H_subset<- do.call(rbind, lis_H)
Q_subset<- do.call(cbind, lis_Q)

We select the top 20 most variable parameters learned by GEDI.

### top parameters
# Rank the rows of Q_subset by their standard deviation across samples and
# keep the indices of the 20 most variable ones. head() is used instead of
# [1:20] so that no NA indices are produced when fewer than 20 rows exist,
# and TRUE is spelled out because T is an ordinary, reassignable variable.
top_parameters <- head(
  order(apply(Q_subset, 1, sd), decreasing = TRUE),
  20
)

We then perform PCA and UMAP of the sample-specific manifold distortions.

# PCA, computed through an SVD of the top-parameter rows of Q_subset; the
# sample coordinates are the columns of v scaled by the singular values
svd_Q <- svd(Q_subset[top_parameters, ])
Q_rot <- svd_Q$v %*% diag(svd_Q$d)
rownames(Q_rot) <- unlist(lapply(lis_Q, colnames))

# UMAP of the rotated samples
umap_res <- umap(
  Q_rot,
  min_dist = 0.01,
  metric = "euclidean",
  n_neighbors = 3
)
colnames(umap_res) <- paste0("umap", 1:2)
rownames(umap_res) <- colnames(Q_subset)

# Sample-level UMAP colored by donor
GEDI::plot_embedding(umap_res, H_subset[, 2], size_point = 3) +
  labs(
    x = "umap1",
    y = "umap2",
    title = "Regress out effect of technology",
    color = "Donor"
  )

# Sample-level UMAP colored by technology
GEDI::plot_embedding(umap_res, H_subset[, 3], size_point = 3) +
  labs(
    x = "umap1",
    y = "umap2",
    title = "Regress out effect of technology",
    color = "Technology"
  )

After regressing out the effect of technology on the sample-specific manifold parameters, we can observe that samples cluster by Donor.

Regress out effect of Donor

We can also regress out the effect of Donor from the Q matrix.

# Count samples per technology (H column 3, rows) and donor (H column 2, columns)
tab<- table( H[,3], H[,2])
tab

##                    
##                     pbmc1 pbmc2
##   10x Chromium (v2)     2     1
##   10x Chromium (v3)     1     0
##   CEL-Seq2              1     1
##   Drop-seq              1     1
##   inDrops               1     1
##   Seq-Well              1     1
##   Smart-seq2            1     1

# We only consider technologies with more than one sample
techs_check <- rownames(tab)[rowSums(tab) > 1]

# Filter the H and Q matrices down to those technologies. Note that H and Q
# are overwritten here. The same mask is applied to the rows of H and the
# columns of Q, which relies on both being in the same sample order.
vec_filter <- H[, 3] %in% techs_check

H <- H[vec_filter, , drop = FALSE]
Q <- Q[, vec_filter, drop = FALSE]

lis_H <- list()
lis_Q <- list()

# Regress out effect of Donor: within each donor, subtract the per-parameter
# mean across that donor's samples.
for (i in colnames(tab)) {
  print(i)
  # drop = FALSE keeps matrices even if only one sample were to match
  H_sample <- H[H[, 2] == i, , drop = FALSE]
  Q_sample <- Q[, H[, 2] == i, drop = FALSE]
  # rowMeans() replaces the much slower apply(Q_sample, 1, mean)
  Q_sample <- Q_sample - rowMeans(Q_sample)
  lis_H[[i]] <- H_sample
  lis_Q[[i]] <- Q_sample
}

## [1] "pbmc1"
## [1] "pbmc2"

# Put the per-group pieces back together: sample metadata stacked by rows,
# mean-centered Q matrices bound by columns
H_subset<- do.call(rbind, lis_H)
Q_subset<- do.call(cbind, lis_Q)