set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
readr.show_progress = FALSE,
digits = 2
)
suppressPackageStartupMessages({
library(scater)
library(BiocSingular)
library(scran)
library(tidyverse)
})
data_dir <- "./data"
figures_dir <- file.path("./figures", "qc_cells")
sce10x <- readRDS(file.path(data_dir, "preprocessed", "sce10x.rds"))
plotColData(sce10x,
x = "sum",
by_exprs_values = "counts",
y = "detected",
colour_by = "high_mito",
point_size = .7
) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~ colData(sce10x)$sample, ncol = 2) +
geom_vline(aes(xintercept = 100), color = "green", linetype = 2) +
geom_hline(aes(yintercept = 100), color = "green", linetype = 2)
sum_thresh <- colData(sce10x)$sum < 100
detected_thresh <- colData(sce10x)$detected < 100
mito_high <- colData(sce10x)$subsets_mito_percent > 15
discard <-
(sum_thresh |
detected_thresh |
mito_high)
sum(discard)
[1] 5400
table(colData(sce10x)$sample, discard)
discard
FALSE TRUE
aged1 3320 728
aged2 3751 590
aged3 2584 819
aged4 2617 766
yng1 2544 846
yng2 3194 793
yng3 2542 858
sce10x <- sce10x[, !discard]
add_doublet_score <- function(sample_name, sce10x) {
sce10x_subset <- subset(sce10x, colData(sce10x)$sample == sample_name)
set.seed(42)
dbl_dens <- doubletCells(sce10x_subset)
dt <- tibble(
barcode = colnames(sce10x_subset),
doublet_score = log10(dbl_dens + 1)
)
return(dt)
}
doublet_dt <- map_dfr(unique(sce10x$sample),
add_doublet_score,
sce10x = sce10x
)
sce10x$doublet_score <- deframe(doublet_dt)[colnames(sce10x)]
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_violin(aes(sample, doublet_score))
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_point(aes(sum_spliced,
sum_unspliced,
colour = doublet_score > 3
),
size = 0.7,
alpha = 0.8
) +
scale_x_log10() +
scale_y_log10() +
geom_vline(xintercept = c(1000), color = "green", linetype = 2) +
geom_hline(yintercept = c(350), color = "green", linetype = 2) +
geom_vline(xintercept = c(2500), color = "red", linetype = 2) +
geom_hline(yintercept = c(800), color = "red", linetype = 2) +
facet_wrap(sample ~ ., ncol = 4)
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_histogram(aes(unspliced_ov_total), bins = 150) +
facet_wrap(sample ~ ., ncol = 2) +
geom_vline(xintercept = c(.15, .5), color = "green", linetype = 2) +
geom_vline(xintercept = c(.1, 0.4), color = "red", linetype = 2)
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_point(aes(sum,
detected,
colour = doublet_score > 3
),
size = 0.7,
alpha = 0.8
) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(sample ~ ., ncol = 4) +
geom_hline(aes(yintercept = c(1000)), color = "green", linetype = 2) +
geom_hline(aes(yintercept = c(1800)), color = "red", linetype = 2) +
geom_hline(aes(yintercept = c(7000)), color = "grey", linetype = 2)
sum_unspliced_low_yng <- colData(sce10x)$sum_unspliced < 800 &
colData(sce10x)$condition == "yng"
sum_spliced_low_yng <- colData(sce10x)$sum_spliced < 2500 &
colData(sce10x)$condition == "yng"
sum_spliced_low_aged <- colData(sce10x)$sum_spliced < 1000 &
colData(sce10x)$condition == "aged"
sum_unspliced_low_aged <- colData(sce10x)$sum_unspliced < 350 &
colData(sce10x)$condition == "aged"
unspliced_ratio_aged <-
(colData(sce10x)$unspliced_ov_total > .5 |
colData(sce10x)$unspliced_ov_total < 0.15) &
colData(sce10x)$condition == "aged"
unspliced_ratio_yng <-
(colData(sce10x)$unspliced_ov_total > .4 |
colData(sce10x)$unspliced_ov_total < 0.1) &
colData(sce10x)$condition == "yng"
detected_low_yng <- colData(sce10x)$detected < 1800 &
colData(sce10x)$condition == "yng"
detected_low_aged <- colData(sce10x)$detected < 1000 &
colData(sce10x)$condition == "aged"
sum_unspliced_outlier <- colData(sce10x)$sum_unspliced > 7000
discard <-
(sum_spliced_low_aged |
sum_spliced_low_yng |
sum_unspliced_low_aged |
sum_unspliced_low_yng |
unspliced_ratio_aged |
unspliced_ratio_yng |
detected_low_aged |
detected_low_yng |
sum_unspliced_outlier)
sum(discard)
[1] 2601
table(colData(sce10x)$sample, discard)
discard
FALSE TRUE
aged1 2805 515
aged2 3457 294
aged3 2220 364
aged4 2228 389
yng1 2237 307
yng2 2777 417
yng3 2227 315
sce10x <- sce10x[, !discard]
plotScater(sce10x,
nfeatures = 500,
block1 = "sample",
ncol = 3,
exprs_values = "counts",
colour_by = "subsets_mito_percent",
line_width = 0.3
) +
geom_hline(aes(yintercept = .75)) +
geom_hline(aes(yintercept = .65))
outliers_1 <- colData(sce10x)$percent_top_500 > 75 & colData(sce10x)$sample %in% c("aged3", "aged4", "yng2")
outliers_2 <- colData(sce10x)$percent_top_500 > 70 & colData(sce10x)$sample %in% c("yng3")
outliers <- outliers_1 | outliers_2
sum(outliers)
[1] 23
sce10x <- sce10x[, !outliers]
table(colData(sce10x)$sample)
aged1 aged2 aged3 aged4 yng1 yng2 yng3
2805 3457 2212 2219 2237 2776 2222
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_point(aes(sum_spliced,
sum_unspliced,
colour = doublet_score > 3
),
size = 0.7,
alpha = 0.8
) +
scale_x_log10() +
scale_y_log10() +
# geom_vline(xintercept = c(20000, 30000), color = "green", linetype = 2) +
geom_hline(yintercept = c(8000), color = "green", linetype = 2) +
facet_wrap(sample ~ ., ncol = 4)
n_exprs_genes_s <- nexprs(sce10x,
detection_limit = 1,
byrow = TRUE,
exprs_values = "spliced"
)
head(table(n_exprs_genes_s), 100)
n_exprs_genes_s
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
16543 1699 795 519 367 360 237 213 200 164 163 142 132 136 117 95 106 94 86 91 80 68 85 78 66 57 55 51 70 79
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
80 54 63 50 56 59 51 53 52 63 46 36 36 37 43 43 54 42 39 35 44 33 40 47 36 33 25 32 47 39
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
39 33 43 37 34 31 37 33 34 37 38 27 27 35 23 31 23 28 33 36 28 29 30 30 35 22 29 32 30 24
90 91 92 93 94 95 96 97 98 99
27 20 25 17 21 27 34 19 28 30
n_exprs_genes_u <- nexprs(sce10x,
detection_limit = 1,
byrow = TRUE,
exprs_values = "unspliced"
)
head(table(n_exprs_genes_u), 25)
n_exprs_genes_u
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
17197 1979 1038 663 558 443 385 285 309 232 260 214 173 161 175 140 148 115 127 129 123 117 92 117 93
n_exprs_genes <-
bind_cols(enframe(n_exprs_genes_s, "gene_name", "s"),
u = n_exprs_genes_u
) %>%
mutate(keep = !(s <= 25 & u <= 25))
n_exprs_genes
rowData(sce10x)$nexprs_l1_u <- n_exprs_genes_u
rowData(sce10x)$nexprs_l1_s <- n_exprs_genes_s
rowData(sce10x)$keep <- !(n_exprs_genes_s <= 25 & n_exprs_genes_u <= 25)
sum(rowData(sce10x)$keep)
[1] 13645
saveRDS(sce10x, file.path(data_dir, "preprocessed", "sce10x_filtered_cells.rds"))
sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.4 readr_1.3.1 tidyr_1.0.2
[7] tibble_3.0.1 tidyverse_1.3.0 scran_1.16.0 BiocSingular_1.4.0 scater_1.16.0 ggplot2_3.3.0
[13] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.1 DelayedArray_0.14.0 matrixStats_0.56.0 Biobase_2.48.0 GenomicRanges_1.40.0
[19] GenomeInfoDb_1.24.0 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0
loaded via a namespace (and not attached):
[1] nlme_3.1-147 fs_1.4.1 bitops_1.0-6 lubridate_1.7.8 httr_1.4.1 rprojroot_1.3-2
[7] R.cache_0.14.0 tools_4.0.0 backports_1.1.6 R6_2.4.1 irlba_2.3.3 vipor_0.4.5
[13] DBI_1.1.0 colorspace_1.4-1 withr_2.2.0 tidyselect_1.0.0 gridExtra_2.3 compiler_4.0.0
[19] cli_2.0.2 rvest_0.3.5 BiocNeighbors_1.6.0 xml2_1.3.2 labeling_0.3 scales_1.1.0
[25] digest_0.6.25 rmarkdown_2.1 R.utils_2.9.2 XVector_0.28.0 base64enc_0.1-3 pkgconfig_2.0.3
[31] htmltools_0.4.0 styler_1.3.2 dbplyr_1.4.3 limma_3.44.1 readxl_1.3.1 rlang_0.4.6
[37] rstudioapi_0.11 DelayedMatrixStats_1.10.0 farver_2.0.3 generics_0.0.2 jsonlite_1.6.1 BiocParallel_1.22.0
[43] R.oo_1.23.0 RCurl_1.98-1.2 magrittr_1.5 GenomeInfoDbData_1.2.3 Matrix_1.2-18 fansi_0.4.1
[49] Rcpp_1.0.4.6 ggbeeswarm_0.6.0 munsell_0.5.0 viridis_0.5.1 lifecycle_0.2.0 R.methodsS3_1.8.0
[55] stringi_1.4.6 yaml_2.2.1 edgeR_3.30.0 zlibbioc_1.34.0 grid_4.0.0 dqrng_0.2.1
[61] crayon_1.3.4 lattice_0.20-41 cowplot_1.0.0 haven_2.2.0 hms_0.5.3 locfit_1.5-9.4
[67] knitr_1.28 pillar_1.4.3 igraph_1.2.5 reprex_0.3.0 glue_1.4.0 evaluate_0.14
[73] modelr_0.1.7 vctrs_0.2.4 cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1 xfun_0.13
[79] rsvd_1.0.3 broom_0.5.6 viridisLite_0.3.0 beeswarm_0.2.3 statmod_1.4.34 ellipsis_0.3.0