set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
readr.show_progress = FALSE,
digits = 2
)
suppressPackageStartupMessages({
library(tximeta)
library(rjson)
library(scater)
library(alevinQC)
library(biomaRt)
library(tidyverse)
})
create_qc_report <- function(sample_dir, sample_name, reports_dir) {
checkAlevinInputFiles(sample_dir)
alevinQCReport(
baseDir = sample_dir,
sampleId = sample_name,
outputFile = paste0(sample_name, "_alevin_report.html"),
outputFormat = "html_document",
outputDir = reports_dir,
forceOverwrite = TRUE
)
}
get_markers <- function(sce10x) {
macrophage <-
which(rowData(sce10x)$gene_name %in% c("Adgre1", "Ptprc", "Itgam"))
stem <- which(rowData(sce10x)$gene_name %in% c("Pax7"))
# other <- which(rowData(sce10x)$gene_name %in% c("Pecam1"))
fap <- which(rowData(sce10x)$gene_name %in% c("Pdgfra"))
markers_dt <-
assay(sce10x)[c(stem, fap, macrophage), ] %>%
as.data.frame.matrix()
barcode_labels <-
colnames(markers_dt)
no_markers <-
markers_dt %>%
summarize_all(list(~ (sum(.) == 0)))
markers <-
markers_dt %>%
summarize_all(which.max)
markers[unlist(no_markers)] <- 0L
markers <-
markers %>%
data.table::transpose() %>%
set_names("marker")
markers <-
markers %>%
mutate(
marker = case_when(
marker == 0L ~ "none",
marker == 1L ~ "stem",
marker == 2L ~ "fap",
marker %in% c(3L, 4L, 5L) ~ "macrophage"
)
) %>%
pull(marker)
return(markers)
}
add_metadata <- function(sce10x, gene_metadata, sample_name, mito_thresh = 15) {
rowData(sce10x) <-
gene_metadata %>%
left_join(tibble(ensembl_gene_id_version = rownames(sce10x)),
.,
by = "ensembl_gene_id_version"
) %>%
dplyr::rename(chr = "chromosome_name") %>%
mutate(gene_name = uniquifyFeatureNames(
ensembl_gene_id_version,
external_gene_name
))
rownames(sce10x) <- rowData(sce10x)$gene_name
combined_metrics <-
perCellQCMetrics(
sce10x,
subsets = list(mito = which(rowData(sce10x)$chr ==
"MT")),
percent_top = c(500),
exprs_values = c("counts"),
flatten = TRUE
) %>%
as_tibble() %>%
dplyr::select(-total)
spliced_metrics <-
perCellQCMetrics(
sce10x,
exprs_values = c("spliced"),
percent_top = NULL,
flatten = TRUE
) %>%
as_tibble() %>%
dplyr::select(-total) %>%
set_names(c("sum_spliced", "detected_spliced"))
unspliced_metrics <-
perCellQCMetrics(
sce10x,
exprs_values = c("unspliced"),
percent_top = NULL,
flatten = TRUE
) %>%
as_tibble() %>%
dplyr::select(-total) %>%
set_names(c("sum_unspliced", "detected_unspliced"))
counts_dt <-
bind_cols(
combined_metrics,
spliced_metrics,
unspliced_metrics
) %>%
mutate(
unspliced_ov_spliced = sum_unspliced / sum_spliced,
unspliced_ov_total = sum_unspliced / sum,
sample = sample_name,
condition = case_when(
str_sub(sample, 1, 1) == "y" ~ "yng",
str_sub(sample, 1, 1) == "a" ~ "aged"
),
high_mito = combined_metrics$subsets_mito_percent > mito_thresh
) %>%
DataFrame(.)
rownames(counts_dt) <-
paste(colnames(sce10x),
str_sub(counts_dt$sample, start = -2),
sep = "_"
)
colData(sce10x) <- counts_dt
return(sce10x)
}
load_data <- function(sample_name, sample_dir, spliced_unspliced) {
col_data <-
data.frame(
names = sample_name,
files = file.path(
sample_dir,
"alevin",
"quants_mat.gz"
),
stringsAsFactors = FALSE
)
sce10x <-
tximeta::tximeta(
coldata = col_data,
type = "alevin"
)
# split into spliced and nspliced abundances matrices
sce10x <-
tximeta::splitSE(sce10x,
as.data.frame(spliced_unspliced),
assayName = "counts"
)
sce10x <- as(sce10x, "SingleCellExperiment")
assays(sce10x) <- list(
counts = assay(sce10x, "spliced") + assay(sce10x, "unspliced"),
spliced = assay(sce10x, "spliced"),
unspliced = assay(sce10x, "unspliced")
)
return(sce10x)
}
create_diag_plots <- function(sce10x) {
plot_dt <-
colData(sce10x) %>%
as_tibble()
p1 <-
ggplot(plot_dt) +
geom_point(aes(sum_spliced,
sum_unspliced,
colour = subsets_mito_percent
),
size = 0.3,
alpha = 0.3
) +
scale_x_log10() +
scale_y_log10() +
# geom_vline(xintercept = unlist(metrics[10,2]), color = "green", linetype = 2) +
# geom_hline(yintercept = unlist(metrics[11,2]), color = "green", linetype = 2) +
facet_wrap(sample ~ ., ncol = 2)
p2 <-
ggplot(plot_dt) +
geom_histogram(aes(unspliced_ov_total, fill = markers), bins = 150) +
facet_wrap(sample ~ ., ncol = 2)
p3 <-
ggplot(colData(sce10x) %>%
as_tibble()) +
geom_histogram(aes(detected), bins = 150) +
facet_wrap(sample ~ ., ncol = 2) +
scale_x_log10() +
geom_vline(xintercept = c(300, 7000), color = "green", linetype = 2)
p4 <-
plotColData(sce10x,
x = "sum",
by_exprs_values = "counts",
y = "detected",
colour_by = "subsets_mito_percent",
point_size = 1
) +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~ colData(sce10x)$sample, ncol = 2) +
geom_vline(aes(xintercept = 1000), color = "green", linetype = 2) +
geom_hline(aes(yintercept = 500), color = "green", linetype = 2)
p5 <-
plotScater(sce10x,
nfeatures = 500,
block1 = "sample",
ncol = 2,
exprs_values = "counts",
colour_by = "subsets_mito_percent",
line_width = 0.2
) +
geom_hline(aes(yintercept = .95))
ggsave(file.path(figures_dir, "spliced_vs_unspliced_scatter.pdf"), p1)
ggsave(file.path(figures_dir, "spliced_ov_unspliced_histogram.pdf"), p2)
ggsave(file.path(figures_dir, "detected_histogram.pdf"), p3)
ggsave(file.path(figures_dir, "sum_vs_detected.pdf"), p4)
ggsave(file.path(figures_dir, "cumulative_expression.jpeg"), p5)
return(list(p1 = p1, p2 = p2, p3 = p3, p3 = p4, p3 = p5))
}
data_dir <- "./data"
salmon_dir <- file.path(data_dir, "salmon_out")
reports_dir <- file.path(data_dir, "alevin_reports")
figures_dir <- file.path("./figures", "qc_cells")
index_dir <-
"/run/user/1000/gvfs/sftp:host=graham.computecanada.ca,user=rfarouni/home/rfarouni/projects/rrg-hsn/rfarouni/data/salmon_indices/m24/spliced_introns_collapsed_flank90"
json_file <-
file.path(
data_dir,
"metadata",
"gencode.vM24.annotation.expanded.json"
)
gene_expanded_file <-
file.path(
index_dir,
"gencode.vM24.annotation.expanded.features.tsv"
)
list.files(index_dir)
[1] "gencode.vM24.annotation.expanded.fa" "gencode.vM24.annotation.expanded.features.tsv"
[3] "gencode.vM24.annotation.expanded.gtf" "gencode.vM24.annotation.expanded.sidx"
[5] "gencode.vM24.annotation.expanded.tx2gene.tsv" "gencode.vM24.annotation.gtf.gz"
[7] "GRCm38.primary_assembly.genome.chrnames.txt" "GRCm38.primary_assembly.genome.fa"
[9] "salmon_index.job" "salmon_m24_index.out"
sample_dirs <-
list.dirs(salmon_dir,
full.names = T,
recursive = FALSE
)
sample_names <- c(
paste0("aged", 1:4),
paste0("yng", 1:3)
)
sample_names
[1] "aged1" "aged2" "aged3" "aged4" "yng1" "yng2" "yng3"
purrr::map2(sample_dirs,
sample_names,
create_qc_report,
reports_dir = reports_dir
)
spliced_unspliced <-
read_tsv(gene_expanded_file) %>%
dplyr::rename(unspliced = "intron") # Rename the 'intron' column 'unspliced' to make it compatible with scVelo
Parsed with column specification:
cols(
spliced = [31mcol_character()[39m,
intron = [31mcol_character()[39m
)
spliced_unspliced
mart <-
useDataset(
"mmusculus_gene_ensembl",
useMart("ensembl")
)
gene_metadata <-
getBM(
filters = "ensembl_gene_id_version",
attributes = c(
"ensembl_gene_id_version",
"external_gene_name",
"gene_biotype",
"chromosome_name",
"description"
),
values = spliced_unspliced$spliced,
mart = mart
)
gene_metadata
tximeta::makeLinkedTxome(
indexDir = file.path(index_dir, "gencode.vM24.annotation.expanded.sidx"),
source = "GENCODE",
genome = "GRCm38",
organism = "Mus musculus",
release = "M24",
fasta = file.path(index_dir, "gencode.vM24.annotation.expanded.fa"),
gtf = file.path(index_dir, "gencode.vM24.annotation.expanded.gtf"),
write = TRUE,
jsonFile = json_file
)
writing linkedTxome to ./data/metadata/gencode.vM24.annotation.expanded.json
linkedTxome is same as already in bfc
rjson::fromJSON(file = json_file)
[[1]]
[[1]]$index
[1] "gencode.vM24.annotation.expanded.sidx"
[[1]]$source
[1] "GENCODE"
[[1]]$organism
[1] "Mus musculus"
[[1]]$release
[1] "M24"
[[1]]$genome
[1] "GRCm38"
[[1]]$fasta
[1] "/run/user/1000/gvfs/sftp:host=graham.computecanada.ca,user=rfarouni/home/rfarouni/projects/rrg-hsn/rfarouni/data/salmon_indices/m24/spliced_introns_collapsed_flank90/gencode.vM24.annotation.expanded.fa"
[[1]]$gtf
[1] "/run/user/1000/gvfs/sftp:host=graham.computecanada.ca,user=rfarouni/home/rfarouni/projects/rrg-hsn/rfarouni/data/salmon_indices/m24/spliced_introns_collapsed_flank90/gencode.vM24.annotation.expanded.gtf"
[[1]]$sha256
[1] "ea9dfa0cdf7ed85fe348bf8ed32215c2261b01ba3dae7ebf007ef2afaf0213b1"
tximeta::loadLinkedTxome(json_file)
linkedTxome is same as already in bfc
load_annotate_data <- function(sample_name, sample_dir, spliced_unspliced, gene_metadata) {
sce10x <- load_data(sample_name, sample_dir, spliced_unspliced)
sce10x <- add_metadata(sce10x, gene_metadata, sample_name)
colData(sce10x)$markers <- get_markers(sce10x)
return(sce10x)
}
sce10x <-
map2(sample_names,
sample_dirs,
load_annotate_data,
spliced_unspliced = spliced_unspliced,
gene_metadata = gene_metadata
)
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
Loading required package: GenomicFeatures
Loading required package: AnnotationDbi
Attaching package: 'AnnotationDbi'
The following object is masked from 'package:dplyr':
select
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
importing quantifications
importing alevin data is much faster after installing `fishpond` (>= 1.2.0)
reading in alevin gene-level counts across cells
found matching linked transcriptome:
[ GENCODE - Mus musculus - release M24 ]
loading existing TxDb created: 2020-04-30 19:48:32
generating gene ranges
loading existing gene ranges created: 2020-04-30 19:48:34
fetching genome info for GENCODE
sce10x <- reduce(sce10x, cbind)
colData(sce10x)
DataFrame with 25952 rows and 16 columns
sum detected percent_top_500 subsets_mito_sum subsets_mito_detected subsets_mito_percent sum_spliced detected_spliced
<numeric> <integer> <numeric> <numeric> <integer> <numeric> <numeric> <integer>
ATACCGATCACCTGGG_d1 17510 5013 49.1488 425.500 14 2.43004 10878.1 3681
GACTATGTCCGGCTTT_d1 19538 5122 53.1112 526.998 13 2.69730 14653.1 4088
ATGGTTGGTTGTAAAG_d1 18438 4910 52.3674 1327.485 14 7.19973 14088.2 3978
GAGTCTACAGAAGTTA_d1 20504 5063 56.9594 1087.560 14 5.30414 17555.5 4408
GATTGGTAGGGAGTGG_d1 21025 5503 50.6502 653.396 13 3.10771 15001.1 4277
... ... ... ... ... ... ... ... ...
GTTCCGTTCCGAAATC_g3 155 45 100 104.986 11 67.7329 148.000 38
ACGTACATCGGCTGGT_g3 200 102 100 105.490 11 52.7451 184.417 85
AGTTCGAGTTCTCCTG_g3 175 54 100 116.490 9 66.5658 166.000 44
AATTTCCTCATGGATC_g3 155 33 100 129.990 9 83.8645 150.000 28
AAGCGTTAGAAATTCG_g3 165 55 100 109.990 11 66.6608 158.500 46
sum_unspliced detected_unspliced unspliced_ov_spliced unspliced_ov_total sample condition high_mito markers
<numeric> <integer> <numeric> <numeric> <character> <character> <logical> <character>
ATACCGATCACCTGGG_d1 6631.95 2446 0.609663 0.378752 aged1 aged FALSE none
GACTATGTCCGGCTTT_d1 4884.93 1993 0.333372 0.250022 aged1 aged FALSE fap
ATGGTTGGTTGTAAAG_d1 4349.80 1889 0.308755 0.235915 aged1 aged FALSE macrophage
GAGTCTACAGAAGTTA_d1 2948.46 1249 0.167950 0.143799 aged1 aged FALSE fap
GATTGGTAGGGAGTGG_d1 6023.90 2377 0.401564 0.286511 aged1 aged FALSE fap
... ... ... ... ... ... ... ... ...
GTTCCGTTCCGAAATC_g3 7.0000 7 0.0472973 0.0451613 yng3 yng TRUE none
ACGTACATCGGCTGGT_g3 15.5833 17 0.0845007 0.0779167 yng3 yng TRUE none
AGTTCGAGTTCTCCTG_g3 9.0000 10 0.0542169 0.0514286 yng3 yng TRUE none
AATTTCCTCATGGATC_g3 5.0000 5 0.0333333 0.0322581 yng3 yng TRUE none
AAGCGTTAGAAATTCG_g3 6.5000 9 0.0410095 0.0393939 yng3 yng TRUE none
table(colData(sce10x)$sample)
aged1 aged2 aged3 aged4 yng1 yng2 yng3
4048 4341 3403 3383 3390 3987 3400
n_exprs_genes <-
nexprs(sce10x, byrow = TRUE, exprs_values = "counts")
sce10x <- sce10x[n_exprs_genes > 0, ]
sce10x
class: SingleCellExperiment
dim: 34559 25952
metadata(42): tximetaInfo quantInfo ... txomeInfo txdbInfo
assays(3): counts spliced unspliced
rownames(34559): Gm1992 Gm37587 ... mt-Cytb mt-Nd6
rowData names(6): ensembl_gene_id_version external_gene_name ... description gene_name
colnames(25952): ATACCGATCACCTGGG_d1 GACTATGTCCGGCTTT_d1 ... AATTTCCTCATGGATC_g3 AAGCGTTAGAAATTCG_g3
colData names(16): sum detected ... high_mito markers
reducedDimNames(0):
altExpNames(0):
saveRDS(sce10x, file.path(data_dir, "preprocessed", "sce10x.rds"))
plots <- create_diag_plots(sce10x)
Saving 7 x 7 in image
plots
$p1
$p2
$p3
$p3
$p3
sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] GenomicFeatures_1.40.0 AnnotationDbi_1.50.0 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5
[6] purrr_0.3.4 readr_1.3.1 tidyr_1.0.2 tibble_3.0.1 tidyverse_1.3.0
[11] biomaRt_2.44.0 alevinQC_1.4.0 scater_1.16.0 ggplot2_3.3.0 SingleCellExperiment_1.10.1
[16] SummarizedExperiment_1.18.1 DelayedArray_0.14.0 matrixStats_0.56.0 Biobase_2.48.0 GenomicRanges_1.40.0
[21] GenomeInfoDb_1.24.0 IRanges_2.22.1 S4Vectors_0.26.0 BiocGenerics_0.34.0 rjson_0.2.20
[26] tximeta_1.6.1
loaded via a namespace (and not attached):
[1] readxl_1.3.1 backports_1.1.6 AnnotationHub_2.20.0 BiocFileCache_1.12.0
[5] plyr_1.8.6 lazyeval_0.2.2 shinydashboard_0.7.1 BiocParallel_1.22.0
[9] digest_0.6.25 ensembldb_2.12.0 htmltools_0.4.0 viridis_0.5.1
[13] fansi_0.4.1 magrittr_1.5 memoise_1.1.0 Biostrings_2.56.0
[17] modelr_0.1.7 askpass_1.1 prettyunits_1.1.1 colorspace_1.4-1
[21] blob_1.2.1 rvest_0.3.5 rappdirs_0.3.1 haven_2.2.0
[25] xfun_0.13 crayon_1.3.4 RCurl_1.98-1.2 jsonlite_1.6.1
[29] tximport_1.16.0 glue_1.4.0 gtable_0.3.0 zlibbioc_1.34.0
[33] XVector_0.28.0 BiocSingular_1.4.0 scales_1.1.0 DBI_1.1.0
[37] GGally_1.5.0 Rcpp_1.0.4.6 viridisLite_0.3.0 xtable_1.8-4
[41] progress_1.2.2 bit_1.1-15.2 rsvd_1.0.3 DT_0.13
[45] htmlwidgets_1.5.1 httr_1.4.1 RColorBrewer_1.1-2 ellipsis_0.3.0
[49] farver_2.0.3 pkgconfig_2.0.3 reshape_0.8.8 XML_3.99-0.3
[53] dbplyr_1.4.3 labeling_0.3 tidyselect_1.0.0 rlang_0.4.6
[57] later_1.0.0 munsell_0.5.0 BiocVersion_3.11.1 cellranger_1.1.0
[61] tools_4.0.0 cli_2.0.2 generics_0.0.2 RSQLite_2.2.0
[65] broom_0.5.6 evaluate_0.14 fastmap_1.0.1 yaml_2.2.1
[69] knitr_1.28 bit64_0.9-7 fs_1.4.1 AnnotationFilter_1.12.0
[73] nlme_3.1-147 mime_0.9 xml2_1.3.2 compiler_4.0.0
[77] rstudioapi_0.11 beeswarm_0.2.3 curl_4.3 interactiveDisplayBase_1.26.0
[81] reprex_0.3.0 stringi_1.4.6 lattice_0.20-41 ProtGenerics_1.20.0
[85] Matrix_1.2-18 vctrs_0.2.4 pillar_1.4.3 lifecycle_0.2.0
[89] BiocManager_1.30.10 BiocNeighbors_1.6.0 data.table_1.12.8 cowplot_1.0.0
[93] bitops_1.0-6 irlba_2.3.3 httpuv_1.5.2 rtracklayer_1.48.0
[97] R6_2.4.1 promises_1.1.0 gridExtra_2.3 vipor_0.4.5
[101] assertthat_0.2.1 openssl_1.4.1 rprojroot_1.3-2 withr_2.2.0
[105] GenomicAlignments_1.24.0 Rsamtools_2.4.0 GenomeInfoDbData_1.2.3 hms_0.5.3
[109] grid_4.0.0 rmarkdown_2.1 DelayedMatrixStats_1.10.0 base64enc_0.1-3
[113] shiny_1.4.0.2 lubridate_1.7.8 ggbeeswarm_0.6.0