library(tidyverse)
library(furrr)
plan(multiprocess)
options(future.globals.maxSize= +Inf, future.fork.enable=TRUE)
call_emptydrops <-
function(sample_umi_count,
lower = 200,
fdr_thresh = 0.005) {
sample_emptydrops <-
DropletUtils::emptyDrops(sample_umi_count,
lower = lower) %>%
as.data.frame() %>%
rownames_to_column(var = "cell") %>%
select(cell, FDR)
sample_emptydrops[is.na(sample_emptydrops)] <- 1
sample_emptydrops <-
sample_emptydrops %>%
mutate(is_cell = FDR <= fdr_thresh,
FDR = NULL)
return(sample_emptydrops)
}
call_defaultdrops <- function(sample_umi_count) {
sample_defaultdrops <- DropletUtils::defaultDrops(sample_umi_count)
sample_defaultdrops <- enframe(sample_defaultdrops,
name = "cell",
value = "is_cell")
return(sample_defaultdrops)
}
filter_cells <- function(mat, cell) {
mat[, cell$is_cell]
}
output_dir <- "./data/novaseq_l1/output"
out <- readRDS( file.path(output_dir, "out_novaseq_l1.rds"))
umi_counts <- map2(out$umi_counts$retained,
out$umi_counts$discarded, `+`)
out$umi_counts$discarded <- NULL
called_cells <-
future_map(
umi_counts,
plyr::failwith(
NULL,
call_emptydrops)
)
filtered_umi_counts <-
future_map2(umi_counts,
called_cells,
filter_cells)
called_cells_retained <-
future_map(
out$umi_counts$retained,
plyr::failwith(
NULL,
call_emptydrops
)
)
filtered_umi_counts_retained <-
future_map2(out$umi_counts$retained,
called_cells_retained,
filter_cells)
saveRDS(filtered_umi_counts,
"./data/filtered_umi_counts_novaseq_l1.rds")
saveRDS(filtered_umi_counts_retained,
"./data/filtered_umi_counts_retained_novaseq_l1.rds")
Restart first!
library(tidyverse)
library(AnnotationHub)
library(scran)
library(scater)
library(uwot) # For umap
library(zeallot)
set.seed(42)
get_transcriptome <- function(ah_id) {
ah <- AnnotationHub()
gene_metadata <- ah[[ah_id]]
gene_metadata <-
gene_metadata[mcols(gene_metadata)$type == "gene",
c("gene_id", "gene_name", "gene_biotype")]
seqlevelsStyle(gene_metadata) <- "UCSC"
#gene_metadata <- keepStandardChromosomes(gene_metadata,
# pruning.mode="coarse")
gene_metadata <-
gene_metadata %>%
as.data.frame() %>%
dplyr::select(gene_id,
seqnames,
gene_name,
gene_biotype,
gene_id)
return(gene_metadata)
}
add_metadata <- function(sce10x, gene_metadata) {
rowData(sce10x) <-
gene_metadata %>%
left_join(as_tibble(rowData(sce10x)),
.,
by = "gene_id") %>%
dplyr::rename(chr = seqnames)
rownames(sce10x) <- uniquifyFeatureNames(rowData(sce10x)$gene_id,
rowData(sce10x)$gene_name)
sce10x <- addPerCellQC(sce10x,
subsets = list(mito = which(rowData(sce10x)$chr ==
"chrM")),
flatten = TRUE)
sce10x <- addPerFeatureQC(sce10x)
return(sce10x)
}
to_sce <- function(data, suff) {
col_names <- paste(colnames(data), suff, sep = "_")
row_names <- rownames(data)
data <-
data %>%
as.matrix()
colnames(data) <- NULL
rownames(data) <- NULL
data <- SingleCellExperiment(
assays = list(counts = data),
rowData = DataFrame(gene_id = row_names),
colData = DataFrame(barcode = col_names)
)
return(data)
}
create_sce <- function(data_list, gene_metadata) {
suffixes <- str_split_fixed(names(data_list), "_", 2)[, 2]
data_list <-
map2(data_list,
suffixes,
to_sce)
data_list <- map(data_list, add_metadata, gene_metadata)
return(data_list)
}
process_sample <-
function(sample,
umi_counts_purged,
sce10x_unpurged) {
sce10x_purged <- umi_counts_purged[[sample]]
sce10x_unpurged <- umi_counts_unpurged[[sample]]
n_exprs_genes_unpurged <-
nexprs(sce10x_unpurged,
detection_limit = 0,
byrow = TRUE)
sce10x_unpurged <- sce10x_unpurged[n_exprs_genes_unpurged > 0, ]
n_exprs_genes_purged <-
nexprs(sce10x_purged,
detection_limit = 0,
byrow = TRUE)
sce10x_purged <- sce10x_purged[n_exprs_genes_purged > 0, ]
dt_summary <-
left_join(
colData(sce10x_unpurged)[, c("barcode", "sum", "detected")] %>% as_tibble(),
colData(sce10x_purged)[, c("barcode", "sum", "detected")] %>% as_tibble(),
by = "barcode"
) %>%
mutate(
phantom_sum = sum.x - sum.y ,
phantom_detected = detected.x - detected.y ,
uncalled = !(
colData(sce10x_unpurged)$barcode %in% colData(sce10x_purged)$barcode
)
) %>%
dplyr::select(barcode, phantom_sum, phantom_detected, uncalled)
colData(sce10x_unpurged) <-
left_join(colData(sce10x_unpurged) %>% as_tibble(), dt_summary ,
by = "barcode") %>%
as(., "DataFrame")
gene_summary_dt <-
full_join(
rowData(sce10x_unpurged)[, c("gene_id", "gene_name", "mean")] %>%
as_tibble() %>%
mutate(sum = mean * dim(sce10x_unpurged)[2]) %>%
dplyr::select(-mean),
rowData(sce10x_purged)[, c("gene_id", "gene_name", "mean")] %>%
as_tibble() %>%
mutate(sum = mean * dim(sce10x_purged)[2]) %>%
dplyr::select(-mean),
by = c("gene_id", "gene_name")
) %>%
mutate(phantoms_sum = sum.x - sum.y) %>%
arrange(-phantoms_sum)
clusters <- quickCluster(sce10x_unpurged,
min.mean=0.12)
sce10x_unpurged <- computeSumFactors(sce10x_unpurged,
clusters=clusters,
min.mean=0.12)
clusters <- quickCluster(sce10x_purged,
min.mean=0.12)
sce10x_purged <- computeSumFactors(sce10x_purged,
clusters=clusters,
min.mean=0.12)
sce10x_unpurged <- scater::logNormCounts(sce10x_unpurged)
sce10x_purged <- scater::logNormCounts(sce10x_purged)
return(
list(
sce10x_unpurged = sce10x_unpurged,
sce10x_purged = sce10x_purged,
gene_summary_dt = gene_summary_dt
)
)
}
run_umap <- function(sce10x,
assay = "counts",
prefix = "umap") {
sce10x_mtx <-
t(assay(sce10x, assay) %>%
as.matrix())
sce_cells_umap <- umap(
sce10x_mtx,
# n_neighbors=100L,
n_components = 2,
# min_dist=0.005,
# metric="manhattan",
verbose = TRUE,
n_threads = 7
)
colnames(sce_cells_umap) <- paste0(prefix, seq(1, 2))
reducedDim(sce10x, prefix) <- sce_cells_umap
return(sce10x)
}
input_dir <- "~/Documents/index_hopping/data"
target_samples <- c("P7_1", "P7_8" )
gene_metadata <- get_transcriptome("AH50870") #Mus_musculus.GRCm38.84
snapshotDate(): 2019-10-29
loading from cache
Importing File into R ..
require(“rtracklayer”)
umi_counts_unpurged <- readRDS( file.path(input_dir,
"filtered_umi_counts_novaseq_l1.rds"))
umi_counts_unpurged <- umi_counts_unpurged[target_samples]
umi_counts_purged <- readRDS( file.path(input_dir,
"filtered_umi_counts_retained_novaseq_l1.rds"))
umi_counts_purged <- umi_counts_purged[target_samples]
umi_counts_unpurged <- create_sce(umi_counts_unpurged, gene_metadata)
umi_counts_purged <- create_sce(umi_counts_purged, gene_metadata)
c(sce10x_unpurged, sce10x_purged, gene_summary_dt_1) %<-%
process_sample("P7_1", umi_counts_purged, sce10x_unpurged )
encountered negative size factor estimates
gene_summary_dt_1
sce10x_unpurged <- run_umap(sce10x_unpurged, assay="logcounts", prefix="umap")
16:57:51 UMAP embedding parameters a = 1.896 b = 0.8006
16:57:51 Read 772 rows and found 13316 numeric columns
16:57:51 Using FNN for neighbor search, n_neighbors = 15
16:57:58 Commencing smooth kNN distance calibration using 7 threads
16:57:59 Initializing from normalized Laplacian + noise
16:57:59 Commencing optimization for 500 epochs, with 20730 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:58:01 Optimization finished
df_to_plot_1 <-
bind_cols(reducedDim(sce10x_unpurged, "umap") %>%
as_tibble(),
t(assay(sce10x_unpurged, "logcounts")[c("Sftpc", "Lyz2"),]) %>%
as_tibble(),
colData(sce10x_unpurged)[, c("sum","uncalled",
"phantom_sum",
"phantom_detected")] %>%
as_tibble())
ggplot(df_to_plot_1 ) +
geom_point(aes( umap1,
umap2,
colour= phantom_sum==0),
size=0.9,
alpha=1) +
theme_classic()
ggsave("P7_1_umap.pdf")
Saving 10 x 6.18 in image
ggplot(df_to_plot_1 ) +
geom_point(aes( umap1,
umap2,
colour= Lyz2!=0),
size=0.9,
alpha=1) +
theme_classic()
ggsave("P7_1_Lyz2_umap.pdf")
Saving 10 x 6.18 in image
c(sce10x_unpurged, sce10x_purged, gene_summary_dt_2) %<-%
process_sample("P7_8", umi_counts_purged, sce10x_unpurged )
gene_summary_dt_2
sce10x_unpurged <- run_umap(sce10x_unpurged, assay="logcounts", prefix="umap")
16:58:13 UMAP embedding parameters a = 1.896 b = 0.8006
16:58:13 Read 1194 rows and found 17018 numeric columns
16:58:13 Using FNN for neighbor search, n_neighbors = 15
16:58:33 Commencing smooth kNN distance calibration using 7 threads
16:58:34 Initializing from normalized Laplacian + noise
16:58:34 Commencing optimization for 500 epochs, with 31250 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:58:36 Optimization finished
df_to_plot_2 <-
bind_cols(reducedDim(sce10x_unpurged, "umap") %>%
as_tibble(),
t(assay(sce10x_unpurged, "logcounts")[c("Fabp1", "Scgb1a1"),]) %>%
as_tibble(),
colData(sce10x_unpurged)[, c("sum",
"uncalled",
"phantom_sum",
"phantom_detected")] %>%
as_tibble())
ggplot(df_to_plot_2) +
geom_point(aes( umap1,
umap2,
colour= phantom_sum==0),
size=0.9,
alpha=1) +
theme_classic()
ggsave("P7_8_umap.pdf")
Saving 10 x 6.18 in image
ggplot(df_to_plot_2 ) +
geom_point(aes( umap1,
umap2,
colour= Fabp1 != 0),
size=0.9,
alpha=1) +
theme_classic()
ggsave("P7_8_Fabp1_umap.pdf")
Saving 10 x 6.18 in image
sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] rtracklayer_1.46.0 zeallot_0.1.0 uwot_0.1.5 Matrix_1.2-18
[5] scater_1.14.6 scran_1.14.5 SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1
[9] DelayedArray_0.12.2 BiocParallel_1.20.1 matrixStats_0.55.0 Biobase_2.46.0
[13] GenomicRanges_1.38.0 GenomeInfoDb_1.22.0 IRanges_2.20.1 S4Vectors_0.24.1
[17] AnnotationHub_2.18.0 BiocFileCache_1.10.2 dbplyr_1.4.2 BiocGenerics_0.32.0
[21] forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.3
[25] readr_1.3.1 tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1
[29] tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] ggbeeswarm_0.6.0 colorspace_1.4-1 XVector_0.26.0 BiocNeighbors_1.4.1
[5] fs_1.3.1 rstudioapi_0.10 farver_2.0.1 bit64_0.9-7
[9] RSpectra_0.16-0 interactiveDisplayBase_1.24.0 AnnotationDbi_1.48.0 fansi_0.4.1
[13] lubridate_1.7.4 xml2_1.2.2 knitr_1.26 jsonlite_1.6
[17] Rsamtools_2.2.1 broom_0.5.3 shiny_1.4.0 BiocManager_1.30.10
[21] compiler_3.6.2 httr_1.4.1 dqrng_0.2.1 backports_1.1.5
[25] assertthat_0.2.1 fastmap_1.0.1 lazyeval_0.2.2 limma_3.42.0
[29] cli_2.0.1 later_1.0.0 BiocSingular_1.2.1 htmltools_0.4.0
[33] tools_3.6.2 rsvd_1.0.2 igraph_1.2.4.2 gtable_0.3.0
[37] glue_1.3.1 GenomeInfoDbData_1.2.2 rappdirs_0.3.1 Rcpp_1.0.3
[41] cellranger_1.1.0 Biostrings_2.54.0 vctrs_0.2.1 nlme_3.1-143
[45] DelayedMatrixStats_1.8.0 xfun_0.12 rvest_0.3.5 mime_0.8
[49] lifecycle_0.1.0 irlba_2.3.3 XML_3.98-1.20 statmod_1.4.33
[53] edgeR_3.28.0 zlibbioc_1.32.0 scales_1.1.0 BSgenome_1.54.0
[57] hms_0.5.3 promises_1.1.0 yaml_2.2.0 curl_4.3
[61] memoise_1.1.0 gridExtra_2.3 stringi_1.4.5 RSQLite_2.2.0
[65] BiocVersion_3.10.1 rlang_0.4.2 pkgconfig_2.0.3 bitops_1.0-6
[69] lattice_0.20-38 labeling_0.3 GenomicAlignments_1.22.1 bit_1.1-15
[73] tidyselect_0.2.5 magrittr_1.5 R6_2.4.1 generics_0.0.2
[77] DBI_1.1.0 pillar_1.4.3 haven_2.2.0 withr_2.1.2
[81] RCurl_1.95-4.12 modelr_0.1.5 crayon_1.3.4 viridis_0.5.1
[85] locfit_1.5-9.1 grid_3.6.2 readxl_1.3.1 FNN_1.1.3
[89] blob_1.2.0 reprex_0.3.0 digest_0.6.23 xtable_1.8-4
[93] httpuv_1.5.2 RcppParallel_4.4.4 munsell_0.5.0 beeswarm_0.2.3
[97] viridisLite_0.3.0 vipor_0.4.5