# ---- Setup: locate the project root and configure knitr / R options ----

# rprojroot::find_rstudio_root_file() signals an error (it does not return
# NULL) when no RStudio project file is found, so the error is converted to
# NULL here to make the fallback below reachable.
project_dir <- tryCatch(
  rprojroot::find_rstudio_root_file(),
  error = function(e) NULL
)
if (is.null(project_dir)) {
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
                  project_dir))
}

# root.dir is a knitr package option; fig.width is a chunk option and only
# takes effect when set through opts_chunk. digits/scipen are R options and
# are set via options() below.
knitr::opts_knit$set(root.dir = project_dir)
knitr::opts_chunk$set(fig.width = 15)

options(readr.show_progress = FALSE,
        digits = 5,
        scipen = 8,
        future.globals.maxSize = +Inf)

message(sprintf("Project directory: %s",
                project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
# Load the helper scripts from <project_dir>/code in their numbered order.
# source() evaluates each file in the global environment by default, so the
# definitions end up where the rest of this script expects them.
code_dir <- file.path(project_dir, "code")
invisible(lapply(
  file.path(
    code_dir,
    c(
      "1_create_joined_counts_table.R",
      "2_create_counts_by_outcome_table.R",
      "3_estimate_sample_index_hopping_rate.R",
      "4_compute_summary_statistics.R",
      "5_reassign_hopped_reads.R",
      "6_purge_phantom_molecules.R",
      "7_call_cells.R",
      "8_summarize_purge.R",
      "9_plotting_functions.R"
    )
  ),
  source
))
# Directory holding the validation inputs, and the sub-directory that the
# ggsave() calls further down write figures into.
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
figures_dir <- file.path(validation_output_dir, "figures")

# Read the labelled, inner-joined counts table (tab-separated).
data <-
  validation_output_dir %>%
  file.path("hiseq4000_inner_joined_with_labels_hg38_ensembl95.txt") %>%
  read_tsv()
Parsed with column specification:
cols(
cell = col_character(),
gene = col_character(),
umi = col_double(),
s1_nonplexed = col_double(),
s2_nonplexed = col_double(),
s1_plexed = col_double(),
s2_plexed = col_double(),
outcome = col_character(),
label = col_character()
)
data

# Read-level summary: for every (r, label) pair, total the plexed read counts
# and split them into hopped / non-hopped reads per sample.
#   r     = s1_plexed + s2_plexed for the row (columns are referenced by name
#           rather than by position so a column reorder cannot silently
#           change the result).
#   label = per-row ground-truth label read from the input file; the values
#           used below look like "<sample1>,<sample2>" with r / f / 0
#           -- NOTE(review): presumably real / fake / absent, confirm.
summary_counts <-
  data %>%
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  group_by(r, label) %>%
  # Both alternatives are anchored, matching the molecule-level summary.
  summarize_at(vars(matches("^s1_pl|^s2_pl")),
               list(~ sum(.))) %>%
  mutate(s1_hopped = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0),
         s2_hopped = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0),
         s1_nonhopped = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0),
         s2_nonhopped = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0)) %>%
  select(-s1_plexed, -s2_plexed)
summary_counts
# Hopping rates conditional on r: collapse the labels within each r, then
# derive the per-sample hopping rates and sample 1's share of all reads.
summary_counts_conditional <-
  summary_counts %>%
  group_by(r) %>%
  summarise_at(vars(matches("^s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_conditional
# Marginal (all r pooled) hopping rates. The column order of the result is
# relied on elsewhere (columns 5:7 are SIHR_12, SIHR_21, SIHR), so new columns
# are only ever appended in this order.
summary_counts_marginal <-
  summary_counts %>%
  ungroup() %>%
  summarise_at(vars(matches("^s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    SIHR = 1 - (s1_nonhopped + s2_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_marginal
# Conditional hopping rates (in percent) and the sample-1 read fraction
# (unscaled) against r; dashed lines mark the marginal rates in percent.
p1 <-
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r, y = SIHR_12 * 100, colour = "SIHR_12")) +
  geom_line(aes(x = r, y = SIHR_21 * 100, colour = "SIHR_21")) +
  geom_line(aes(x = r, y = frac_s1, colour = "frac_s1")) +
  geom_hline(
    # SIHR_12, SIHR_21 and SIHR are columns 5 to 7 of the marginal table.
    yintercept = unlist(
      summary_counts_marginal[c("SIHR_12", "SIHR_21", "SIHR")]
    ) * 100,
    linetype = "dashed"
  ) +
  xlim(0, 90) +
  ylim(0, 1)
p1
NA
# Molecule-level summary: same layout as the read-level table, but every
# count column is first reduced to a 0/1 presence indicator so the sums count
# molecules instead of reads.
#   r is computed from the plexed read counts *before* they are binarised,
#   and by column name rather than position so a column reorder cannot
#   silently change it.
# Naming note: s1_phantom is taken from s2_plexed and s2_phantom from
# s1_plexed, exactly as in the read-level hopped columns.
summary_mol_counts <-
  data %>%
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  mutate_at(vars(matches("^s")),
            list(~ as.integer(. != 0))) %>%
  group_by(r, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")),
               list(~ sum(.))) %>%
  mutate(s1_phantom = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0L),
         s2_phantom = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0L),
         s1_real = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0L),
         s2_real = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0L)) %>%
  select(-s1_plexed, -s2_plexed)
summary_mol_counts
# Total phantom molecules over the rows labelled "0,f" or "f,0".
summary_mol_counts %>%
  ungroup() %>%
  filter(label %in% c("0,f", "f,0")) %>%
  summarise_at(vars(ends_with("_phantom")), list(~ sum(.)))
# Marginal (all r pooled) phantom-molecule proportions.
summary_mol_counts_marginal <-
  summary_mol_counts %>%
  ungroup() %>%
  summarise_at(vars(matches("^s")), sum) %>%
  mutate(
    # proportion of hopped phantom molecules
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    # proportion of phantom molecules
    ppm_1 = s2_phantom / (s2_phantom + s1_real),
    ppm_2 = s1_phantom / (s1_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_marginal
# Phantom-molecule proportions conditional on r.
summary_mol_counts_conditional <-
  summary_mol_counts %>%
  group_by(r) %>%
  summarise_at(vars(matches("^s")), sum) %>%
  mutate(
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_conditional
# Conditional phantom-molecule proportions against r.
# The second legend key previously carried a stray leading space
# (" ppm_21"), which showed up in the legend and changed the key ordering;
# both keys now follow the same naming.
p4 <- ggplot(summary_mol_counts_conditional) +
  geom_line(aes(x = r,
                y = ppm_12,
                colour = "ppm_12")) +
  geom_line(aes(x = r,
                y = ppm_21,
                colour = "ppm_21")) +
  #geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
  xlim(0, 300)
p4
#ggsave("phantom_molecules_validation.pdf", p4, width =8, height = 5)
# Per cell-barcode molecule summary and cell status.
# Naming note: here s1_phantom is taken from s1_plexed and s2_phantom from
# s2_plexed, which is the opposite pairing to summary_mol_counts above.
summary_mol_counts_cell<-
data %>%
# Drops rows whose label is missing: the comparison yields NA for NA labels
# and filter() discards rows where the condition is NA.
filter(label!="NA") %>%
# Turn every count column starting with "s" into a 0/1 presence indicator.
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(cell, label) %>%
summarize_at(vars(matches("^s1_pl|^s2_pl")),
list(~ sum(.))) %>%
mutate(s1_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
s2_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
select(-s1_plexed,- s2_plexed) %>%
# Collapse the labels: one row per cell with its four molecule counts.
group_by(cell) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
# Adds <column>_nonempty 0/1 indicators for the four count columns ...
mutate_at(vars(matches("^s")),
list(nonempty= ~ as.integer(.!=0))) %>%
# ... and pastes them into a new `label` in column order:
# "s1_phantom,s2_phantom,s1_real,s2_real".
unite(label,matches("nonempty"), sep=",") %>%
# Patterns not listed explicitly fall through to "contaminated".
mutate(cell_status=
case_when(label %in% c("0,0,0,1","0,0,1,0") ~ "real",
label %in% c("0,0,1,1") ~ "real",
label %in% c("1,0,0,0","0,1,0,0", "1,1,0,0") ~ "phantom",
label %in% c("1,0,0,1","0,1,1,0") ~ "phantom",
TRUE ~ "contaminated")) %>%
# s*_ppm is NaN (0/0) for cells with no molecules in that sample.
mutate(s1_total =(s1_phantom+s1_real),
s2_total =(s2_phantom+s2_real),
s1_ppm = s1_phantom/(s1_total),
s2_ppm = s2_phantom/(s2_total))
summary_mol_counts_cell
# Number of cell-barcodes per (cell_status, label) combination, largest first.
cell_status_tally <-
summary_mol_counts_cell %>%
group_by(cell_status,label) %>%
tally(sort=TRUE)
cell_status_tally
# Count of cell-barcodes whose status is not "real".
n_affected_cells <-
cell_status_tally %>%
ungroup() %>%
filter(cell_status!="real") %>%
summarise(n=sum(n)) %>%
pull(n)
# NOTE(review): 1502 and 6 are hard-coded and their origin is not visible in
# this file. Presumably they are tally counts read off cell_status_tally for a
# specific run -- TODO confirm and derive them from the data, since they will
# be wrong for any other input.
n_affected_cells <- n_affected_cells + 1502 +6
n_affected_cells
[1] 64579
# Total number of cells: a barcode counts once for each sample in which it
# has a non-zero molecule total, so a barcode present in both counts twice.
n_total_cells <-
  summary_mol_counts_cell %>%
  ungroup() %>%
  mutate_at(vars(matches("_total")), list(~ as.integer(. != 0))) %>%
  summarise_at(vars(matches("_total")), sum) %>%
  mutate(cells_total = s1_total + s2_total) %>%
  pull(cells_total)
n_total_cells
[1] 322321
Proportion of affected cells
# Ratio of the affected-cell count to the total cell count computed above.
p_affected_cells <- n_affected_cells/n_total_cells
p_affected_cells
[1] 0.20036
For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.
Sample 1 plot
# Sample 1: phantom vs. total molecules per cell-barcode (log-log), limited
# to non-"real" barcodes that have at least one phantom molecule in sample 1.
p2 <-
  ggplot(
    summary_mol_counts_cell %>%
      filter(cell_status != "real", s1_phantom > 0)
  ) +
  geom_point(aes(x = s1_total, y = s1_phantom, colour = cell_status)) +
  scale_x_log10() +
  scale_y_log10()
p2
#ggsave("phantom_molecules_validation.pdf", p2, width =8, height = 5)
Sample 2 plot
# Sample 2: phantom vs. total molecules per cell-barcode (log-log), limited
# to non-"real" barcodes that have at least one phantom molecule in sample 2.
# The figure is written to the figures directory.
p3 <-
  ggplot(
    summary_mol_counts_cell %>%
      filter(cell_status != "real", s2_phantom > 0)
  ) +
  geom_point(
    aes(x = s2_total, y = s2_phantom, colour = cell_status),
    size = 0.7,
    alpha = 0.6
  ) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    x = "Total Number of Molecules",
    y = "Number of Phantom Molecules"
  )
p3
ggsave(
  file.path(figures_dir, "phantom_cells_validation_s2.pdf"),
  p3,
  width = 8,
  height = 5
)
# Build the input table for the purging workflow: labelled rows only, without
# the non-plexed columns, and with the seven remaining columns renamed by
# position (the two plexed count columns become s1 and s2).
read_counts <-
  data %>%
  filter(label != "NA") %>%
  select(-ends_with("nonplexed")) %>%
  set_names(c("cell", "gene", "umi", "s1", "s2", "outcome", "label"))
read_counts

# Number of samples; the sample columns start at column 4.
S <- 2
sample_names <- colnames(read_counts)[seq(4, S + 3)]
sample_names
[1] "s1" "s2"
# Step 2: build the outcome-counts table from the read counts (without the
# ground-truth label column). create_outcome_counts() is defined in one of
# the sourced helper scripts; min_frac is passed through to it as-is.
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts%>%
select(-label),
sample_names,
min_frac=0.8)
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(sample_names)` instead of `sample_names` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
toc()
Step 2: creating outcome counts datatable with grouping vars: 1.437 sec elapsed
outcome_counts
# Step 3: estimate the hopping rate from the outcome counts for S samples.
# estimate_hopping_rate() comes from the sourced helper scripts; the result
# is used below through fit_out$glm_estimates (phat, phat_low, phat_high,
# SIHR).
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S
)
`...` must not be empty for ungrouped data frames.
Did you want `data = everything()`?
unnest() has a new interface. See ?unnest for details.
Try `df %>% unnest(c(tidied, confint_tidied, max_r))`, with `mutate()` if needed
The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
All list-columns are now preserved.
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
toc()
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.204 sec elapsed
fit_out
$glm_estimates
$chimera_counts
NA
# compute_molecular_complexity_profile
# Step 4: summary statistics from the outcome counts and the estimated phat.
# compute_summary_stats() comes from the sourced helper scripts.
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
Step 4: compute molecular complexity profile and other summary statistics: 0.2 sec elapsed
summary_stats
$summary_estimates
$marginal
$conditional
$pi_r_hat
NA
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
# Trade-off ratio cost cutoff passed to the reassignment step below.
torc <- 3
# Step 5: outcome_counts is overwritten with the helper's result; the columns
# it adds (e.g. retain, qr, tor) are used by the evaluation code further down.
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the trade-off ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
Step 5: reassign read counts, determine cutoff, and mark retained observations: 1.142 sec elapsed
summary_counts_marginal
# Step 6: attach the per-outcome results to every row of read_counts via the
# shared `outcome` key, then drop the key.
# The sample columns are selected with all_of() so the character vector
# `sample_names` is always treated as a set of column names; this also
# removes the "external vector in selections is ambiguous" message.
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
  left_join(
    read_counts %>%
      select(outcome, cell, umi, gene,
             tidyselect::all_of(sample_names), label),
    outcome_counts,
    by = c("outcome")
  ) %>%
  select(-outcome)
toc()
Step 6: Purge and save read counts datatable to disk: 2.957 sec elapsed
# Conditional hopping rates against r, with the marginal rate from the ground
# truth ("true mean") and the model estimate plus its interval ("estimate").
p5 <-
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r, y = SIHR_12, colour = "12")) +
  geom_line(aes(x = r, y = SIHR_21, colour = "21")) +
  geom_hline(
    aes(yintercept = summary_counts_marginal$SIHR, colour = "true mean"),
    linetype = "solid",
    size = .5
  ) +
  geom_hline(
    aes(yintercept = fit_out$glm_estimates$SIHR, colour = "estimate"),
    linetype = "solid",
    size = .1
  ) +
  # Interval drawn at every r, from 1 - phat_high up to 1 - phat_low.
  geom_linerange(
    data = summary_counts_conditional,
    aes(
      x = r,
      ymax = 1 - fit_out$glm_estimates$phat_low,
      ymin = 1 - fit_out$glm_estimates$phat_high,
      colour = "estimate"
    ),
    size = .5
  ) +
  xlim(1, 210) +
  ylim(0.002, 0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5
# Per-row classification flags against the ground-truth labels.
# qr, s and s_maxprop are not created in this file -- presumably they come
# from outcome_counts via the join above (TODO confirm).
read_counts <-
read_counts %>%
ungroup() %>%
# Sort by decreasing qr; the cumulative sums computed later rely on this order.
arrange(-qr) %>%
# t: sample index (1 or 2) carrying "r" in the label; f: sample index carrying
# "f". Labels that match neither list leave the value NA.
mutate(t= case_when(
label %in% c("f,r","0,r") ~ 2,
label %in% c("r,f","r,0") ~ 1 ),
f= case_when(
label %in% c("f,r","f,0") ~ 1,
label %in% c("r,f","0,f") ~ 2 )) %>%
# 0/1 indicators; comparisons that evaluate to NA count as 0 (missing = 0L).
mutate(tp= if_else( t == s, 1L, 0L, missing =0L),
fp= if_else( f == s, 1L, 0L, missing =0L),
tn= if_else( f != s, 1L, 0L, missing =0L),
fn= if_else( t != s, 1L, 0L, missing =0L),
tp0= if_else( t == 0, 1L, 0L, missing =0L), #0 if predict all molecules to be phantom
fn0= if_else( t != 0, 1L, 0L, missing =0L),
tn0= if_else( f != 0, 1L, 0L, missing =0L),
fp0= if_else( f == 0, 1L, 0L, missing =0L),
# Same indicators, evaluated against s_maxprop instead of s.
tp_max= if_else( t == s_maxprop, 1L, 0L, missing =0L),
fp_max= if_else( f == s_maxprop, 1L, 0L, missing =0L),
tn_max= if_else( f != s_maxprop, 1L, 0L, missing =0L),
fn_max= if_else( t != s_maxprop, 1L, 0L, missing =0L))
# Confusion counts when every row is judged against s_maxprop; the columns
# are renamed to the plain tp / fp / tn / fn names.
false_counts_maxprop <-
  read_counts %>%
  summarise_at(vars(c("tp_max", "fp_max", "tn_max", "fn_max")), sum) %>%
  set_names(c("tp", "fp", "tn", "fn"))
false_counts_maxprop

# Confusion counts when every row is judged against s.
false_counts_min_cutoff <-
  read_counts %>%
  summarise_at(vars(c("tp", "fp", "tn", "fn")), sum)
false_counts_min_cutoff
# Baseline counts with nothing purged. Each entry may use the ones defined
# before it, so the order of the arguments matters.
false_counts_nopurging <-
  read_counts %>%
  summarize(
    n_cugs = n(),                          # number of rows
    n_real = sum(t > 0, na.rm = TRUE),     # rows with a non-missing t
    n_fantom = sum(f > 0, na.rm = TRUE),   # rows with a non-missing f
    n_mol = n_real + n_fantom,
    g = n_cugs - n_real,
    u = n_mol - n_cugs,
    tp = n_cugs - g,
    fp = u + g,
    tn = 0,
    fn = 0
  )
false_counts_nopurging
# Running confusion counts along the current row order (decreasing qr).
read_counts <-
read_counts %>%
# Adds <name>_cum = cumulative sum for each of the eight indicator columns.
mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
list(cum= ~ cumsum(.))) %>%
# Overwrites the four *_cum columns (tp, fp, tn, fn) in place with the sum
# from the current row to the last row: total minus the cumulative sum of all
# preceding rows. The *0_cum columns stay ordinary cumulative sums.
mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")),
list( ~ (last(.)-lag(., default =0)))) %>%
# Combined totals per row, then changes relative to the first row:
# fpm = decrease in fp_t, fnm = increase in fn_t, tor_true = fnm / fpm
# (NaN or Inf where fpm is 0).
mutate(tp_t=tp_cum + tp0_cum,
fp_t=fp_cum + fp0_cum,
tn_t=tn_cum + tn0_cum,
fn_t=fn_cum + fn0_cum,
fpm= first(fp_t)- fp_t,
fnm= fn_t-first(fn_t),
tor_true= fnm/fpm)
# Counts at the cutoff: the first row (in the current order) flagged `retain`.
false_counts_tor_cutoff <-
  read_counts %>%
  filter(retain) %>%
  slice(1) %>%
  select(
    s1, s2, qr, tor,
    tp_t, fp_t, tn_t, fn_t,
    fpm, fnm, tor_true
  )
false_counts_tor_cutoff
# One row per approach with its confusion counts, plus rates obtained by
# dividing fp and fn by the baseline n_fantom and n_real totals.
false_counts_dt <-
  list(
    no_purging = false_counts_nopurging %>%
      select(tp, fp, tn, fn),
    no_discarding = false_counts_min_cutoff,
    tor_cutoff = false_counts_tor_cutoff %>%
      select(tp_t, fp_t, tn_t, fn_t) %>%
      set_names(c("tp", "fp", "tn", "fn")),
    max_frac = false_counts_maxprop
  ) %>%
  bind_rows(.id = "approach") %>%
  select(approach, fp, fn, tp, tn) %>%
  mutate(
    fpr = fp / false_counts_nopurging$n_fantom,
    fnr = fn / false_counts_nopurging$n_real
  )
false_counts_dt
# Hmisc::latex( false_counts_dt %>%
# mutate(fpr =round(fpr,4),
# fnr =round(fnr,4)),
# file="",
# rowname=NULL,
# booktabs=TRUE, size="small")
# One row per distinct qr value (the first row of each group), keeping the
# observed counts next to the columns that were joined in from outcome_counts.
classification_curves <-
  read_counts %>%
  group_by(qr) %>%
  slice(1L) %>%
  ungroup() %>%
  select(
    qr, qs, tor, retain,
    fp_t, FP, fn_t, FN,
    tp_t, tn_t, TP, TN,
    FPm, FNm, fpm, fnm,
    tor_true, o, r
  ) %>%
  mutate(
    fpr = fp_t / false_counts_nopurging$n_fantom,
    fnr = fn_t / false_counts_nopurging$n_real
  )
classification_curves
# FNm against FPm, overlaid with reference lines y = k * FPm for the
# trade-off ratios k = 1, 2, 3, 4, 5 and 9 (y axis on a log10 scale).
p_tradeoff <-
  ggplot(classification_curves) +
  geom_point(aes(x = FPm, y = FNm), size = .5) +
  # One reference line per ratio; !! fixes the value of k inside aes(), which
  # would otherwise be evaluated lazily when the plot is drawn.
  lapply(c(1, 2, 3, 4, 5, 9), function(k) {
    geom_line(aes(x = FPm, y = (!!k) * FPm, colour = !!as.character(k)))
  }) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.title = element_text(face = "bold")) +
  scale_color_discrete(name = "TORC") +
  labs(
    x = "Marginal Decrease in False Positives (reduce phantom molecs) ",
    y = "Marginal Increase in False Negatives (discard real molecs)"
  )
ggsave(
  file.path(figures_dir, "validation_tradeoff.pdf"),
  p_tradeoff,
  width = 9,
  height = 6
)
p_tradeoff
# False-negative against false-positive counts on square-root axes: the curve
# from the ground-truth labels ("true"), the curve from the FP / FN columns
# ("predicted"), and one marker per approach from false_counts_dt.
p6 <-
  ggplot(classification_curves) +
  geom_point(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(
    data = false_counts_dt,
    aes(x = fp, y = fn, shape = approach),
    size = 2
  ) +
  labs(
    x = "False Positive Count",
    y = "False Negative Count"
  ) +
  scale_y_sqrt() +
  scale_x_sqrt()
ggsave(
  file.path(figures_dir, "peformance_groundtruth.pdf"),
  p6,
  width = 9,
  height = 6
)
p6
# Zoomed comparison of the three purging approaches (the no-purging baseline
# is left out).
p7 <-
  ggplot(
    false_counts_dt %>%
      filter(approach %in% c("no_discarding", "tor_cutoff", "max_frac"))
  ) +
  geom_point(aes(x = fp, y = fn, color = approach), size = 2) +
  labs(
    x = "False Positive Count",
    y = "False Negative Count"
  )
ggsave(
  file.path(figures_dir, "peformance_zoom.pdf"),
  p7,
  width = 9,
  height = 6
)
p7
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_1.0.0 data.table_1.12.8 tictoc_1.0 furrr_0.1.0 future_1.16.0
[6] broom_0.5.5 matrixStats_0.56.0 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5
[11] purrr_0.3.3 readr_1.3.1 tidyr_1.0.2 tibble_3.0.0 ggplot2_3.3.0
[16] tidyverse_1.3.0 rhdf5_2.30.1
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4 lubridate_1.7.8 lattice_0.20-40 listenv_0.8.0 assertthat_0.2.1
[6] rprojroot_1.3-2 packrat_0.5.0 digest_0.6.25 R6_2.4.1 cellranger_1.1.0
[11] backports_1.1.6 reprex_0.3.0 httr_1.4.1 pillar_1.4.3 rlang_0.4.5
[16] readxl_1.3.1 rstudioapi_0.11 labeling_0.3 munsell_0.5.0 compiler_3.6.3
[21] modelr_0.1.6 xfun_0.12 pkgconfig_2.0.3 globals_0.12.5 tidyselect_1.0.0
[26] codetools_0.2-16 fansi_0.4.1 crayon_1.3.4 dbplyr_1.4.2 withr_2.1.2
[31] MASS_7.3-51.5 grid_3.6.3 nlme_3.1-144 jsonlite_1.6.1 gtable_0.3.0
[36] lifecycle_0.2.0 DBI_1.1.0 magrittr_1.5 scales_1.1.0 cli_2.0.2
[41] stringi_1.4.6 farver_2.0.3 fs_1.4.1 xml2_1.3.0 ellipsis_0.3.0
[46] generics_0.0.2 vctrs_0.2.4 Rhdf5lib_1.8.0 tools_3.6.3 glue_1.4.0
[51] hms_0.5.3 parallel_3.6.3 colorspace_1.4-1 rvest_0.3.5 knitr_1.28
[56] haven_2.2.0