In this notebookd we simulate outcome data and compare the performance of the TOR approach with minimum read fraction (MRF) approach.
library(tidyverse)
library(poibin)
library(combinat)
library(furrr)
library(data.table)
library(matrixStats)
library(rprojroot)
library(cowplot)
plan(multiprocess)
[ONE-TIME WARNING] Forked processing ('multicore') is disabled in future (>= 1.13.0) when running R from RStudio, because it is considered unstable. Because of this, plan("multicore") will fall back to plan("sequential"), and plan("multiprocess") will fall back to plan("multisession") - not plan("multicore") as in the past. For more details, how to control forked processing or not, and how to silence this warning in future R sessions, see ?future::supportsMulticore
require(knitr)
opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(future.globals.maxSize= +Inf, future.fork.enable=TRUE)
figures_dir <- file.path("./figures")
create_data <- function(prob, rmax, nreads){
dt <-
tibble(r=1:rmax,
s=dgeom(r-1, prob[1]) ) %>%
mutate(s=(s/sum(s))) %>%
mutate(rs= r*s)
RMR <-sum(dt %>% pull(rs))
nmol <- nreads/RMR
dt <-
dt %>%
mutate(s= round(nmol*s))%>%
select(-rs)
return(dt)
}
get_product_logsum <- function(x, y) {
z_log <- log(x) + log(y)
z <- exp(z_log)
return(z)
}
generate_outcomes_s <- function(s, p, S, r, pi_r){
p_h <- (1 - p) / (S - 1)
pvec <- rep(p_h, S)
pvec[s] <- p
pi_r_s <- pi_r[s]
outcomes <- xsimplex(S, r)
pmf <- apply(outcomes,
2,
dmnom,
prob = pvec)
outcomes <- outcomes %>%
t() %>%
as_tibble( .name_repair = "unique") %>%
setNames(paste0("y", 1:S)) %>%
mutate(pmf_rs=pmf,
pmf_r=get_product_logsum(pmf, pi_r_s))
return(outcomes)
}
create_grouping_vars <- function(outcomes, S, mrf) {
s <-
outcomes %>%
pull(s) %>%
as.numeric()
setDT(outcomes)
outcomes[,
outcome := do.call(paste, c(.SD, sep = ",")),
.SDcols = paste0("y", 1:S)]
outcome <-
outcomes %>%
pull(outcome)
outcomes <-
outcomes %>%
select(c(paste0("y", 1:S))) %>%
as.matrix
y_s <- as.integer(outcomes[cbind(seq_along(s), s)] != 0)
r <- as.integer(rowSums2(outcomes))
k_chimera <- as.integer(S - rowCounts(outcomes, value = 0))
fantoms <- k_chimera - y_s
s_frac_1 <-
apply(cbind(outcomes,r),
1,
function(x) {
y <- which(x[1:S] >= (x[S + 1] *mrf[1]))
y <- unname(y)
if (length(y) == 0)
y <- 0
return(y)
})
s_frac_2 <-
apply(cbind(outcomes,r),
1,
function(x) {
y <- which(x[1:S] >= (x[S + 1] * mrf[2]))
y <- unname(y)
if (length(y) == 0)
y <- 0
return(y)
})
s_frac_3 <-
apply(cbind(outcomes,r),
1,
function(x) {
y <- which(x[1:S] >= (x[S + 1]* mrf[3]))
y <- unname(y)
if (length(y) == 0)
y <- 0
return(y)
})
grouping_vars <- tibble(outcome=outcome,
r = r,
s_frac_1 = s_frac_1,
s_frac_2 = s_frac_2,
s_frac_3 = s_frac_3,
fantoms =fantoms,
k_chimera = k_chimera)
return(grouping_vars)
}
add_vars_to_outcome_counts <- function(outcomes, S, mrf) {
grouping_vars <- create_grouping_vars(outcomes,S, mrf)
outcomes <- bind_cols(outcomes, grouping_vars)
return(outcomes)
}
infer_sample_of_origin_outcome <- function(..., pi_r, log_zi, S, p) {
y <- c(...)[1:S]
log_sample_pi <- log(pi_r)
posterior_s <- y * log_zi + log_sample_pi
## normalize to the sample with maximum posterior probability
posterior_s <- posterior_s - max(posterior_s)
posterior_s <- exp(posterior_s)
posterior <- posterior_s / sum(posterior_s)
q <- max(posterior)
s_hat <- which.max(posterior)
out <- list(s_hat = s_hat, q = q)
return(out)
}
add_posterior_prob <- function(outcomes, S, p, pi_r) {
log_zi <- log((S - 1)) - log(1 / p - 1)
posteriors <-
future_pmap_dfr(
outcomes %>%
select(paste0("y", 1:S)),
infer_sample_of_origin_outcome,
pi_r=pi_r,
log_zi = log_zi,
S=S,
p=p
)
outcomes <- bind_cols(outcomes %>%
select(-c(paste0("y", 1:S))),
posteriors)
return(outcomes)
}
generate_outcomes <- function(...,p, S, mrf){
r <- c(...)[1]
pi_r <- c(...)[2:(S+1)]
outcomes <- future_map_dfr(1:S,
generate_outcomes_s,
p=p,
S=S,
r=r,
pi_r=pi_r,
.id = "s")
outcomes <- add_vars_to_outcome_counts(outcomes, S, mrf)
outcomes <- add_posterior_prob(outcomes, S, p, pi_r)
outcomes <-
outcomes %>%
mutate(s= as.integer(s) ,
s_frac_1= as.integer(s_frac_1),
s_frac_2= as.integer(s_frac_2),
s_frac_3= as.integer(s_frac_3),
r=as.integer(r))
return(outcomes)
}
evaluate_classification <- function(outcomes) {
outcomes <-
outcomes %>%
mutate(
hit_s= s==s_hat,
hit_frac_1= s==s_frac_1,
hit_frac_2= s==s_frac_2,
hit_frac_3= s==s_frac_3,
k_chimera_p=get_product_logsum(k_chimera, pmf),
fantoms_p=get_product_logsum(fantoms, pmf),
g=(1-(k_chimera-fantoms))*pmf,
qr = 1 - q)
g_total <-
outcomes %>%
summarise_at(vars(c("g")), sum) %>%
pull(g)
outcomes<-
outcomes %>%
arrange(qr)%>%
mutate(
o = cumsum(pmf),
FP = cumsum(get_product_logsum(
qr,
pmf
)),
FN=1 - o + FP-g_total,
FPm= last(FP)- FP, # marginal
FNm=FN-last(FN), # marginal
tor=if_else(FPm <.Machine$double.neg.eps | FNm <.Machine$double.neg.eps,
0,
FNm/FPm),
FP_1=case_when(
!hit_frac_1 & s_frac_1!=0 ~ pmf,
TRUE ~0),
FN_1=case_when(s_frac_1==0 & g==0 ~ pmf,
!hit_frac_1 & s_frac_1!=0 & g==0 ~ pmf,
TRUE ~0),
FP_2=case_when(
!hit_frac_2 & s_frac_2!=0 ~ pmf,
TRUE ~0),
FN_2=case_when(s_frac_2==0 & g==0 ~ pmf,
!hit_frac_2 & s_frac_2!=0 & g==0 ~ pmf,
TRUE ~0),
FP_3=case_when(
!hit_frac_3 & s_frac_3!=0 ~ pmf,
TRUE ~0),
FN_3=case_when(s_frac_3==0 & g==0 ~ pmf,
!hit_frac_3 & s_frac_3!=0 & g==0 ~ pmf,
TRUE ~0))
return(outcomes)
}
compute_classification_metrics <- function(p, pi_r,m_df, torc, mrf){
outcomes <-
future_pmap_dfr(pi_r,
generate_outcomes,
p=p,
S=S,
mrf=mrf)
outcomes <-
left_join(outcomes ,
m_df,
by="r") %>%
mutate(
pmf= get_product_logsum(pmf_r,m))%>%
select(-c("m","pmf_rs","pmf_r"))
outcomes <- evaluate_classification(outcomes)
tor_thresh <-
outcomes %>%
summarize(tor_thresh = tor[which.max(-pmax(tor, torc))]) %>%
pull(tor_thresh)
metrics_tor <-
outcomes %>%
filter(tor == tor_thresh) %>%
slice(1) %>%
select(FP, FN, tor) %>%
rename(cutoff = "tor") %>%
mutate(cutoff = paste0("tor_", str_sub(
as.character(cutoff),
start = 1,
end = 4
)))
maxfrac_fp <-
outcomes %>%
summarise_at(vars(starts_with("FP_")), sum) %>%
gather(condition, FP) %>%
select(FP)
maxfrac_fn <-
outcomes %>%
summarise_at(vars(starts_with("FN_")), sum) %>%
gather(condition, FN) %>%
select(FN)
metrics_dt <- bind_cols(maxfrac_fp,
maxfrac_fn) %>%
mutate(cutoff = sprintf("mf_%.2f", mrf)) %>%
bind_rows(., metrics_tor) %>%
select(cutoff, everything())
tor_dt <-
outcomes %>%
filter(tor < 10 & tor > 1 & g == 0) %>%
mutate(tor = round(tor, 4)) %>%
group_by(tor) %>%
filter(row_number() == 1)
return(list(tor_dt=tor_dt, metrics_dt= metrics_dt))
}
plot_Pi <- function(Pi, Pi_marginal) {
p1 <-
ggplot(Pi %>%
select(
-sum_mols
) %>%
gather(sample, p, -c("r"))) +
geom_line(aes(
x = r,
y = p,
colour = sample
),
size = 1,
alpha = 1
) +
geom_point(
data = Pi %>%
select(r, m),
aes(x = r, y = m),
shape = 10,
size = 1,
alpha = .9
) +
theme_bw() +
geom_hline(
data = Pi_marginal,
aes(
yintercept = p_molecs,
colour = sample
),
size = 0.3,
linetype = "longdash",
alpha = 1
) +
labs(
x = "r (PCR duplicates)",
y = "proportion"
) +
scale_colour_viridis_d(
option = "inferno",
direction = -1
) +
theme(
axis.title.x = element_text(size = rel(1.4)),
axis.title.y = element_text(size = rel(1.4)),
axis.text = element_text(size = rel(1.3)),
legend.title = element_text(size = rel(1.1), face = "bold"),
legend.text = element_text(size = rel(1.1))
)
return(p1)
}
plot_trade_off <- function(condition, condition_name){
condition$metrics_dt$cutoff[4] <- "tor*"
p <- ggplot(condition$tor_dt) +
geom_line(
aes(x = FP,
y = FN),
colour="grey")+
geom_point(
aes(x = FP,
y = FN,
color=tor),
size=.5,
alpha=.8)+
scale_color_viridis_c() +
labs(x="False Positives",
y="False Negatives") +
geom_point(data=condition$metrics_dt,
aes(x = FP,
y = FN,
shape=cutoff),
size=1.8,
colour="brown")+
theme_bw() +
#scale_y_sqrt()+
theme(
legend.title = element_text(face = "bold")
) +
expand_limits(x = c(0, .006), y = c(0, 0.04))
legend <- get_legend(p)
p <- p + theme(legend.position = "none")
p <- ggdraw() +
draw_plot(p, 0, 0, 1, 1) +
draw_label(paste0("SIHR=", condition_name),
x = 0,
y = 1,
vjust = 2,
hjust = -1.8,
size = 12,
fontface = "italic"
)
return(list(p=p, legend=legend))
}
plot_trade_off_all <- function(metrics_list) {
p_tradeoff <- imap(metrics_list, plot_trade_off)
p_list <-
p_tradeoff %>%
map(list("p"))
legend_list <-
p_tradeoff %>%
map(list("legend"))
p_tradeoff_all <-
plot_grid(
plot_grid(p_list[[1]],
p_list[[2]],
p_list[[3]],
p_list[[4]],
align="hv",
axis ="tblr"),
plot_grid(NULL,
legend_list[[4]],
NULL,
ncol=1),
rel_widths=c(1, 0.1))
return(p_tradeoff_all)
}
simulate_pi <- function(prob , rmax, nreads=3e7 ){
sample_names <- paste0("s", 1:length(prob))
names(prob) <- sample_names
dt <-
map_dfr(prob,
create_data,
rmax=rmax,
nreads=nreads,
.id="sample") %>%
spread(sample, s)
dt_summary <-
dt %>%
select(-r) %>%
summarize_all(sum) %>%
gather(sample, nmols) %>%
mutate(RMR=nreads/nmols,
pgeom_prob=prob,
p_molecs=nmols/sum(nmols))
dt <- dt %>%
mutate(sum_mols = rowSums(.[sample_names])) %>%
mutate_at(vars(sample_names), ~ . / sum_mols) %>%
mutate(m= sum_mols/sum(sum_mols))
return(list(Pi=dt, summary=dt_summary))
}
simulate_data <- function(prob, pvec, rmax, torc, mrf){
pi_out <- simulate_pi(prob, rmax)
m_df <- pi_out$Pi %>% select(r, m)
pi_r <- pi_out$Pi %>% select(-m, -sum_mols)
metrics_list <-
map(pvec,
compute_classification_metrics,
pi_r=pi_r,
m_df=m_df,
torc=torc,
mrf=mrf)
return(list(metrics_list=metrics_list, pi_out=pi_out))
}
S <- 4 # sample number
rmax <- 30 # PCR amplification limit
torc <- 3 # Trade off cutoff
mrf <- c(0.6, 0.8, 0.9) #three choices for the minimum read fraction (MRF) thresholds. 0.8 is the default
pvec <- c(0.982, 0.986, 0.990, 0.994) # probability of not hopping (4 settings)
names(pvec) <- c(0.018, 0.014, 0.010, 0.006) # name vector with SIHR (i.e. hopping rate or 1-pvec)
pvec
0.018 0.014 0.01 0.006
0.982 0.986 0.990 0.994
Set the parameter values for the truncated geometric distributions for the S samples
prob1 <- c( 0.45, 0.4, 0.1, 0.05)
out_1 <- simulate_data(prob1, pvec,rmax, torc, mrf)
out_1$pi_out$summary
Plot the molecular proportions complexity profile
plot_Pi(out_1$pi_out$Pi, out_1$pi_out$summary)
Show the results
out_1$metrics_list
$`0.018`
$`0.018`$tor_dt
$`0.018`$metrics_dt
$`0.014`
$`0.014`$tor_dt
$`0.014`$metrics_dt
$`0.01`
$`0.01`$tor_dt
$`0.01`$metrics_dt
$`0.006`
$`0.006`$tor_dt
$`0.006`$metrics_dt
NANA
Plot the results
p_trade_off_1 <- plot_trade_off_all(out_1$metrics_list)
p_trade_off_1
save_plot(file.path(figures_dir, "simulation_tradoff01.pdf"),
p_trade_off_1,
ncol = 3, # we're saving a grid plot of 2 columns
nrow = 2, # and 2 rows
base_height=4,
# each individual subplot should have an aspect ratio of 1.3
base_aspect_ratio = 1
)
prob2 <- c( 0.45, 0.42, 0.40, 0.05)
out_2 <- simulate_data(prob2, pvec,rmax, torc, mrf)
out_2$pi_out$summary
plot_Pi(out_2$pi_out$Pi, out_2$pi_out$summary)
out_2$metrics_list
$`0.018`
$`0.018`$tor_dt
$`0.018`$metrics_dt
$`0.014`
$`0.014`$tor_dt
$`0.014`$metrics_dt
$`0.01`
$`0.01`$tor_dt
$`0.01`$metrics_dt
$`0.006`
$`0.006`$tor_dt
$`0.006`$metrics_dt
NANA
p_trade_off_2 <- plot_trade_off_all(out_2$metrics_list)
p_trade_off_2
save_plot(file.path(figures_dir, "simulation_tradoff02.pdf"),
p_trade_off_2,
ncol = 3, # we're saving a grid plot of 2 columns
nrow = 2, # and 2 rows
base_height=4,
# each individual subplot should have an aspect ratio of 1.3
base_aspect_ratio = 1
)
sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.26 cowplot_1.0.0 rprojroot_1.3-2 matrixStats_0.55.0 data.table_1.12.8 furrr_0.1.0 future_1.15.1
[8] combinat_0.0-8 poibin_1.5 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.3 purrr_0.3.3 readr_1.3.1
[15] tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.3 lubridate_1.7.4 lattice_0.20-38 listenv_0.8.0 assertthat_0.2.1 zeallot_0.1.0 digest_0.6.23
[8] R6_2.4.1 cellranger_1.1.0 backports_1.1.5 reprex_0.3.0 httr_1.4.1 pillar_1.4.3 rlang_0.4.2
[15] lazyeval_0.2.2 readxl_1.3.1 rstudioapi_0.10 labeling_0.3 munsell_0.5.0 broom_0.5.3 compiler_3.6.2
[22] modelr_0.1.5 xfun_0.12 pkgconfig_2.0.3 globals_0.12.5 tidyselect_0.2.5 codetools_0.2-16 fansi_0.4.1
[29] viridisLite_0.3.0 crayon_1.3.4 dbplyr_1.4.2 withr_2.1.2 grid_3.6.2 nlme_3.1-143 jsonlite_1.6
[36] gtable_0.3.0 lifecycle_0.1.0 DBI_1.1.0 magrittr_1.5 scales_1.1.0 cli_2.0.1 stringi_1.4.5
[43] farver_2.0.1 fs_1.3.1 xml2_1.2.2 ellipsis_0.3.0 generics_0.0.2 vctrs_0.2.1 tools_3.6.2
[50] glue_1.3.1 hms_0.5.3 parallel_3.6.2 colorspace_1.4-1 rvest_0.3.5 haven_2.2.0