Prepare analysis workflow
Set parameters
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(readr.show_progress = FALSE,
digits=5,
scipen=8,
future.globals.maxSize = +Inf)
Set filepaths and parameters
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
Load libraries
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
Load functions
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
Load data
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
data <- read_tsv(file.path(validation_output_dir,
"hiseq4000_inner_joined_with_labels.txt"))
Parsed with column specification:
cols(
cell = col_character(),
gene = col_double(),
umi = col_double(),
s1_nonplexed = col_double(),
s2_nonplexed = col_double(),
s1_plexed = col_double(),
s2_plexed = col_double(),
outcome = col_character(),
label = col_character()
)
data
Compute Index hopping rate
Estimates conditional on duplication level and label
summary_counts <-
data %>%
mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
arrange(r) %>%
group_by(r, label) %>%
summarize_at(vars(matches("^s1_pl|s2_pl")),
list(~ sum(.))) %>%
mutate(s1_hopped=if_else(label %in% c("0,f", "r,f"), s2_plexed,0),
s2_hopped=if_else(label %in% c("f,0", "f,r"), s1_plexed,0),
s1_nonhopped=if_else(label %in% c("r,0", "r,f"), s1_plexed,0),
s2_nonhopped=if_else(label %in% c("0,r", "f,r"), s2_plexed,0)) %>%
select(-s1_plexed,- s2_plexed)
summary_counts
Estimates conditional on duplication level
summary_counts_conditional <-
summary_counts %>%
group_by(r) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))%>%
mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_conditional
Marginal estimates
summary_counts_marginal <-
summary_counts %>%
ungroup() %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))%>%
mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
SIHR=1-(s1_nonhopped+s2_nonhopped)/(s1_hopped+s1_nonhopped +s2_hopped+s2_nonhopped),
frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_marginal
p1 <-
ggplot(summary_counts_conditional) +
geom_line(aes(x = r,
y = SIHR_12*100,
colour="SIHR_12"))+
geom_line(aes(x = r,
y = SIHR_21*100,
colour="SIHR_21")) +
geom_line(aes(x = r,
y = frac_s1,
colour="frac_s1")) +
geom_hline(yintercept = unlist(summary_counts_marginal[5:7])* c(100, 100, 100),
linetype="dashed") +
xlim(0,90) +
ylim(0,1)
p1

NA
Molecules
summary_mol_counts <-
data %>%
mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
arrange(r) %>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(r, label) %>%
summarize_at(vars(matches("^s1_pl|^s2_pl")),
list(~ sum(.))) %>%
mutate(s1_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
s2_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
select(-s1_plexed,- s2_plexed)
summary_mol_counts
summary_mol_counts_marginal <-
summary_mol_counts%>%
ungroup() %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),# prop hopped phantom molec
ppm_21 = s2_phantom/(s2_phantom+s2_real),
ppm_1 = s2_phantom/(s2_phantom+s1_real), # prop phantom molec
ppm_2 = s1_phantom/(s1_phantom+s2_real),
ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_marginal
summary_mol_counts_conditional <-
summary_mol_counts%>%
group_by(r) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),
ppm_21 = s2_phantom/(s2_phantom+s2_real),
ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_conditional
p4 <- ggplot(summary_mol_counts_conditional) +
geom_line(aes(x = r,
y = ppm_12,
colour="ppm_12"))+
geom_line(aes(x = r,
y = ppm_21,
colour=" ppm_21")) +
#geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
xlim(0,300)
p4

#ggsave("phantom_molecules_validation.pdf", p4, width =8, height = 5)
Examine extent of contamination in cells
summary_mol_counts_cell<-
data %>%
filter(label!="NA") %>%
mutate_at(vars(matches("^s")),
list(~ as.integer(.!=0))) %>%
group_by(cell, label) %>%
summarize_at(vars(matches("^s1_pl|^s2_pl")),
list(~ sum(.))) %>%
mutate(s1_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
s2_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
select(-s1_plexed,- s2_plexed) %>%
group_by(cell) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.))) %>%
mutate_at(vars(matches("^s")),
list(nonempty= ~ as.integer(.!=0))) %>%
unite(label,matches("nonempty"), sep=",") %>%
mutate(cell_status=
case_when(label %in% c("0,0,0,1","0,0,1,0") ~ "real",
label %in% c("0,0,1,1") ~ "real",
label %in% c("1,0,0,0","0,1,0,0", "1,1,0,0") ~ "phantom",
label %in% c("1,0,0,1","0,1,1,0") ~ "phantom",
TRUE ~ "contaminated")) %>%
mutate(s1_total =(s1_phantom+s1_real),
s2_total =(s2_phantom+s2_real),
s1_ppm = s1_phantom/(s1_total),
s2_ppm = s2_phantom/(s2_total))
summary_mol_counts_cell
cell_status_tally <-
summary_mol_counts_cell %>%
group_by(cell_status,label) %>%
tally(sort=TRUE)
cell_status_tally
n_affected_cells <-
cell_status_tally %>%
ungroup() %>%
filter(cell_status!="real") %>%
summarise(n=sum(n)) %>%
pull(n)
n_affected_cells <- n_affected_cells + 1502 +6
n_affected_cells
[1] 64579
n_total_cells <-
summary_mol_counts_cell %>%
mutate_at(vars(matches("_total")),
list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarise_at(vars(matches("_total")), list(~sum(.))) %>%
mutate(cells_total= s1_total+s2_total)%>%
pull(cells_total)
n_total_cells
[1] 322321
Proportion of affected cells
p_affected_cells <- n_affected_cells/n_total_cells
p_affected_cells
[1] 0.20036
For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.
Sample 1 plot
p2 <- ggplot(summary_mol_counts_cell %>%
filter(cell_status!="real" & s1_phantom >0 )) +
geom_point(aes(x = s1_total,
y = s1_phantom,
colour=cell_status)) +
scale_x_log10() +
scale_y_log10()
p2

#ggsave("phantom_molecules_validation.pdf", p2, width =8, height = 5)
Sample 2 plot
p3 <- ggplot(summary_mol_counts_cell %>%
filter(cell_status!="real" & s2_phantom >0 )) +
geom_point(aes(x = s2_total,
y = s2_phantom,
colour=cell_status)) +
scale_x_log10() +
scale_y_log10()
p3

#ggsave("phantom_cells_validation_s2.pdf", p3, width =8, height = 5)
Run workflow on multiplexed data
read_counts <-
data %>%
filter(label!="NA")%>%
select(-ends_with("nonplexed")) %>%
set_names(c("cell", "gene", "umi", "s1", "s2", "outcome", "label"))
read_counts
S <- 2
sample_names <- colnames(read_counts)[4:(S+3)]
sample_names
[1] "s1" "s2"
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts%>%
select(-label),
sample_names,
min_frac=0.8)
toc()
Step 2: creating outcome counts datatable with grouping vars: 1.591 sec elapsed
outcome_counts
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S
)
toc()
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.083 sec elapsed
fit_out
$glm_estimates
$chimera_counts
NA
# compute_molecular_complexity_profile
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
Step 4: compute molecular complexity profile and other summary statistics: 0.16 sec elapsed
summary_stats
$summary_estimates
$marginal
$conditional
$pi_r_hat
NA
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the tradoff ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
Step 5: reassign read counts, determine cutoff, and mark retained observations: 1.176 sec elapsed
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
left_join(read_counts %>%
select(outcome, cell, umi, gene, sample_names, label),
outcome_counts,
by = c("outcome")
) %>%
select(-outcome)
toc()
Step 6: Purge and save read counts datatable to disk: 2.166 sec elapsed
Compare the SIHR estimates with ground truth estimates
p5 <-
ggplot(summary_counts_conditional) +
geom_line(aes(x = r,
y = SIHR_12,
colour="12"))+
geom_line(aes(x = r,
y = SIHR_21,
colour="21")) +
geom_hline(aes(yintercept = summary_counts_marginal$SIHR,
colour="true mean"),
linetype="solid",
size=.5) +
geom_hline(aes(yintercept = fit_out$glm_estimates$SIHR,
colour="estimate"),
linetype="solid",
size=.1) +
geom_linerange(data=summary_counts_conditional,
aes(x=r,
ymax=1-fit_out$glm_estimates$phat_low,
ymin=1-fit_out$glm_estimates$phat_high,
colour="estimate"),
size=.5)+
xlim(1,210) +
ylim(0.002,0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5

Determine the number of false positives and false negatives
read_counts <-
read_counts %>%
ungroup() %>%
arrange(-qr) %>%
mutate(t= case_when(
label %in% c("f,r","0,r") ~ 2,
label %in% c("r,f","r,0") ~ 1 ),
f= case_when(
label %in% c("f,r","f,0") ~ 1,
label %in% c("r,f","0,f") ~ 2 )) %>%
mutate(tp= if_else( t == s, 1L, 0L, missing =0L),
fp= if_else( f == s, 1L, 0L, missing =0L),
tn= if_else( f != s, 1L, 0L, missing =0L),
fn= if_else( t != s, 1L, 0L, missing =0L),
tp0= if_else( t == 0, 1L, 0L, missing =0L), #0 if predict all molecules to be phantom
fn0= if_else( t != 0, 1L, 0L, missing =0L),
tn0= if_else( f != 0, 1L, 0L, missing =0L),
fp0= if_else( f == 0, 1L, 0L, missing =0L),
tp_max= if_else( t == s_maxprop, 1L, 0L, missing =0L),
fp_max= if_else( f == s_maxprop, 1L, 0L, missing =0L),
tn_max= if_else( f != s_maxprop, 1L, 0L, missing =0L),
fn_max= if_else( t != s_maxprop, 1L, 0L, missing =0L))
The maximum read fraction method
false_counts_maxprop <-
read_counts %>%
summarize_at(vars(c("tp_max", "fp_max", "tn_max", "fn_max")),
list( ~ sum(.))) %>%
set_names(c("tp", "fp", "tn", "fn"))
false_counts_maxprop
The tor method
false_counts_min_cutoff <-
read_counts %>%
summarize_at(vars(c("tp", "fp", "tn", "fn")),
list( ~ sum(.)))
false_counts_min_cutoff
No purging
false_counts_nopurging <-
read_counts%>%
summarize(n_cugs = n(),
n_real= sum(t>0, na.rm = TRUE),
n_fantom = sum(f>0, na.rm = TRUE),
n_mol=n_real+n_fantom,
g =n_cugs- n_real,
u = n_mol-n_cugs,
tp=n_mol-g,
fp=u+g,
tn=0,
fn=0)
false_counts_nopurging
TOR cutoff
read_counts <-
read_counts %>%
mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
list(cum= ~ cumsum(.))) %>%
mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")),
list( ~ (last(.)-lag(., default =0)))) %>%
mutate(tp_t=tp_cum + tp0_cum,
fp_t=fp_cum + fp0_cum,
tn_t=tn_cum + tn0_cum,
fn_t=fn_cum + fn0_cum,
fpm= first(fp_t)- fp_t,
fnm= fn_t-first(fn_t),
tor_true= fnm/fpm)
false_counts_tor_cutoff <-
read_counts%>%
filter(retain) %>%
slice(1)%>%
select(c("s1", "s2", "qr", "tor", "tp_t", "fp_t", "tn_t", "fn_t", "fpm", "fnm", "tor_true"))
false_counts_tor_cutoff
Create comparison datatable
false_counts_dt <-
bind_rows(
list(no_purging=false_counts_nopurging %>%
select(c("tp", "fp", "tn", "fn")),
no_cutoff=false_counts_min_cutoff,
tor_cutoff=
false_counts_tor_cutoff %>%
select(c("tp_t", "fp_t", "tn_t", "fn_t")) %>%
set_names(c("tp", "fp", "tn", "fn")),
max_frac=false_counts_maxprop),
.id="approach") %>%
select(approach, fp,fn, tp, tn) %>%
mutate(fpr=fp/false_counts_nopurging$n_fantom,
fnr=fn/false_counts_nopurging$n_real)
false_counts_dt
Plots
Datatable for plotting
classification_curves <-
read_counts %>%
group_by(qr) %>%
slice(1L) %>%
ungroup() %>%
select( qr, qs, tor, retain, fp_t, FP, fn_t, FN, tp_t, tn_t, TP, TN, FPm, FNm, fpm, fnm, tor_true,o,r) %>%
mutate(fpr=fp_t/false_counts_nopurging$n_fantom,
fnr=fn_t/false_counts_nopurging$n_real)
classification_curves
Performance Plots
p_tradeoff <-
ggplot(classification_curves) +
geom_point(
aes(x = FPm,
y = FNm),
size=.5)+
geom_line(
aes(x = FPm,
y = FPm,
colour="1")
) +
geom_line(
aes(x = FPm,
y = 2*FPm,
colour="2"))+
geom_line(
aes(x = FPm,
y = 3*FPm,
colour="3"))+
geom_line(
aes(x = FPm,
y = 4*FPm,
colour="4"))+
geom_line(
aes(x = FPm,
y = 5*FPm,
colour="5"))+
geom_line(
aes(x = FPm,
y = 9*FPm,
colour="9"))+
scale_y_log10() +
theme_bw() +
theme(
legend.title = element_text(face = "bold")) +
scale_color_discrete(name = "TORC") +
labs(x="Marginal Decrease in False Positives (reduce phantom molecs) ",
y="Marginal Increase in False Negatives (discard real molecs)")
#ggsave(file.path(figures_dir, "validation_tradeoff.pdf"), p_tradeoff, width=9, height=6)
p_tradeoff

p6 <-
ggplot(classification_curves) +
geom_point(
aes(x = fp_t,
y = fn_t,
colour="true")) +
geom_line(
aes(x = fp_t,
y = fn_t,
colour="true")) +
geom_line(
aes(x = FP,
y = FN,
colour="predicted"))+
geom_point(
aes(x = FP,
y = FN,
colour="predicted"))+
geom_point(data=false_counts_dt ,
aes(x = fp,
y = fn,
shape=approach),
size=2) +
labs(x="False Positive Count",
y="False Negative Count") +
scale_y_sqrt() +
scale_x_sqrt()
#ggsave("peformance_groundtruth.pdf", p6, width=9, height=6)
p6

p7 <- ggplot(false_counts_dt
%>% filter(approach %in% c("no_cutoff", "tor_cutoff", "max_frac"))) +
geom_point(
aes(x = fp ,
y = fn,
color=approach),
size=2)
#ggsave("peformance_zoom.pdf", p7, width=9, height=6)
p7

sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.9.4 data.table_1.12.2 tictoc_1.0 furrr_0.1.0 future_1.13.0 broom_0.5.2 matrixStats_0.54.0 forcats_0.4.0
[9] stringr_1.4.0 dplyr_0.8.1 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.2 ggplot2_3.1.1 tidyverse_1.2.1
[17] rhdf5_2.28.0
loaded via a namespace (and not attached):
[1] tidyselect_0.2.5 xfun_0.7 listenv_0.7.0 haven_2.1.0 lattice_0.20-38 colorspace_1.4-1 generics_0.0.2 yaml_2.2.0
[9] rlang_0.3.4 pillar_1.4.1 glue_1.3.1 withr_2.1.2 modelr_0.1.4 readxl_1.3.1 plyr_1.8.4 munsell_0.5.0
[17] gtable_0.3.0 cellranger_1.1.0 rvest_0.3.4 codetools_0.2-16 labeling_0.3 knitr_1.23 parallel_3.6.0 Rcpp_1.0.1
[25] scales_1.0.0 backports_1.1.4 jsonlite_1.6 hms_0.4.2 digest_0.6.19 stringi_1.4.3 grid_3.6.0 rprojroot_1.3-2
[33] cli_1.1.0 tools_3.6.0 magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2 MASS_7.3-51.1 xml2_1.2.0
[41] lubridate_1.7.4 assertthat_0.2.1 httr_1.4.0 rstudioapi_0.10 Rhdf5lib_1.6.0 R6_2.4.0 globals_0.12.4 nlme_3.1-140
[49] compiler_3.6.0
---
title: "Phantom Purge"
subtitle: "Validation Analysis: Part II"
author: 
- name: Rick Farouni
  affiliation:
  - &cruk Génome Québec Innovation Centre, McGill University, Montreal, Canada
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---

# Prepare analysis workflow

### Set parameters

```{r setup}
# Package-level knitr option: evaluate every chunk relative to the project
# root. find_rstudio_root_file() signals an error when no .Rproj file is found,
# so fall back to the current working directory instead of aborting setup.
knitr::opts_knit$set(
  root.dir = tryCatch(
    rprojroot::find_rstudio_root_file(),
    error = function(e) getwd()
  )
)

# fig.width is a *chunk* option, so it has to be set through opts_chunk to take
# effect. The former `digit`/`scipen` entries were not knitr options; number
# formatting is controlled by options() below.
knitr::opts_chunk$set(fig.width = 15)

options(
  readr.show_progress = FALSE,
  digits = 5,
  scipen = 8,
  future.globals.maxSize = +Inf
)
```


### Set filepaths and parameters

```{r}
# Locate the project root. find_rstudio_root_file() raises an error (it never
# returns NULL) when no .Rproj file exists in the working directory or its
# parents, so the error is converted to NULL to make the fallback reachable.
project_dir <- tryCatch(
  rprojroot::find_rstudio_root_file(),
  error = function(e) NULL
)

if (is.null(project_dir)) {
  # Fall back to the current working directory and tell the user about it.
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
}
message(sprintf("Project directory: %s", project_dir))
```

### Load libraries

```{r message=FALSE, warning=FALSE}
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
# `multiprocess` is deprecated (and removed in current releases of the future
# package); `multisession` is the portable parallel backend.
plan(multisession)
```


### Load functions


```{r message=FALSE}
# Source the numbered pipeline scripts from <project_dir>/code, in order.
code_dir <- file.path(project_dir, "code")
invisible(lapply(
  c(
    "1_create_joined_counts_table.R",
    "2_create_counts_by_outcome_table.R",
    "3_estimate_sample_index_hopping_rate.R",
    "4_compute_summary_statistics.R",
    "5_reassign_hopped_reads.R",
    "6_purge_phantom_molecules.R",
    "7_call_cells.R",
    "8_summarize_purge.R",
    "9_plotting_functions.R"
  ),
  function(script) source(file.path(code_dir, script))
))
```

### Load data
```{r}
# Directory under the project root that holds the validation input table.
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
```

```{r}
# Read the joined read-count table. Column types are pinned so parsing does not
# depend on readr's guess from the first rows (e.g. a `label` column that
# starts with missing values); the spec matches the one readr reported for
# this file.
data <- read_tsv(
  file.path(validation_output_dir, "hiseq4000_inner_joined_with_labels.txt"),
  col_types = cols(
    cell = col_character(),
    gene = col_double(),
    umi = col_double(),
    s1_nonplexed = col_double(),
    s2_nonplexed = col_double(),
    s1_plexed = col_double(),
    s2_plexed = col_double(),
    outcome = col_character(),
    label = col_character()
  )
)
data
```


# Compute Index hopping rate

##  Estimates conditional on duplication level and label

```{r}
# Read counts of the multiplexed run, summed per duplication level `r`
# (total plexed reads of a molecule) and ground-truth `label`, then split into
# hopped / non-hopped reads per sample.
# NOTE(review): labels look like "<s1 status>,<s2 status>" with r/f/0
# presumably meaning real / fake / absent -- confirm against the labelling step.
summary_counts <-
  data %>%
  # Columns are addressed by name rather than by position (formerly .[c(6, 7)]).
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  group_by(r, label) %>%
  # Both alternatives are anchored, matching the molecule-level chunk below.
  summarize_at(vars(matches("^s1_pl|^s2_pl")), list(~ sum(.))) %>%
  mutate(
    s1_hopped = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0),
    s2_hopped = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0),
    s1_nonhopped = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0),
    s2_nonhopped = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0)
  ) %>%
  select(-s1_plexed, -s2_plexed)

summary_counts
```

## Estimates conditional on duplication level

```{r}
# Collapse the labels within each duplication level `r` and derive the
# per-level hopping rates and the share of reads attributed to sample 1.
summary_counts_conditional <-
  summary_counts %>%
  group_by(r) %>%
  summarize_at(vars(matches("^s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_conditional
```



## Marginal estimates

```{r}
# Totals over all duplication levels and labels, with the overall hopping
# rates. Column order matters: the p1 plot picks the SIHR columns from this
# table.
summary_counts_marginal <-
  summary_counts %>%
  ungroup() %>%
  summarize_at(vars(matches("^s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    SIHR = 1 - (s1_nonhopped + s2_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_marginal
```


```{r, fig.height=10}
# Hopping rates (in percent) and sample-1 read fraction by duplication level.
# The dashed lines are the marginal rates, also in percent; they are selected
# by column name so the plot does not break if the table's column order changes.
p1 <-
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r, y = SIHR_12 * 100, colour = "SIHR_12")) +
  geom_line(aes(x = r, y = SIHR_21 * 100, colour = "SIHR_21")) +
  geom_line(aes(x = r, y = frac_s1, colour = "frac_s1")) +
  geom_hline(
    yintercept =
      unlist(summary_counts_marginal[c("SIHR_12", "SIHR_21", "SIHR")]) * 100,
    linetype = "dashed"
  ) +
  xlim(0, 90) +
  ylim(0, 1)
p1
 
```


## Molecules

```{r}
# Molecule-level counterpart of the read-count summary: `r` is computed from
# the raw plexed read counts first, then every s* count column is turned into a
# 0/1 presence indicator so the sums count molecules instead of reads.
summary_mol_counts <-
  data %>%
  # Columns are addressed by name rather than by position (formerly .[c(6, 7)]).
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  mutate_at(vars(matches("^s")), list(~ as.integer(. != 0))) %>%
  group_by(r, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")), list(~ sum(.))) %>%
  mutate(
    s1_phantom = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0L),
    s2_phantom = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0L),
    s1_real = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0L),
    s2_real = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0L)
  ) %>%
  select(-s1_plexed, -s2_plexed)
summary_mol_counts
```
```{r}
# Overall molecule totals and phantom-molecule proportions.
summary_mol_counts_marginal <-
  summary_mol_counts %>%
  ungroup() %>%
  summarize_at(vars(matches("^s")), sum) %>%
  mutate(
    # proportion of hopped (phantom) molecules
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    # proportion of phantom molecules
    ppm_1 = s2_phantom / (s2_phantom + s1_real),
    ppm_2 = s1_phantom / (s1_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_marginal
```


```{r}
# Molecule totals and phantom-molecule proportions per duplication level `r`.
summary_mol_counts_conditional <-
  summary_mol_counts %>%
  group_by(r) %>%
  summarize_at(vars(matches("^s")), sum) %>%
  mutate(
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_conditional
```

```{r, fig.height=10}
# Proportion of phantom molecules by duplication level, one line per
# direction. The second legend key previously carried a stray leading space
# (" ppm_21"), which also affected the legend ordering.
p4 <- ggplot(summary_mol_counts_conditional) +
  geom_line(aes(x = r, y = ppm_12, colour = "ppm_12")) +
  geom_line(aes(x = r, y = ppm_21, colour = "ppm_21")) +
  #geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
  xlim(0, 300)
p4
#ggsave("phantom_molecules_validation.pdf", p4,  width =8, height = 5)
```



## Examine extent of contamination in cells

```{r}
# Per cell-barcode molecule counts split into phantom / real for each sample,
# plus a presence pattern and a cell status derived from that pattern.
summary_mol_counts_cell <-
  data %>%
  # Drop unlabelled rows explicitly. `label != "NA"` alone only removed missing
  # labels as a side effect of the comparison evaluating to NA.
  filter(!is.na(label), label != "NA") %>%
  # Turn read counts into 0/1 molecule-presence indicators.
  mutate_at(vars(matches("^s")), list(~ as.integer(. != 0))) %>%
  group_by(cell, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")), list(~ sum(.))) %>%
  # Here *_phantom counts phantom molecules observed in that sample.
  mutate(
    s1_phantom = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0L),
    s2_phantom = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0L),
    s1_real = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0L),
    s2_real = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0L)
  ) %>%
  select(-s1_plexed, -s2_plexed) %>%
  group_by(cell) %>%
  summarize_at(vars(matches("^s")), list(~ sum(.))) %>%
  # Presence pattern, in the order s1_phantom, s2_phantom, s1_real, s2_real.
  mutate_at(vars(matches("^s")), list(nonempty = ~ as.integer(. != 0))) %>%
  unite(label, matches("nonempty"), sep = ",") %>%
  mutate(
    cell_status = case_when(
      label %in% c("0,0,0,1", "0,0,1,0", "0,0,1,1") ~ "real",
      label %in% c("1,0,0,0", "0,1,0,0", "1,1,0,0", "1,0,0,1", "0,1,1,0") ~
        "phantom",
      TRUE ~ "contaminated"
    )
  ) %>%
  mutate(
    s1_total = s1_phantom + s1_real,
    s2_total = s2_phantom + s2_real,
    s1_ppm = s1_phantom / s1_total,
    s2_ppm = s2_phantom / s2_total
  )

summary_mol_counts_cell
```
```{r}
# Number of cell-barcodes per (cell_status, presence pattern), largest first.
cell_status_tally <- tally(
  group_by(summary_mol_counts_cell, cell_status, label),
  sort = TRUE
)
cell_status_tally
```

```{r}
# Cell-barcodes whose status is anything other than "real".
# NOTE(review): the hard-coded 1502 + 6 added below is not derived anywhere in
# this notebook -- confirm where these counts come from and document them.
n_affected_cells <-
  sum(cell_status_tally$n[cell_status_tally$cell_status != "real"]) + 1502 + 6
n_affected_cells
```


```{r}
# Cell-barcodes with at least one molecule, counted separately in each sample
# and added together.
n_total_cells <- with(
  summary_mol_counts_cell,
  sum(s1_total != 0) + sum(s2_total != 0)
)
n_total_cells
```

Proportion of affected cells

```{r}
# Share of affected cell-barcodes; wrapping the assignment in parentheses
# prints the value.
(p_affected_cells <- n_affected_cells / n_total_cells)
```

For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.

Sample 1 plot

```{r, fig.height=10}
# Sample 1: phantom vs. total molecules per non-real cell-barcode (log-log).
p2 <-
  summary_mol_counts_cell %>%
  filter(cell_status != "real", s1_phantom > 0) %>%
  ggplot(aes(x = s1_total, y = s1_phantom, colour = cell_status)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

p2
# NOTE(review): this commented-out call reuses the file name of the p4 export.
#ggsave("phantom_molecules_validation.pdf", p2,  width =8, height = 5)
```

Sample 2 plot


```{r, fig.height=10}
# Sample 2: phantom vs. total molecules per non-real cell-barcode (log-log).
p3 <-
  summary_mol_counts_cell %>%
  filter(cell_status != "real", s2_phantom > 0) %>%
  ggplot(aes(x = s2_total, y = s2_phantom, colour = cell_status)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

p3
#ggsave("phantom_cells_validation_s2.pdf", p3,  width =8, height = 5)
```

# Run workflow on multiplexed data

```{r}
# Input for the purging workflow: labelled rows only, the non-multiplexed
# columns dropped, and the plexed counts renamed to s1 / s2.
read_counts <-
  data %>%
  # Drop unlabelled rows explicitly instead of relying on NA comparison.
  filter(!is.na(label), label != "NA") %>%
  # Rename by column name; the previous positional set_names() silently
  # depended on the exact column order of the input file.
  select(cell, gene, umi, s1 = s1_plexed, s2 = s2_plexed, outcome, label)
read_counts
```


```{r}
# Number of samples; the sample columns are the S columns after cell/gene/umi.
S <- 2
sample_names <- colnames(read_counts)[3 + seq_len(S)]
sample_names
```



```{r}
tic("Step 2: creating outcome counts datatable with grouping vars")

# The ground-truth label is withheld from the workflow input.
outcome_counts <- create_outcome_counts(
  select(read_counts, -label),
  sample_names,
  min_frac = 0.8
)
toc()


outcome_counts
```

```{r}
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
# Fit the hopping-rate model on the outcome counts for S samples.
fit_out <- estimate_hopping_rate(outcome_counts, S)
toc()
fit_out
```



```{r}
  # Molecular complexity profile and summary statistics, using the estimated
  # phat from the fitted hopping-rate model.
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <- compute_summary_stats(
    outcome_counts,
    fit_out$glm_estimates$phat,
    sample_names
  )
  toc()
  summary_stats
```

Set the trade-off ratio cost cutoff (*torc*). The parameter *torc* represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of *torc* are expected to be within the range of 1-5.

```{r}
# Trade-off ratio cost cutoff handed to the read-reassignment step.
torc <- 3 
```


```{r}
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")

# Reassign reads and flag which observations are retained at the chosen torc.
outcome_counts <- reassign_reads_and_mark_retained_observations(
  outcome_counts,
  summary_stats,
  sample_names,
  fit_out,
  torc
)

# Get the trade-off ratio cutoff.
summary_stats <- get_threshold(outcome_counts, summary_stats)

toc()
```



```{r}
tic("Step 6: Purge and save read counts datatable to disk")

# Attach the per-outcome results to every read-count row, then drop the key.
# NOTE(review): `sample_names` is an external character vector used inside
# select(); newer tidyselect releases flag this form as ambiguous.
read_counts <-
  read_counts %>%
  select(outcome, cell, umi, gene, sample_names, label) %>%
  left_join(outcome_counts, by = "outcome") %>%
  select(-outcome)

toc()

```


### Compare the SIHR estimates with ground truth estimates

```{r, fig.width=10}
# Ground-truth hopping rates per duplication level (both directions) against
# the true marginal rate and the model estimate with its interval.
p5 <-
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r, y = SIHR_12, colour = "12")) +
  geom_line(aes(x = r, y = SIHR_21, colour = "21")) +
  geom_hline(
    aes(yintercept = summary_counts_marginal$SIHR, colour = "true mean"),
    linetype = "solid",
    size = .5
  ) +
  geom_hline(
    aes(yintercept = fit_out$glm_estimates$SIHR, colour = "estimate"),
    linetype = "solid",
    size = .1
  ) +
  # The interval is on the hopping-rate scale, i.e. 1 - phat bounds.
  geom_linerange(
    data = summary_counts_conditional,
    aes(
      x = r,
      ymax = 1 - fit_out$glm_estimates$phat_low,
      ymin = 1 - fit_out$glm_estimates$phat_high,
      colour = "estimate"
    ),
    size = .5
  ) +
  xlim(1, 210) +
  ylim(0.002, 0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5
```



# Determine the number of false positives and false negatives

```{r}
# Per-row classification flags against the ground-truth label.
# `t` is the sample holding the real molecule, `f` the sample holding the
# phantom one (NA when the label has neither). Every flag is 1L when its
# comparison is TRUE and 0L when it is FALSE or NA.
read_counts <-
  read_counts %>%
  ungroup() %>%
  arrange(desc(qr)) %>%
  mutate(
    t = case_when(
      label %in% c("r,f", "r,0") ~ 1,
      label %in% c("f,r", "0,r") ~ 2
    ),
    f = case_when(
      label %in% c("f,r", "f,0") ~ 1,
      label %in% c("r,f", "0,f") ~ 2
    )
  ) %>%
  mutate(
    # workflow assignment `s`
    tp = as.integer((t == s) %in% TRUE),
    fp = as.integer((f == s) %in% TRUE),
    tn = as.integer((f != s) %in% TRUE),
    fn = as.integer((t != s) %in% TRUE),
    # assignment 0, i.e. predicting every molecule to be phantom
    tp0 = as.integer((t == 0) %in% TRUE),
    fn0 = as.integer((t != 0) %in% TRUE),
    tn0 = as.integer((f != 0) %in% TRUE),
    fp0 = as.integer((f == 0) %in% TRUE),
    # maximum read fraction assignment `s_maxprop`
    tp_max = as.integer((t == s_maxprop) %in% TRUE),
    fp_max = as.integer((f == s_maxprop) %in% TRUE),
    tn_max = as.integer((f != s_maxprop) %in% TRUE),
    fn_max = as.integer((t != s_maxprop) %in% TRUE)
  )
```


### The maximum read fraction method


```{r}
# Confusion counts for the maximum read fraction assignment.
false_counts_maxprop <-
  read_counts %>%
  summarize(
    tp = sum(tp_max),
    fp = sum(fp_max),
    tn = sum(tn_max),
    fn = sum(fn_max)
  )
false_counts_maxprop
```

### The tor method


```{r}
# Confusion counts for the workflow assignment over all rows (no cutoff).
false_counts_min_cutoff <-
  summarize_at(read_counts, c("tp", "fp", "tn", "fn"), sum)
false_counts_min_cutoff
```

### No purging

```{r}
# Baseline when nothing is purged: everything is kept, so tn and fn are zero.
false_counts_nopurging <-
  read_counts %>%
  summarize(
    n_cugs = n(),                         # rows in read_counts
    n_real = sum(!is.na(t) & t > 0),      # rows with a real molecule
    n_fantom = sum(!is.na(f) & f > 0),    # rows with a phantom molecule
    n_mol = n_real + n_fantom,
    g = n_cugs - n_real,
    u = n_mol - n_cugs,
    tp = n_mol - g,
    fp = u + g,
    tn = 0,
    fn = 0
  )

false_counts_nopurging
```


###  TOR cutoff

```{r}
# Running confusion counts along the current row order of read_counts.
read_counts <-
  read_counts %>%
  # Adds a forward cumulative sum column `<name>_cum` for each listed flag.
  mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
            list(cum= ~ cumsum(.)))  %>%
  # Overwrites the four workflow `_cum` columns with "total minus the sum of
  # all preceding rows", i.e. the sum from the current row to the last row.
  # The `*0_cum` columns keep their forward cumulative sums.
  mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")), 
            list( ~ (last(.)-lag(., default =0)))) %>%
  # Combined counts per row, then the change relative to the first row.
  # fpm is 0 on the first row, so tor_true is not finite there.
  mutate(tp_t=tp_cum + tp0_cum,
         fp_t=fp_cum + fp0_cum,
         tn_t=tn_cum + tn0_cum,
         fn_t=fn_cum + fn0_cum,
         fpm= first(fp_t)- fp_t, 
         fnm= fn_t-first(fn_t),
         tor_true= fnm/fpm)
```



```{r}
# Counts at the TOR cutoff: the first row, in current order, flagged `retain`.
false_counts_tor_cutoff <-
  read_counts %>%
  filter(retain) %>%
  slice(1) %>%
  select(s1, s2, qr, tor, tp_t, fp_t, tn_t, fn_t, fpm, fnm, tor_true)
false_counts_tor_cutoff
```
## Create comparison datatable

```{r}
# One row per approach with its confusion counts; the rates use the no-purging
# phantom / real totals as denominators.
false_counts_dt <-
  list(
    no_purging = select(false_counts_nopurging, tp, fp, tn, fn),
    no_cutoff = false_counts_min_cutoff,
    tor_cutoff = select(
      false_counts_tor_cutoff,
      tp = tp_t, fp = fp_t, tn = tn_t, fn = fn_t
    ),
    max_frac = false_counts_maxprop
  ) %>%
  bind_rows(.id = "approach") %>%
  select(approach, fp, fn, tp, tn) %>%
  mutate(
    fpr = fp / false_counts_nopurging$n_fantom,
    fnr = fn / false_counts_nopurging$n_real
  )
false_counts_dt
```



## Plots

### Datatable for plotting


```{r}
# One row per distinct `qr` value (the first in current order), keeping the
# predicted and ground-truth counts needed by the plots below.
classification_curves <-
  read_counts %>%
  group_by(qr) %>%
  slice(1L) %>%
  ungroup() %>%
  select(
    qr, qs, tor, retain,
    fp_t, FP, fn_t, FN, tp_t, tn_t, TP, TN,
    FPm, FNm, fpm, fnm, tor_true, o, r
  ) %>%
  mutate(
    fpr = fp_t / false_counts_nopurging$n_fantom,
    fnr = fn_t / false_counts_nopurging$n_real
  )
classification_curves
```

### Performance Plots



```{r fig.height=7, fig.width=10, message=FALSE, warning=FALSE}
# Marginal FN increase against marginal FP decrease, with reference lines
# y = k * FPm for several trade-off ratios k (legend "TORC").
p_tradeoff <-
  ggplot(classification_curves) +
  geom_point(aes(x = FPm, y = FNm), size = .5) +
  # One reference line per ratio; force(k) pins the value each layer uses.
  lapply(
    c(1, 2, 3, 4, 5, 9),
    function(k) {
      force(k)
      geom_line(aes(x = FPm, y = k * FPm, colour = as.character(k)))
    }
  ) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.title = element_text(face = "bold")) +
  scale_color_discrete(name = "TORC") +
  labs(
    x = "Marginal Decrease in False Positives (reduce phantom molecs) ",
    y = "Marginal Increase in False Negatives (discard real molecs)"
  )

#ggsave(file.path(figures_dir, "validation_tradeoff.pdf"), p_tradeoff, width=9, height=6)

p_tradeoff
```

```{r fig.height=6}
# FP vs. FN counts along the curve: ground truth ("true") against the
# workflow's own predicted counts, with each approach overlaid as a point.
p6 <-
  ggplot(classification_curves) +
  geom_point(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(
    data = false_counts_dt,
    aes(x = fp, y = fn, shape = approach),
    size = 2
  ) +
  labs(
    x = "False Positive Count",
    y = "False Negative Count"
  ) +
  scale_y_sqrt() +
  scale_x_sqrt()
#ggsave("peformance_groundtruth.pdf", p6, width=9, height=6)
p6
```



```{r fig.height=8}
# Zoom on the three purging approaches (the no-purging baseline is left out).
p7 <-
  false_counts_dt %>%
  filter(approach %in% c("no_cutoff", "tor_cutoff", "max_frac")) %>%
  ggplot() +
  geom_point(aes(x = fp, y = fn, color = approach), size = 2)


#ggsave("peformance_zoom.pdf", p7, width=9, height=6)
p7
```
```{r}
sessionInfo()
```

