Prepare analysis workflow

Set parameters

knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(readr.show_progress = FALSE,
        digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)

Set filepaths and parameters

project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /home/rfarouni/Documents/index_hopping

Load libraries

library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)

Load functions

code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))

Load data

validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
data <- read_tsv(file.path(validation_output_dir,
                           "hiseq4000_inner_joined_with_labels.txt"))
Parsed with column specification:
cols(
  cell = col_character(),
  gene = col_double(),
  umi = col_double(),
  s1_nonplexed = col_double(),
  s2_nonplexed = col_double(),
  s1_plexed = col_double(),
  s2_plexed = col_double(),
  outcome = col_character(),
  label = col_character()
)
data

Compute Index hopping rate

Estimates conditional on duplication level and label

summary_counts <-
    data %>%
  mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
  arrange(r) %>%
  group_by(r, label) %>%
  summarize_at(vars(matches("^s1_pl|s2_pl")), 
               list(~ sum(.)))  %>%
  mutate(s1_hopped=if_else(label %in% c("0,f", "r,f"), s2_plexed,0),
         s2_hopped=if_else(label %in% c("f,0", "f,r"), s1_plexed,0),
         s1_nonhopped=if_else(label %in% c("r,0", "r,f"), s1_plexed,0),
         s2_nonhopped=if_else(label %in% c("0,r", "f,r"), s2_plexed,0)) %>%
  select(-s1_plexed,- s2_plexed)
summary_counts

Estimates conditional on duplication level

summary_counts_conditional <-
summary_counts  %>%
  group_by(r) %>%
  summarize_at(vars(matches("^s")), 
               list(~ sum(.)))%>%
  mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
         SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
         frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_conditional

Marginal estimates

summary_counts_marginal <- 
  summary_counts %>%
  ungroup() %>%
  summarize_at(vars(matches("^s")), 
               list(~ sum(.)))%>%
  mutate(SIHR_12 = s1_hopped/(s1_hopped+s1_nonhopped),
         SIHR_21 = s2_hopped/(s2_hopped+s2_nonhopped),
         SIHR=1-(s1_nonhopped+s2_nonhopped)/(s1_hopped+s1_nonhopped +s2_hopped+s2_nonhopped),
         frac_s1= (s1_hopped+s1_nonhopped)/ (s1_hopped+s1_nonhopped+s2_hopped+s2_nonhopped))
summary_counts_marginal
p1 <- 
  ggplot(summary_counts_conditional) +
    geom_line(aes(x = r,
                    y = SIHR_12*100,
               colour="SIHR_12"))+
    geom_line(aes(x = r,
                    y = SIHR_21*100,
               colour="SIHR_21")) +
      geom_line(aes(x = r,
                    y = frac_s1,
               colour="frac_s1")) +
    geom_hline(yintercept = unlist(summary_counts_marginal[5:7])* c(100, 100, 100),
               linetype="dashed") +
    xlim(0,90) + 
  ylim(0,1) 
p1

NA

Molecules

summary_mol_counts <-
  data %>%
  mutate(r=as.integer(rowSums(.[c(6,7)]))) %>%
  arrange(r) %>%  
  mutate_at(vars(matches("^s")), 
            list(~ as.integer(.!=0)))  %>%
  group_by(r, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")), 
               list(~ sum(.)))  %>%
    mutate(s1_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
         s2_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
         s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
         s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
  select(-s1_plexed,- s2_plexed) 
summary_mol_counts 
summary_mol_counts_marginal <-
  summary_mol_counts%>%
  ungroup() %>%
  summarize_at(vars(matches("^s")), 
               list(~ sum(.))) %>%
  mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),# prop hopped phantom molec
         ppm_21 = s2_phantom/(s2_phantom+s2_real),
         ppm_1 = s2_phantom/(s2_phantom+s1_real), # prop phantom molec
         ppm_2 = s1_phantom/(s1_phantom+s2_real),
         ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
         frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_marginal
summary_mol_counts_conditional <-
  summary_mol_counts%>%
  group_by(r) %>%
  summarize_at(vars(matches("^s")), 
               list(~ sum(.))) %>%
  mutate(ppm_12 = s1_phantom/(s1_phantom+s1_real),
         ppm_21 = s2_phantom/(s2_phantom+s2_real),
         ppm=(s1_phantom+s2_phantom)/(s1_phantom+s1_real+s2_phantom+s2_real),
         frac_mol_s1= (s1_phantom+s1_real)/ (s1_phantom+s1_real+s2_phantom+s2_real))
summary_mol_counts_conditional
p4 <- ggplot(summary_mol_counts_conditional) +
  geom_line(aes(x = r,
                  y = ppm_12,
             colour="ppm_12"))+
  geom_line(aes(x = r,
                  y =  ppm_21,
             colour=" ppm_21")) +
  #geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
  xlim(0,300) 
p4

 #ggsave("phantom_molecules_validation.pdf", p4,  width =8, height = 5)

Examine extent of contamination in cells

summary_mol_counts_cell<-
  data %>%
  filter(label!="NA") %>%
  mutate_at(vars(matches("^s")), 
            list(~ as.integer(.!=0)))  %>%
  group_by(cell, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")), 
               list(~ sum(.)))  %>%
    mutate(s1_phantom=if_else(label %in% c("f,0", "f,r"), s1_plexed,0L),
         s2_phantom=if_else(label %in% c("0,f", "r,f"), s2_plexed,0L),
         s1_real=if_else(label %in% c("r,0", "r,f"), s1_plexed,0L),
         s2_real=if_else(label %in% c("0,r", "f,r"), s2_plexed,0L)) %>%
  select(-s1_plexed,- s2_plexed) %>%
  group_by(cell) %>%
  summarize_at(vars(matches("^s")), 
               list(~ sum(.))) %>% 
    mutate_at(vars(matches("^s")), 
            list(nonempty= ~ as.integer(.!=0)))  %>%
  unite(label,matches("nonempty"), sep=",") %>%
  mutate(cell_status= 
           case_when(label %in% c("0,0,0,1","0,0,1,0") ~ "real",
                     label %in% c("0,0,1,1") ~ "real",
                     label %in% c("1,0,0,0","0,1,0,0", "1,1,0,0") ~  "phantom",
                     label %in% c("1,0,0,1","0,1,1,0") ~  "phantom",
                     TRUE ~  "contaminated"))  %>%
  mutate(s1_total =(s1_phantom+s1_real),
         s2_total =(s2_phantom+s2_real),
         s1_ppm = s1_phantom/(s1_total),
         s2_ppm = s2_phantom/(s2_total))
  
summary_mol_counts_cell
cell_status_tally <-
  summary_mol_counts_cell %>%
  group_by(cell_status,label) %>%
  tally(sort=TRUE)
cell_status_tally
n_affected_cells <- 
  cell_status_tally %>%
  ungroup() %>%
  filter(cell_status!="real") %>% 
  summarise(n=sum(n)) %>% 
  pull(n)
n_affected_cells <- n_affected_cells + 1502 +6
n_affected_cells
[1] 64579
n_total_cells <-
  summary_mol_counts_cell %>%
  mutate_at(vars(matches("_total")), 
            list(~ as.integer(.!=0))) %>%
  ungroup() %>%
  summarise_at(vars(matches("_total")), list(~sum(.))) %>%
  mutate(cells_total= s1_total+s2_total)%>%
  pull(cells_total)
n_total_cells 
[1] 322321

Proportion of affected cells

p_affected_cells <-  n_affected_cells/n_total_cells
p_affected_cells
[1] 0.20036

For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.

Sample 1 plot

p2 <- ggplot(summary_mol_counts_cell %>%
               filter(cell_status!="real" & s1_phantom >0 )) +
  geom_point(aes(x = s1_total,
                  y = s1_phantom,
                 colour=cell_status)) +
  scale_x_log10() +
  scale_y_log10()
p2

 #ggsave("phantom_molecules_validation.pdf", p2,  width =8, height = 5)

Sample 2 plot

p3 <- ggplot(summary_mol_counts_cell %>%
               filter(cell_status!="real" & s2_phantom >0 )) +
  geom_point(aes(x = s2_total,
                  y = s2_phantom,
                 colour=cell_status)) +
  scale_x_log10() +
  scale_y_log10()
p3

#ggsave("phantom_cells_validation_s2.pdf", p3,  width =8, height = 5)

Run workflow on multiplexed data

read_counts <- 
  data %>% 
  filter(label!="NA")%>%
  select(-ends_with("nonplexed")) %>%
  set_names(c("cell", "gene", "umi", "s1", "s2", "outcome", "label"))
read_counts
S <- 2
sample_names <- colnames(read_counts)[4:(S+3)]
sample_names
[1] "s1" "s2"
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts%>%
                                          select(-label), 
                                        sample_names,  
                                        min_frac=0.8)
toc()
Step 2: creating outcome counts datatable with grouping vars: 1.591 sec elapsed
outcome_counts
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S
    )
  toc()
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.083 sec elapsed
fit_out 
$glm_estimates

$chimera_counts
NA
  # compute_molecular_complexity_profile
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
Step 4: compute molecular complexity profile and other summary statistics: 0.16 sec elapsed
summary_stats
$summary_estimates

$marginal

$conditional

$pi_r_hat
NA

Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.

torc <- 3 
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the tradoff ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)
  toc()
Step 5: reassign read counts, determine cutoff, and mark retained observations: 1.176 sec elapsed
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
  left_join(read_counts %>%
    select(outcome, cell, umi, gene, sample_names, label),
  outcome_counts,
  by = c("outcome")
  ) %>%
  select(-outcome)
toc()
Step 6: Purge and save read counts datatable to disk: 2.166 sec elapsed

Compare the SIHR estimates with ground truth estimates

p5 <- 
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r,
                  y = SIHR_12,
             colour="12"))+
  geom_line(aes(x = r,
                  y = SIHR_21,
             colour="21"))  +
geom_hline(aes(yintercept =  summary_counts_marginal$SIHR, 
               colour="true mean"), 
           linetype="solid",
           size=.5)   +
geom_hline(aes(yintercept =  fit_out$glm_estimates$SIHR, 
               colour="estimate"),
           linetype="solid",
           size=.1)   +
    geom_linerange(data=summary_counts_conditional, 
                  aes(x=r,
                      ymax=1-fit_out$glm_estimates$phat_low,
                      ymin=1-fit_out$glm_estimates$phat_high,
                      colour="estimate"), 
                  size=.5)+ 
  xlim(1,210) +
  ylim(0.002,0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5

Determine the number of false positives and false negatives

read_counts <-
    read_counts %>%
  ungroup() %>%
  arrange(-qr) %>%
  mutate(t= case_when(
      label %in% c("f,r","0,r") ~ 2,
      label %in% c("r,f","r,0") ~ 1    ),
    f= case_when(
      label %in% c("f,r","f,0") ~ 1,
      label %in% c("r,f","0,f") ~ 2    )) %>%
  mutate(tp= if_else( t == s, 1L, 0L, missing =0L),
         fp= if_else( f == s, 1L, 0L, missing =0L),
         tn= if_else( f != s, 1L, 0L, missing =0L),
         fn= if_else( t != s, 1L, 0L, missing =0L),
         tp0= if_else( t == 0, 1L, 0L, missing =0L), #0 if  predict all molecules to be phantom
         fn0= if_else( t != 0, 1L, 0L, missing =0L),
         tn0= if_else( f != 0, 1L, 0L, missing =0L),
         fp0= if_else( f == 0, 1L, 0L, missing =0L),
         tp_max= if_else( t == s_maxprop, 1L, 0L, missing =0L),
         fp_max= if_else( f == s_maxprop, 1L, 0L, missing =0L),
         tn_max= if_else( f != s_maxprop, 1L, 0L, missing =0L),
         fn_max= if_else( t != s_maxprop, 1L, 0L, missing =0L)) 

The maximum read fraction method

false_counts_maxprop <-
  read_counts %>%
  summarize_at(vars(c("tp_max", "fp_max", "tn_max", "fn_max")),
            list( ~ sum(.))) %>%
  set_names(c("tp", "fp", "tn", "fn"))
false_counts_maxprop

The tor method

false_counts_min_cutoff <-
  read_counts %>%
  summarize_at(vars(c("tp", "fp", "tn", "fn")),
            list( ~ sum(.))) 
false_counts_min_cutoff

No purging

false_counts_nopurging <- 
  read_counts%>%
  summarize(n_cugs = n(),
            n_real= sum(t>0, na.rm = TRUE),
            n_fantom = sum(f>0, na.rm = TRUE),
            n_mol=n_real+n_fantom,
            g =n_cugs- n_real,
            u = n_mol-n_cugs,
            tp=n_mol-g,
            fp=u+g,
            tn=0,
            fn=0)
false_counts_nopurging

TOR cutoff

read_counts <-
  read_counts %>%
  mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
            list(cum= ~ cumsum(.)))  %>%
  mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")), 
            list( ~ (last(.)-lag(., default =0)))) %>%
  mutate(tp_t=tp_cum + tp0_cum,
         fp_t=fp_cum + fp0_cum,
         tn_t=tn_cum + tn0_cum,
         fn_t=fn_cum + fn0_cum,
         fpm= first(fp_t)- fp_t, 
         fnm= fn_t-first(fn_t),
         tor_true= fnm/fpm)
false_counts_tor_cutoff <-
  read_counts%>%
  filter(retain) %>%
  slice(1)%>%
  select(c("s1", "s2", "qr", "tor", "tp_t", "fp_t", "tn_t", "fn_t", "fpm", "fnm", "tor_true"))  
false_counts_tor_cutoff

Create comparison datatable

false_counts_dt <-
  bind_rows(
    list(no_purging=false_counts_nopurging %>%
           select(c("tp", "fp", "tn", "fn")),
         no_cutoff=false_counts_min_cutoff,
         tor_cutoff=
           false_counts_tor_cutoff %>%
           select(c("tp_t", "fp_t", "tn_t", "fn_t")) %>%
           set_names(c("tp", "fp", "tn", "fn")),
         max_frac=false_counts_maxprop),
    .id="approach") %>%
  select(approach, fp,fn, tp, tn) %>%
  mutate(fpr=fp/false_counts_nopurging$n_fantom,
         fnr=fn/false_counts_nopurging$n_real)
false_counts_dt

Plots

Datatable for plotting

classification_curves <-
  read_counts %>% 
  group_by(qr) %>%
  slice(1L) %>%
  ungroup() %>%
  select( qr, qs, tor, retain, fp_t, FP, fn_t, FN, tp_t, tn_t, TP, TN, FPm, FNm, fpm, fnm, tor_true,o,r) %>%
  mutate(fpr=fp_t/false_counts_nopurging$n_fantom,
         fnr=fn_t/false_counts_nopurging$n_real)
classification_curves

Performance Plots

p_tradeoff <-  
    ggplot(classification_curves) + 
    geom_point(
      aes(x = FPm,
          y = FNm),
      size=.5)+
    geom_line(
      aes(x = FPm,
          y = FPm,
          colour="1")
    ) +
    geom_line(
      aes(x = FPm,
          y = 2*FPm,
          colour="2"))+
    geom_line(
      aes(x = FPm,
          y = 3*FPm,
          colour="3"))+
    geom_line(
      aes(x = FPm,
          y = 4*FPm,
          colour="4"))+
    geom_line(
      aes(x = FPm,
          y = 5*FPm,
          colour="5"))+
    geom_line(
      aes(x = FPm,
          y = 9*FPm,
          colour="9"))+
    scale_y_log10() +
    theme_bw()  +
      theme(
        legend.title = element_text(face = "bold")) + 
      scale_color_discrete(name = "TORC") +
    labs(x="Marginal Decrease in False Positives (reduce phantom molecs) ",
         y="Marginal Increase in False Negatives (discard real molecs)") 
    
#ggsave(file.path(figures_dir, "validation_tradeoff.pdf"), p_tradeoff, width=9, height=6)
p_tradeoff

p6 <-
  ggplot(classification_curves)  + 
    geom_point(
             aes(x = fp_t,
                  y = fn_t,
                 colour="true")) +
  geom_line(
             aes(x = fp_t,
                  y = fn_t,
                 colour="true")) +
    geom_line(
             aes(x = FP,
                  y = FN,
                 colour="predicted"))+
      geom_point(
             aes(x = FP,
                  y = FN,
                 colour="predicted"))+ 
  geom_point(data=false_counts_dt ,
             aes(x = fp,
                  y = fn,
                  shape=approach),
             size=2) +
    labs(x="False Positive Count",
       y="False Negative Count")  + 
    scale_y_sqrt() + 
  scale_x_sqrt() 
#ggsave("peformance_groundtruth.pdf", p6, width=9, height=6)
 p6

p7 <- ggplot(false_counts_dt
              %>% filter(approach %in% c("no_cutoff", "tor_cutoff", "max_frac")))  + 
  geom_point(
             aes(x = fp ,
                  y = fn,
                  color=approach),
             size=2) 
#ggsave("peformance_zoom.pdf", p7, width=9, height=6)
p7

sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8   
 [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] cowplot_0.9.4      data.table_1.12.2  tictoc_1.0         furrr_0.1.0        future_1.13.0      broom_0.5.2        matrixStats_0.54.0 forcats_0.4.0     
 [9] stringr_1.4.0      dplyr_0.8.1        purrr_0.3.2        readr_1.3.1        tidyr_0.8.3        tibble_2.1.2       ggplot2_3.1.1      tidyverse_1.2.1   
[17] rhdf5_2.28.0      

loaded via a namespace (and not attached):
 [1] tidyselect_0.2.5 xfun_0.7         listenv_0.7.0    haven_2.1.0      lattice_0.20-38  colorspace_1.4-1 generics_0.0.2   yaml_2.2.0      
 [9] rlang_0.3.4      pillar_1.4.1     glue_1.3.1       withr_2.1.2      modelr_0.1.4     readxl_1.3.1     plyr_1.8.4       munsell_0.5.0   
[17] gtable_0.3.0     cellranger_1.1.0 rvest_0.3.4      codetools_0.2-16 labeling_0.3     knitr_1.23       parallel_3.6.0   Rcpp_1.0.1      
[25] scales_1.0.0     backports_1.1.4  jsonlite_1.6     hms_0.4.2        digest_0.6.19    stringi_1.4.3    grid_3.6.0       rprojroot_1.3-2 
[33] cli_1.1.0        tools_3.6.0      magrittr_1.5     lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.2  MASS_7.3-51.1    xml2_1.2.0      
[41] lubridate_1.7.4  assertthat_0.2.1 httr_1.4.0       rstudioapi_0.10  Rhdf5lib_1.6.0   R6_2.4.0         globals_0.12.4   nlme_3.1-140    
[49] compiler_3.6.0  
---
title: "Phantom Purge"
subtitle: "Validation Analysis: Part II"
author: 
- name: Rick Farouni
  affiliation:
  - &cruk Génome Québec Innovation Centre, McGill University, Montreal, Canada
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---

# Prepare analysis workflow

### Set parameters

```{r setup}
# Package-level knitr option: evaluate every chunk from the RStudio project root.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())

# `fig.width` is a chunk option, so it has to be set through opts_chunk;
# handed to opts_knit it never reaches the figure device.
knitr::opts_chunk$set(fig.width = 15)

# `digits` and `scipen` are R session options (not knitr options), so they are
# set here only, together with the readr and future settings.
options(
  readr.show_progress = FALSE,
  digits = 5,
  scipen = 8,
  future.globals.maxSize = +Inf
)
```


### Set filepaths and parameters

```{r}
# Locate the project root (the directory that holds the RStudio project file).
# rprojroot signals an error -- it does not return NULL -- when no root can be
# found, so the fallback below has to be driven by tryCatch().
project_dir <- tryCatch(
  rprojroot::find_rstudio_root_file(),
  error = function(e) NULL
)

if (is.null(project_dir)) {
  # Fall back to the current working directory and tell the user about it.
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
}
message(sprintf("Project directory: %s",
                project_dir))
```

### Load libraries

```{r message=FALSE, warning=FALSE}
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
# `multiprocess` is deprecated in the future package (and defunct in current
# releases); `multisession` is the recommended, platform-independent replacement.
plan(multisession)
```


### Load functions


```{r message=FALSE}
# Source the workflow's helper scripts from <project_dir>/code, in step order.
code_dir <- file.path(project_dir, "code")
invisible(lapply(
  file.path(
    code_dir,
    c(
      "1_create_joined_counts_table.R",
      "2_create_counts_by_outcome_table.R",
      "3_estimate_sample_index_hopping_rate.R",
      "4_compute_summary_statistics.R",
      "5_reassign_hopped_reads.R",
      "6_purge_phantom_molecules.R",
      "7_call_cells.R",
      "8_summarize_purge.R",
      "9_plotting_functions.R"
    )
  ),
  source
))
```

### Load data
```{r}
# Directory under <project_dir>/data from which the validation table is read.
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
```

```{r}
# Read the labelled, inner-joined read-count table.
# Column types are declared explicitly instead of being guessed from the
# leading rows, so a run of missing values at the top of the file (e.g. in
# `label`) cannot change the parsed type. The specification mirrors the one
# readr reported for this file.
data <- read_tsv(
  file.path(validation_output_dir, "hiseq4000_inner_joined_with_labels.txt"),
  col_types = cols(
    cell = col_character(),
    gene = col_double(),
    umi = col_double(),
    s1_nonplexed = col_double(),
    s2_nonplexed = col_double(),
    s1_plexed = col_double(),
    s2_plexed = col_double(),
    outcome = col_character(),
    label = col_character()
  )
)
data
```


# Compute Index hopping rate

##  Estimates conditional on duplication level and label

```{r}
# Read counts aggregated by duplication level (r) and ground-truth label.
# The label is a "<sample 1>,<sample 2>" pair; presumably "r" = real,
# "f" = fake (hopped) and "0" = absent -- TODO confirm against the labelling step.
summary_counts <-
  data %>%
  # r = reads of the molecule summed over the two multiplexed samples; the
  # columns are addressed by name instead of by position (6, 7) so that a
  # change in column order cannot silently pick the wrong columns
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  group_by(r, label) %>%
  # both alternatives are anchored, matching the molecule-level chunk below
  summarize_at(vars(matches("^s1_pl|^s2_pl")),
               list(~ sum(.))) %>%
  # hopped reads are taken from the sample whose label entry is "f",
  # non-hopped reads from the sample whose label entry is "r"
  mutate(s1_hopped = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0),
         s2_hopped = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0),
         s1_nonhopped = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0),
         s2_nonhopped = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0)) %>%
  select(-s1_plexed, -s2_plexed)

summary_counts
```

## Estimates conditional on duplication level

```{r}
# Collapse the per-label counts to one row per duplication level r, then derive
# the two directional hopping rates and the share of reads belonging to sample 1.
summary_counts_conditional <-
  summary_counts %>%
  group_by(r) %>%
  summarize_at(vars(starts_with("s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_conditional
```



## Marginal estimates

```{r}
# Totals over all duplication levels and labels, with the directional rates
# (SIHR_12, SIHR_21), the overall rate (SIHR) and the sample-1 read fraction.
summary_counts_marginal <-
  summary_counts %>%
  ungroup() %>%
  summarize_at(vars(starts_with("s")), sum) %>%
  mutate(
    SIHR_12 = s1_hopped / (s1_hopped + s1_nonhopped),
    SIHR_21 = s2_hopped / (s2_hopped + s2_nonhopped),
    # overall rate = 1 - share of non-hopped reads
    SIHR = 1 - (s1_nonhopped + s2_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped),
    frac_s1 = (s1_hopped + s1_nonhopped) /
      (s1_hopped + s1_nonhopped + s2_hopped + s2_nonhopped)
  )
summary_counts_marginal
```


```{r, fig.height=10}
# Hopping rates by duplication level r, with the marginal rates as dashed
# reference lines. The SIHR curves and reference lines are scaled by 100
# (percent); frac_s1 is drawn unscaled on the same 0-1 axis.
p1 <-
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r, y = SIHR_12 * 100, colour = "SIHR_12")) +
  geom_line(aes(x = r, y = SIHR_21 * 100, colour = "SIHR_21")) +
  geom_line(aes(x = r, y = frac_s1, colour = "frac_s1")) +
  # the marginal rates are selected by name rather than by column position
  # (5:7), so adding or reordering columns upstream cannot shift the selection
  geom_hline(
    yintercept =
      unlist(summary_counts_marginal[c("SIHR_12", "SIHR_21", "SIHR")]) * 100,
    linetype = "dashed"
  ) +
  xlim(0, 90) +
  ylim(0, 1)
p1
 
```


## Molecules

```{r}
# Molecule counts (presence/absence instead of read counts) aggregated by
# duplication level (r) and ground-truth label.
summary_mol_counts <-
  data %>%
  # r is computed from the read counts before they are reduced to 0/1;
  # the columns are addressed by name instead of by position (6, 7)
  mutate(r = as.integer(s1_plexed + s2_plexed)) %>%
  arrange(r) %>%
  # turn every sample count column into a 0/1 presence indicator
  mutate_at(vars(matches("^s")),
            list(~ as.integer(. != 0))) %>%
  group_by(r, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")),
               list(~ sum(.))) %>%
  # here sX_phantom is taken from the *other* sample's column: s1_phantom
  # counts s2_plexed molecules on rows whose label marks sample 2 as "f"
  mutate(s1_phantom = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0L),
         s2_phantom = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0L),
         s1_real = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0L),
         s2_real = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0L)) %>%
  select(-s1_plexed, -s2_plexed)
summary_mol_counts
```
```{r}
# Molecule totals over all duplication levels and labels, plus the derived
# phantom-molecule proportions.
summary_mol_counts_marginal <-
  summary_mol_counts %>%
  ungroup() %>%
  summarize_at(vars(starts_with("s")), sum) %>%
  mutate(
    # proportion of hopped phantom molecules
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    # proportion of phantom molecules
    ppm_1 = s2_phantom / (s2_phantom + s1_real),
    ppm_2 = s1_phantom / (s1_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_marginal
```


```{r}
# Molecule totals per duplication level r, plus the derived phantom-molecule
# proportions at that level.
summary_mol_counts_conditional <-
  summary_mol_counts %>%
  group_by(r) %>%
  summarize_at(vars(starts_with("s")), sum) %>%
  mutate(
    ppm_12 = s1_phantom / (s1_phantom + s1_real),
    ppm_21 = s2_phantom / (s2_phantom + s2_real),
    ppm = (s1_phantom + s2_phantom) /
      (s1_phantom + s1_real + s2_phantom + s2_real),
    frac_mol_s1 = (s1_phantom + s1_real) /
      (s1_phantom + s1_real + s2_phantom + s2_real)
  )
summary_mol_counts_conditional
```

```{r, fig.height=10}
# Proportion of phantom molecules by duplication level r, one line per direction.
p4 <- ggplot(summary_mol_counts_conditional) +
  geom_line(aes(x = r, y = ppm_12, colour = "ppm_12")) +
  # legend key fixed: it previously carried a stray leading space (" ppm_21"),
  # which also made it sort ahead of "ppm_12" in the legend
  geom_line(aes(x = r, y = ppm_21, colour = "ppm_21")) +
  #geom_hline(yintercept = unlist(summary_mol_counts_marginal[5:7]), linetype="dashed") +
  xlim(0, 300)
p4
#ggsave("phantom_molecules_validation.pdf", p4,  width =8, height = 5)
```



## Examine extent of contamination in cells

```{r}
# Per-cell-barcode tally of phantom and real molecules in each multiplexed
# sample, with a status derived from which of the four tallies are non-zero.
# Here sX_phantom is taken from sample X's own column (rows whose label marks
# sample X as "f").
summary_mol_counts_cell <-
  data %>%
  # drop rows without a label, whether parsed as NA or kept as the text "NA"
  filter(!is.na(label) & label != "NA") %>%
  # reads -> 0/1 molecule presence
  mutate_at(vars(starts_with("s")), ~ as.integer(. != 0)) %>%
  group_by(cell, label) %>%
  summarize_at(vars(matches("^s1_pl|^s2_pl")), sum) %>%
  mutate(
    s1_phantom = if_else(label %in% c("f,0", "f,r"), s1_plexed, 0L),
    s2_phantom = if_else(label %in% c("0,f", "r,f"), s2_plexed, 0L),
    s1_real = if_else(label %in% c("r,0", "r,f"), s1_plexed, 0L),
    s2_real = if_else(label %in% c("0,r", "f,r"), s2_plexed, 0L)
  ) %>%
  select(-s1_plexed, -s2_plexed) %>%
  group_by(cell) %>%
  summarize_at(vars(starts_with("s")), sum) %>%
  # label = "<s1_phantom>,<s2_phantom>,<s1_real>,<s2_real>" as 0/1 flags
  mutate_at(vars(starts_with("s")),
            list(nonempty = ~ as.integer(. != 0))) %>%
  unite(label, matches("nonempty"), sep = ",") %>%
  mutate(
    cell_status = case_when(
      label %in% c("0,0,0,1", "0,0,1,0", "0,0,1,1") ~ "real",
      label %in% c("1,0,0,0", "0,1,0,0", "1,1,0,0",
                   "1,0,0,1", "0,1,1,0") ~ "phantom",
      TRUE ~ "contaminated"
    )
  ) %>%
  mutate(
    s1_total = s1_phantom + s1_real,
    s2_total = s2_phantom + s2_real,
    s1_ppm = s1_phantom / s1_total,
    s2_ppm = s2_phantom / s2_total
  )

summary_mol_counts_cell
```
```{r}
# Number of cell barcodes per (cell_status, label) combination, sorted by count.
cell_status_tally <-
  summary_mol_counts_cell %>%
  group_by(cell_status,label) %>%
  tally(sort=TRUE)
cell_status_tally
```

```{r}
# Number of cell barcodes whose status is anything other than "real".
n_affected_cells <- 
  cell_status_tally %>%
  ungroup() %>%
  filter(cell_status!="real") %>% 
  summarise(n=sum(n)) %>% 
  pull(n)
# NOTE(review): 1502 and 6 are hard-coded and their origin is not visible here
# -- presumably counts copied by hand from an earlier tally; confirm and derive
# them from the data, otherwise they go stale whenever the input changes.
n_affected_cells <- n_affected_cells + 1502 +6
n_affected_cells
```


```{r}
# Total number of non-empty barcodes, counted separately per sample: a barcode
# with molecules in both samples contributes twice.
n_total_cells <-
  summary_mol_counts_cell %>%
  ungroup() %>%
  summarise(cells_total = sum(s1_total != 0) + sum(s2_total != 0)) %>%
  pull(cells_total)
n_total_cells
```

Proportion of affected cells

```{r}
# Ratio of n_affected_cells to n_total_cells.
p_affected_cells <-  n_affected_cells/n_total_cells
p_affected_cells
```

For each cell-barcode, plot the number of phantom molecules against the number of total molecules associated with it.

Sample 1 plot

```{r, fig.height=10}
# Sample 1: phantom vs. total molecules per barcode on log-log axes, limited to
# barcodes that are not "real" and carry at least one sample-1 phantom molecule.
p2 <-
  summary_mol_counts_cell %>%
  filter(cell_status != "real", s1_phantom > 0) %>%
  ggplot() +
  geom_point(aes(x = s1_total, y = s1_phantom, colour = cell_status)) +
  scale_x_log10() +
  scale_y_log10()

p2
# NOTE(review): the file name below is the same one used for p4; if re-enabled
# it would overwrite that figure.
#ggsave("phantom_molecules_validation.pdf", p2,  width =8, height = 5)
```

Sample 2 plot


```{r, fig.height=10}
# Sample 2: phantom vs. total molecules per barcode on log-log axes, limited to
# barcodes that are not "real" and carry at least one sample-2 phantom molecule.
p3 <-
  summary_mol_counts_cell %>%
  filter(cell_status != "real", s2_phantom > 0) %>%
  ggplot() +
  geom_point(aes(x = s2_total, y = s2_phantom, colour = cell_status)) +
  scale_x_log10() +
  scale_y_log10()

p3
#ggsave("phantom_cells_validation_s2.pdf", p3,  width =8, height = 5)
```

# Run workflow on multiplexed data

```{r}
# Input for the purging workflow: labelled rows only, without the non-multiplexed
# columns, and with the multiplexed count columns renamed to s1 / s2.
read_counts <-
  data %>%
  filter(label != "NA") %>%
  select(-ends_with("nonplexed")) %>%
  # rename by name rather than overwriting all names positionally, so that a
  # change in column order cannot silently mislabel columns
  rename(s1 = s1_plexed, s2 = s2_plexed)
read_counts
```


```{r}
# Number of multiplexed samples; their count columns are the S columns of
# read_counts starting at position 4.
S <- 2
sample_names <- names(read_counts)[seq(from = 4, length.out = S)]
sample_names
```



```{r}
# Step 2: build the outcome counts table from the read counts. The `label`
# column is dropped before the call. create_outcome_counts() is not defined in
# this file -- presumably it comes from the scripts sourced above.
tic("Step 2: creating outcome counts datatable with grouping vars")

outcome_counts <- create_outcome_counts(read_counts%>%
                                          select(-label), 
                                        sample_names,  
                                        min_frac=0.8)
toc()


outcome_counts
```

```{r}
# Step 3: estimate the hopping rate from the outcome counts for S samples.
# Later chunks read the result through fit_out$glm_estimates.
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S
    )
  toc()
fit_out 
```



```{r}
  # compute_molecular_complexity_profile
  # Step 4: summary statistics from the outcome counts, using the estimate
  # fit_out$glm_estimates$phat obtained in step 3.
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
summary_stats
```

Set the trade-off ratio cost cutoff (*torc*). The parameter *torc* represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of *torc* are expected to be within the range of 1-5.

```{r}
# Trade-off ratio cost cutoff passed to the reassignment step below.
torc <- 3 
```


```{r}
# Step 5: outcome_counts is replaced by the result of the reassignment step,
# then summary_stats is updated from it via get_threshold(). Both helpers are
# not defined in this file -- presumably they come from the sourced scripts.
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")

  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the trade-off ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)

  toc()
```



```{r}
# Attach the outcome-level results to every observation by joining on
# `outcome`, then drop the join key.
# NOTE(review): despite the timer label, nothing in this chunk purges rows or
# writes to disk -- it only performs the join.
# NOTE(review): `sample_names` is an external character vector used inside
# select(); recent tidyselect releases flag this as ambiguous and recommend
# all_of(sample_names) -- confirm against the installed version.
tic("Step 6: Purge and save read counts datatable to disk")

read_counts <-
  left_join(read_counts %>%
    select(outcome, cell, umi, gene, sample_names, label),
  outcome_counts,
  by = c("outcome")
  ) %>%
  select(-outcome)

toc()

```


### Compare the SIHR estimates with ground truth estimates

```{r, fig.width=10}
# Ground-truth hopping rates by duplication level r (lines "12" and "21")
# against two horizontal references: the marginal ground-truth SIHR
# ("true mean") and fit_out$glm_estimates$SIHR ("estimate"). The line range
# spans 1 - phat_high to 1 - phat_low at every r. Axis limits restrict the
# view to r in [1, 210] and rates in [0.002, 0.005].
p5 <- 
  ggplot(summary_counts_conditional) +
  geom_line(aes(x = r,
                  y = SIHR_12,
             colour="12"))+
  geom_line(aes(x = r,
                  y = SIHR_21,
             colour="21"))  +
geom_hline(aes(yintercept =  summary_counts_marginal$SIHR, 
               colour="true mean"), 
           linetype="solid",
           size=.5)   +
geom_hline(aes(yintercept =  fit_out$glm_estimates$SIHR, 
               colour="estimate"),
           linetype="solid",
           size=.1)   +
    geom_linerange(data=summary_counts_conditional, 
                  aes(x=r,
                      ymax=1-fit_out$glm_estimates$phat_low,
                      ymin=1-fit_out$glm_estimates$phat_high,
                      colour="estimate"), 
                  size=.5)+ 
  xlim(1,210) +
  ylim(0.002,0.005)
#ggsave("index_hopping_rate_200.pdf", p5, width=9, height=6)
p5
```



# Determine the number of false positives and false negatives

```{r}
# Order rows by decreasing qr and derive per-row 0/1 indicator columns.
# `t` and `f` hold a sample index (1 or 2) chosen from `label`; labels that
# are not listed leave them NA. Every indicator is an integer, and a
# comparison that evaluates to NA is counted as 0.
read_counts <- read_counts %>%
  ungroup() %>%
  arrange(-qr) %>%
  mutate(
    t = case_when(
      label %in% c("f,r", "0,r") ~ 2,
      label %in% c("r,f", "r,0") ~ 1
    ),
    f = case_when(
      label %in% c("f,r", "f,0") ~ 1,
      label %in% c("r,f", "0,f") ~ 2
    )
  ) %>%
  mutate(
    # indicators relative to the assignment stored in `s`
    tp = as.integer(coalesce(t == s, FALSE)),
    fp = as.integer(coalesce(f == s, FALSE)),
    tn = as.integer(coalesce(f != s, FALSE)),
    fn = as.integer(coalesce(t != s, FALSE)),
    # indicators relative to 0, i.e. predicting all molecules to be phantom
    tp0 = as.integer(coalesce(t == 0, FALSE)),
    fn0 = as.integer(coalesce(t != 0, FALSE)),
    tn0 = as.integer(coalesce(f != 0, FALSE)),
    fp0 = as.integer(coalesce(f == 0, FALSE)),
    # indicators relative to the assignment stored in `s_maxprop`
    tp_max = as.integer(coalesce(t == s_maxprop, FALSE)),
    fp_max = as.integer(coalesce(f == s_maxprop, FALSE)),
    tn_max = as.integer(coalesce(f != s_maxprop, FALSE)),
    fn_max = as.integer(coalesce(t != s_maxprop, FALSE))
  )
```


### The maximum read fraction method


```{r}
# Total each *_max indicator over all rows and report the totals under the
# shared tp / fp / tn / fn column names.
false_counts_maxprop <- read_counts %>%
  summarize(
    tp = sum(tp_max),
    fp = sum(fp_max),
    tn = sum(tn_max),
    fn = sum(fn_max)
  )
false_counts_maxprop
```

### The tor method


```{r}
# Total the tp / fp / tn / fn indicators over all rows.
false_counts_min_cutoff <- read_counts %>%
  summarize(
    tp = sum(tp),
    fp = sum(fp),
    tn = sum(tn),
    fn = sum(fn)
  )
false_counts_min_cutoff
```

### No purging

```{r}
# Baseline counts when nothing is purged. Rows with a non-missing positive
# `t` / `f` are tallied as real / phantom; tn and fn are fixed at 0.
false_counts_nopurging <- read_counts %>%
  summarise(
    n_cugs = n(),
    n_real = sum(t > 0, na.rm = TRUE),
    n_fantom = sum(f > 0, na.rm = TRUE),
    n_mol = n_real + n_fantom,
    # rows that are not counted in n_real
    g = n_cugs - n_real,
    # molecules in excess of the number of rows
    u = n_mol - n_cugs,
    tp = n_mol - g,
    fp = u + g,
    tn = 0,
    fn = 0
  )

false_counts_nopurging
```


###  TOR cutoff

```{r}
# Build cumulative confusion-matrix counts along the current row order
# (rows were sorted by decreasing qr when the indicators were created).
read_counts <-
  read_counts %>%
  # Step 1: add a <name>_cum column with the running total of each indicator.
  mutate_at(vars(c("tp", "fp", "tn", "fn","tp0", "fp0", "tn0", "fn0")),
            list(cum= ~ cumsum(.)))  %>%
  # Step 2: overwrite the four non-"0" *_cum columns in place with
  # (grand total - running total of the previous row), i.e. the sum from the
  # current row through the last row. The *0_cum columns stay as running totals.
  mutate_at(vars(c("tp_cum", "fp_cum", "tn_cum", "fn_cum")), 
            list( ~ (last(.)-lag(., default =0)))) %>%
  # Step 3: combine the tail sums with the head running totals of the "0"
  # indicators; fpm / fnm are differences relative to the first row and
  # tor_true is their ratio (0/0 on the first row yields NaN).
  # NOTE(review): the current row contributes to both the tail sum and the
  # head running total -- confirm this overlap is intended.
  mutate(tp_t=tp_cum + tp0_cum,
         fp_t=fp_cum + fp0_cum,
         tn_t=tn_cum + tn0_cum,
         fn_t=fn_cum + fn0_cum,
         fpm= first(fp_t)- fp_t, 
         fnm= fn_t-first(fn_t),
         tor_true= fnm/fpm)
```



```{r}
# Take the first row flagged `retain` in the current row order and keep the
# cutoff-related columns together with the cumulative counts at that row.
false_counts_tor_cutoff <- read_counts %>%
  filter(retain) %>%
  slice(1) %>%
  select(s1, s2, qr, tor, tp_t, fp_t, tn_t, fn_t, fpm, fnm, tor_true)
false_counts_tor_cutoff
```
## Create comparison datatable

```{r}
# Stack the tp / fp / tn / fn totals of the four approaches, labelling each row
# with its approach name, then add rates relative to the no-purging phantom
# and real counts.
false_counts_dt <-
  bind_rows(
    no_purging = select(false_counts_nopurging, tp, fp, tn, fn),
    no_cutoff = false_counts_min_cutoff,
    tor_cutoff = select(
      false_counts_tor_cutoff,
      tp = tp_t, fp = fp_t, tn = tn_t, fn = fn_t
    ),
    max_frac = false_counts_maxprop,
    .id = "approach"
  ) %>%
  select(approach, fp, fn, tp, tn) %>%
  mutate(
    fpr = fp / false_counts_nopurging$n_fantom,
    fnr = fn / false_counts_nopurging$n_real
  )
false_counts_dt
```



## Plots

### Datatable for plotting


```{r}
# Keep the first row of every distinct qr value, retain the columns needed for
# plotting, and add rates relative to the no-purging phantom and real counts.
classification_curves <- read_counts %>%
  group_by(qr) %>%
  slice(1L) %>%
  ungroup() %>%
  select(
    qr, qs, tor, retain,
    fp_t, FP, fn_t, FN,
    tp_t, tn_t, TP, TN,
    FPm, FNm, fpm, fnm,
    tor_true, o, r
  ) %>%
  mutate(
    fpr = fp_t / false_counts_nopurging$n_fantom,
    fnr = fn_t / false_counts_nopurging$n_real
  )
classification_curves
```

### Performance Plots



```{r fig.height=7, fig.width=10, message=FALSE, warning=FALSE}
# Scatter FNm against FPm and overlay reference lines y = k * FPm for
# k = 1, 2, 3, 4, 5, 9 (legend "TORC"); the y axis is on a log10 scale.
p_tradeoff <-
  ggplot(classification_curves, aes(x = FPm)) +
  geom_point(aes(y = FNm), size = .5) +
  geom_line(aes(y = FPm, colour = "1")) +
  geom_line(aes(y = 2 * FPm, colour = "2")) +
  geom_line(aes(y = 3 * FPm, colour = "3")) +
  geom_line(aes(y = 4 * FPm, colour = "4")) +
  geom_line(aes(y = 5 * FPm, colour = "5")) +
  geom_line(aes(y = 9 * FPm, colour = "9")) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.title = element_text(face = "bold")) +
  scale_color_discrete(name = "TORC") +
  labs(
    x = "Marginal Decrease in False Positives (reduce phantom molecs) ",
    y = "Marginal Increase in False Negatives (discard real molecs)"
  )

#ggsave(file.path(figures_dir, "validation_tradeoff.pdf"), p_tradeoff, width=9, height=6)

p_tradeoff
```

```{r fig.height=6}
# Overlay the (fp_t, fn_t) curve (legend "true") and the (FP, FN) curve
# (legend "predicted"), then mark the per-approach totals from
# false_counts_dt. Both axes use a square-root scale.
p6 <-
  ggplot(classification_curves) +
  geom_point(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = fp_t, y = fn_t, colour = "true")) +
  geom_line(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(aes(x = FP, y = FN, colour = "predicted")) +
  geom_point(
    data = false_counts_dt,
    aes(x = fp, y = fn, shape = approach),
    size = 2
  ) +
  labs(x = "False Positive Count", y = "False Negative Count") +
  scale_y_sqrt() +
  scale_x_sqrt()
#ggsave("peformance_groundtruth.pdf", p6, width=9, height=6)
p6
```



```{r fig.height=8}
# Scatter fn against fp for the three purging approaches only (the
# no_purging row is left out), coloured by approach.
p7 <- false_counts_dt %>%
  filter(approach %in% c("no_cutoff", "tor_cutoff", "max_frac")) %>%
  ggplot(aes(x = fp, y = fn, color = approach)) +
  geom_point(size = 2)


#ggsave("peformance_zoom.pdf", p7, width=9, height=6)
p7
```
```{r}
# Print the R version, platform and attached/loaded packages so the run can be
# reproduced.
sessionInfo()
```

