knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l2
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data", 
                      dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
                                  sprintf("%s_read_counts.rds", 
                                          dataset_name))
results_filepath <- file.path(output_dir, 
                              sprintf("%s_results.rds", 
                                      dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l2/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l2/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3 
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
                           output_dir,
                           read_counts_filepath = NULL,
                           torc = 3,
                           max_r = NULL) {
  tic("Running workflow I")
  tic("Step 1: loading molecule_info files and creating read counts datatable")
  read_counts <- create_joined_counts(input_dir, read_counts_filepath)
  toc()
  sample_names <-
    setdiff(
      colnames(read_counts),
      c("cell", "umi", "gene", "outcome")
    )
  S <- length(sample_names)
  tic("Step 2: creating outcome counts datatable with grouping vars")
  outcome_counts <- create_outcome_counts(read_counts, sample_names)
  toc()
  tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S,
      max_r = max_r
    )
  toc()
  # compute_molecular_complexity_profile
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
  tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the tradoff ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)
  toc()
  tic("Step 6: Purge and save read counts datatable to disk")
  read_counts <-
    left_join(read_counts %>%
      select(outcome, cell, umi, gene, sample_names),
    outcome_counts %>%
      select(c("outcome", "retain", paste0(sample_names, "_hat"))),
    by = c("outcome")
    ) %>%
    select(-outcome)
  purge_and_save_read_counts(
    read_counts,
    dataset_name,
    sample_names,
    output_dir
  )
  toc()
  tic("Step 7: create umi counts matrices")
  umi_counts_cell_gene <-
    create_umi_counts(
      read_counts,
      sample_names
    )
  toc()
  outcome_counts <-
    outcome_counts %>%
    arrange(-qr) %>%
    select(-c(paste0(sample_names, "_hat")))
  read_counts <-
    read_counts %>%
    select(-c("retain", paste0(sample_names, "_hat")))
  summary_stats$sample_names <- sample_names
  data_list <-
    list(
      umi_counts_cell_gene = umi_counts_cell_gene,
      read_counts = read_counts,
      outcome_counts = outcome_counts,
      fit_out = fit_out,
      summary_stats = summary_stats
    )
  toc()
  return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
  tic("Running workflow II")
  tic("Step 7: identify RNA-containing cells")
  called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
  toc()
  tic("Step 9: tallying molecules by cell-barcode")
  umi_counts_cell <- map2(
    called_cells_out$called_cells,
    data_list$umi_counts_cell_gene,
    get_umi_counts_cell
  )
  umi_counts_sample <-
    map(umi_counts_cell,
      map_dfr,
      get_umi_counts_sample,
      .id = "split"
    ) %>%
    bind_rows(.id = "sample")
  data_list$summary_stats <-
    update_summary_stats(
      data_list$summary_stats,
      umi_counts_sample
    )
  toc()
  data_list <-
    c(
      data_list,
      list(
        umi_counts_cell = umi_counts_cell,
        called_cells_tally = called_cells_out$called_cells_tally
      )
    )
  toc()
  return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 499.752 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 61.48 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.18 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 0.49 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 6.194 sec elapsed
Step 6: Purge and save read counts datatable to disk: 724.377 sec elapsed
Step 7: create umi counts matrices: 762.945 sec elapsed
Running workflow I: 2055.481 sec elapsed
data_list$read_counts 
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts 
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
 data_list$summary_stats$summary_estimates
 data_list$summary_stats$conditional
 data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_molcomplexity.pdf")),
       p_read,
       width=9,
       height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fit.pdf")),
       p_fit,
       width=9,
       height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fp_performance.pdf")),
       p_fp,
       width=9,
       height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_tradeoff.pdf")),
       p_tradeoff,
       width=9,
       height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Step 7: identify RNA-containing cells: 1989.231 sec elapsed
Step 9: tallying molecules by cell-barcode: 322.964 sec elapsed
Running workflow II: 2312.195 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$P7_0
# A tibble: 1,160 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   487     49   433       5
 2 AGTGAGGTCTGCCCTA   867    433   430       4
 3 GAACATCTCCTTGGTC   445     34   409       2
 4 CTAACTTAGTTGTAGA   420     27   391       2
 5 CGGGTCATCCCAAGTA   425     33   388       4
 6 CATGCCTTCAATACCG   782    409   373       0
 7 AGACGTTGTCCGAGTC   405     35   370       0
 8 CTCGAGGGTTTCGCTC   378     27   351       0
 9 GAATAAGCATATGGTC   395     48   337      10
10 TGAGAGGCAACACGCC   351     44   307       0
# … with 1,150 more rows
$P7_1
# A tibble: 835 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   595     51   540       4
 2 AGACGTTGTCCGAGTC   482     36   445       1
 3 CGGGTCATCCCAAGTA   493     45   444       4
 4 CTAACTTAGTTGTAGA   499     53   444       2
 5 GAATAAGCATATGGTC   472     55   417       0
 6 TGTTCCGAGGCTAGAC   483     67   416       0
 7 GTGGGTCCAAACTGTC   464     79   385       0
 8 GAACGGAGTTGCGCAC   425     46   379       0
 9 TAAACCGTCTCTGTCG   413     36   374       3
10 GTGCTTCGTCCCTACT   446     87   358       1
# … with 825 more rows
$P7_10
# A tibble: 2,464 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   286     21   265       0
 2 AGTGAGGTCTGCCCTA   279     24   253       2
 3 CTAACTTAGTTGTAGA   272     20   251       1
 4 CGGGTCATCCCAAGTA   263     35   227       1
 5 AGACGTTGTCCGAGTC   251     29   222       0
 6 GAACATCTCCTTGGTC   234     13   221       0
 7 CTCGAGGGTTTCGCTC   203     19   183       1
 8 CATGCCTTCAATACCG   222     33   179      10
 9 TGTTCCGAGGCTAGAC   201     19   179       3
10 GTGCTTCGTCCCTACT   225     51   173       1
# … with 2,454 more rows
$P7_11
# A tibble: 3,514 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   387     36   347       4
 2 GAACATCTCCTTGGTC   351     23   327       1
 3 CGGGTCATCCCAAGTA   382     51   325       6
 4 AGTGAGGTCTGCCCTA   368     57   310       1
 5 AGACGTTGTCCGAGTC   309     25   284       0
 6 TAAACCGTCTCTGTCG   296     19   272       5
 7 AAATGCCTCCCAAGTA   294     25   262       7
 8 TGTTCCGAGGCTAGAC   296     31   260       5
 9 CTCGAGGGTTTCGCTC   294     31   259       4
10 TAAGAGATCTTGGGTA   279     26   241      12
# … with 3,504 more rows
$P7_12
# A tibble: 3,062 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   384     39   339       6
 2 CTAACTTAGTTGTAGA   336     28   303       5
 3 AGTGAGGTCTGCCCTA   318     31   280       7
 4 AGACGTTGTCCGAGTC   306     27   278       1
 5 GAACATCTCCTTGGTC   313     40   273       0
 6 CTCGAGGGTTTCGCTC   281     31   248       2
 7 GAATAAGCATATGGTC   295     30   247      18
 8 AAATGCCTCCCAAGTA   264     31   230       3
 9 GGACATTTCGTAGGAG   253     23   226       4
10 TGTTCCGAGGCTAGAC   249     21   226       2
# … with 3,052 more rows
$P7_13
# A tibble: 3,416 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CGGGTCATCCCAAGTA   343     45   296       2
 2 AGTGAGGTCTGCCCTA   325     31   292       2
 3 CTAACTTAGTTGTAGA   317     33   283       1
 4 GAACATCTCCTTGGTC   305     22   282       1
 5 AGGGATGGTCTAACGT   307     33   271       3
 6 AGACGTTGTCCGAGTC   293     31   261       1
 7 GAATAAGCATATGGTC   287     36   241      10
 8 CTCGAGGGTTTCGCTC   264     23   240       1
 9 TAAACCGTCTCTGTCG   235     18   215       2
10 GGACATTTCGTAGGAG   249     32   213       4
# … with 3,406 more rows
$P7_14
# A tibble: 2,630 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   380     36   341       3
 2 CTAACTTAGTTGTAGA   307     17   288       2
 3 AGACGTTGTCCGAGTC   319     35   283       1
 4 AGTGAGGTCTGCCCTA   307     32   271       4
 5 GAACATCTCCTTGGTC   284     33   251       0
 6 CTCGAGGGTTTCGCTC   265     22   242       1
 7 TAAACCGTCTCTGTCG   260     18   241       1
 8 GAACATCTCAGAGACG   276     38   238       0
 9 GTGCTTCGTCCCTACT   299     61   234       4
10 GATCTAGTCGAGAACG   247     21   225       1
# … with 2,620 more rows
$P7_15
# A tibble: 2,765 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   290     26   261       3
 2 CTAACTTAGTTGTAGA   248     18   227       3
 3 CGGGTCATCCCAAGTA   242     24   217       1
 4 AGACGTTGTCCGAGTC   229     17   211       1
 5 AGTGAGGTCTGCCCTA   226     22   204       0
 6 GAACATCTCCTTGGTC   220     19   201       0
 7 CTCGAGGGTTTCGCTC   206     16   189       1
 8 GTGCTTCGTCCCTACT   211     43   167       1
 9 CTCGTCATCAGAGGTG  5116   5030    78       8
10 CATATGGGTTGCGTTA  1665   1597    66       2
# … with 2,755 more rows
$P7_2
# A tibble: 4,058 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   341     44   297       0
 2 AGTGAGGTCTGCCCTA   308     29   275       4
 3 CTAACTTAGTTGTAGA   293     26   267       0
 4 GAACATCTCCTTGGTC   280     20   259       1
 5 CGGGTCATCCCAAGTA   287     35   251       1
 6 CTCGAGGGTTTCGCTC   261     29   232       0
 7 AGACGTTGTCCGAGTC   262     41   221       0
 8 TAAACCGTCTCTGTCG   239     26   213       0
 9 TGTTCCGAGGCTAGAC   241     30   208       3
10 AAATGCCTCCCAAGTA   232     28   204       0
# … with 4,048 more rows
$P7_3
# A tibble: 4,006 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   373     82   290       1
 2 CTAACTTAGTTGTAGA   311     43   267       1
 3 AGTGAGGTCTGCCCTA   292     32   258       2
 4 AGACGTTGTCCGAGTC   273     26   247       0
 5 TGTTCCGAGGCTAGAC   278     34   244       0
 6 CGGGTCATCCCAAGTA   281     41   239       1
 7 CTCGAGGGTTTCGCTC   266     25   238       3
 8 GAACATCTCCTTGGTC   269     28   237       4
 9 AAATGCCTCCCAAGTA   252     27   220       5
10 TAAACCGTCTCTGTCG   246     27   218       1
# … with 3,996 more rows
$P7_4
# A tibble: 999 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   727    147   578       2
 2 GTGCTTCGTCCCTACT   583     75   506       2
 3 AGGGATGGTCTAACGT   402     32   368       2
 4 TGGGAAGCATTCGACA   343     17   326       0
 5 AGGCCGTGTGCGATAG   339     18   321       0
 6 CGGGTCATCCCAAGTA   326     29   294       3
 7 CTAACTTAGTTGTAGA   304     17   287       0
 8 AGTGAGGTCTGCCCTA   302     25   277       0
 9 AGACGTTGTCCGAGTC   306     28   276       2
10 CAGATCAAGACAGAGA   298     29   269       0
# … with 989 more rows
$P7_5
# A tibble: 2,041 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   307     34   272       1
 2 AGACGTTGTCCGAGTC   276     29   247       0
 3 AGTGAGGTCTGCCCTA 12731  12472   219      40
 4 CTAACTTAGTTGTAGA   242     26   215       1
 5 CGGGTCATCCCAAGTA   234     27   207       0
 6 GAACATCTCCTTGGTC   224     23   201       0
 7 CTCGAGGGTTTCGCTC   219     19   200       0
 8 AAATGCCTCCCAAGTA   217     18   196       3
 9 CATGCCTTCAATACCG   221     29   192       0
10 TGTTCCGAGGCTAGAC   210     19   190       1
# … with 2,031 more rows
$P7_6
# A tibble: 7,280 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGTGAGGTCTGCCCTA   260     23   234       3
 2 AGGGATGGTCTAACGT   243     27   215       1
 3 CTAACTTAGTTGTAGA   234     19   214       1
 4 GAACATCTCCTTGGTC   227     18   208       1
 5 CTCGAGGGTTTCGCTC   217     18   198       1
 6 CGGGTCATCCCAAGTA   220     24   195       1
 7 AAATGCCTCCCAAGTA   201     21   177       3
 8 GTGCTTCGTCCCTACT   212     48   157       7
 9 TAAGAGATCTTGGGTA  5407   5268   116      23
10 CTAACTTCATCGATGT  3752   3619   103      30
# … with 7,270 more rows
$P7_7
# A tibble: 743 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   285     29   256       0
 2 GAACATCTCCTTGGTC   235      9   225       1
 3 AGTGAGGTCTGCCCTA   237     20   216       1
 4 CTCGAGGGTTTCGCTC   241     23   216       2
 5 CTAACTTAGTTGTAGA   232     22   208       2
 6 AGACGTTGTCCGAGTC   222     19   202       1
 7 TGTTCCGAGGCTAGAC   208     15   193       0
 8 GGACATTTCGTAGGAG   213     18   189       6
 9 CGGGTCATCCCAAGTA   211     23   186       2
10 GAACATCTCAGAGACG   201     27   174       0
# … with 733 more rows
$P7_8
# A tibble: 1,209 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGTGAGGTCTGCCCTA   495     38   457       0
 2 GAACATCTCCTTGGTC   425     29   396       0
 3 GAACATCTCAGAGACG   443     50   393       0
 4 GAATAAGCATATGGTC   421     40   381       0
 5 CTCGAGGGTTTCGCTC   408     29   379       0
 6 TGTTCCGAGGCTAGAC   400     28   372       0
 7 TAAGAGATCTTGGGTA   403     39   364       0
 8 GAACGGAGTTGCGCAC   398     38   360       0
 9 CATGCCTTCAATACCG   398     43   355       0
10 GGCTGGTGTGTCGCTG   365     27   338       0
# … with 1,199 more rows
$P7_9
# A tibble: 2,400 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   446     39   406       1
 2 CTAACTTAGTTGTAGA   420     37   381       2
 3 AGTGAGGTCTGCCCTA   411     31   378       2
 4 AGACGTTGTCCGAGTC   375     24   348       3
 5 CGGGTCATCCCAAGTA   380     35   343       2
 6 GAACATCTCCTTGGTC   365     25   337       3
 7 GAATAAGCATATGGTC   372     38   334       0
 8 CTCGAGGGTTTCGCTC   346     27   318       1
 9 CATGCCTTCAATACCG   340     36   304       0
10 GGCTGGTGTGTCGCTG   322     17   303       2
# … with 2,390 more rows
data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$P7_0
# A tibble: 309,061 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   429     21   403       5
 2 GCTGCTTGTCCGAAGA   555    220   317      18
 3 AAATGCCTCCCAAGTA   370     45   315      10
 4 GCAAACTAGCTTATCG   252     20   228       4
 5 AACTCAGTCTAACGGT   249     24   222       3
 6 TTAACTCCAATGGTCT   234     15   216       3
 7 CTTAACTCACTACAGT   220     18   201       1
 8 ACGCCGAGTCTACCTC   216     23   191       2
 9 ACACCAACATAAGACA   208     18   189       1
10 ACATACGGTAATCGTC   210     20   189       1
# … with 309,051 more rows
$P7_1
# A tibble: 306,030 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   580    162   418       0
 2 GAACATCTCAGAGACG   439     63   376       0
 3 CAGATCAAGACAGAGA   261     60   201       0
 4 CCTAGCTGTTCTGGTA   199     11   186       2
 5 TTAGTTCGTTCAGCGC   198     11   186       1
 6 GTTTCTACAGTAAGAT   194      7   185       2
 7 CTCTAATAGGACATTA   197     14   183       0
 8 TCTTTCCCAGACAGGT   195     10   183       2
 9 ATCATCTAGTTACGGG   193     11   182       0
10 TAGACCACAAACGCGA   197     15   182       0
# … with 306,020 more rows
$P7_10
# A tibble: 418,600 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAATAAGCATATGGTC   266     28   230       8
 2 CACACCTTCAACGCTA   224     16   207       1
 3 GCTGCTTGTCCGAAGA   260     71   186       3
 4 GAACGGAGTTGCGCAC   215     28   178       9
 5 CGGACACAGCGCTTAT   186     14   172       0
 6 TCAGGTACATAACCTG   186     15   171       0
 7 TGAGAGGCAACACGCC   180     15   165       0
 8 TAAACCGTCTCTGTCG   182     19   163       0
 9 AAATGCCTCCCAAGTA   190     29   159       2
10 ACTTACTCAGCTCGAC   176     17   159       0
# … with 418,590 more rows
$P7_11
# A tibble: 325,495 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTAACTTAGTTGTAGA   345     23   321       1
 2 CACACCTTCAACGCTA   317     18   297       2
 3 GCTGCTTGTCCGAAGA   374     90   273      11
 4 ACTTACTCAGCTCGAC   279     17   262       0
 5 GAACGGAGTTGCGCAC   304     37   258       9
 6 GAATAAGCATATGGTC   294     44   242       8
 7 CGGACACAGCGCTTAT   257     26   230       1
 8 CATCCACAGATGGCGT   247     29   215       3
 9 AAACGGGAGGATATAC   229     16   210       3
10 GTGCTTCGTCCCTACT   276     60   208       8
# … with 325,485 more rows
$P7_12
# A tibble: 352,841 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   314     30   278       6
 2 CGGGTCATCCCAAGTA   298     31   265       2
 3 GCTGCTTGTCCGAAGA   347     98   238      11
 4 GAACGGAGTTGCGCAC   264     30   223      11
 5 CATCCACAGATGGCGT   242     20   221       1
 6 CGGACACAGCGCTTAT   222     10   211       1
 7 GGATGTTAGGGAACGG   223     20   200       3
 8 AAACGGGAGGATATAC   215     16   198       1
 9 GCGGGTTTCAGGATCT   199     15   183       1
10 GTATTCTTCACCATAG   190      7   183       0
# … with 352,831 more rows
$P7_13
# A tibble: 360,836 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   251     20   226       5
 2 GAACGGAGTTGCGCAC   261     37   209      15
 3 CATCCACAGATGGCGT   243     31   208       4
 4 GCTGCTTGTCCGAAGA   302     90   205       7
 5 ACTTACTCAGCTCGAC   222     27   194       1
 6 AAACGGGAGGATATAC   205     13   191       1
 7 GGATGTTAGGGAACGG   216     25   188       3
 8 GCTGCGACAGTAAGCG   200     19   181       0
 9 CCTACCAAGGTAAACT   195     14   178       3
10 CCTACCACATCGGTTA   207     29   177       1
# … with 360,826 more rows
$P7_14
# A tibble: 310,691 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   314     17   295       2
 2 CGGGTCATCCCAAGTA   292     26   262       4
 3 GAATAAGCATATGGTC   281     35   245       1
 4 GCTGCTTGTCCGAAGA   331     82   237      12
 5 GAACGGAGTTGCGCAC   252     36   216       0
 6 CGGACACAGCGCTTAT   209     18   191       0
 7 AAACGGGAGGATATAC   216     24   190       2
 8 GCTGCGACAGTAAGCG   200     17   183       0
 9 CGACTTCAGACCTAGG   199     16   181       2
10 GTAGGCCAGCCGATTT   203     22   181       0
# … with 310,681 more rows
$P7_15
# A tibble: 298,328 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   197     17   178       2
 2 GCTGCGACAGTAAGCG   179      8   171       0
 3 TAAACCGTCTCTGTCG   189     18   170       1
 4 GAACATCTCAGAGACG   197     25   169       3
 5 GATCTAGTCGAGAACG   188     19   167       2
 6 ACTTACTCAGCTCGAC   179     13   166       0
 7 TCTATTGCATAAAGGT   175      9   166       0
 8 TGAGAGGCAACACGCC   178     12   166       0
 9 GAATAAGCATATGGTC   193     23   165       5
10 GCAAACTTCGACAGCC   180     15   164       1
# … with 298,318 more rows
$P7_2
# A tibble: 361,126 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   270     23   239       8
 2 GCTGCTTGTCCGAAGA   309     75   228       6
 3 GAATAAGCATATGGTC   251     31   206      14
 4 ACTTACTCAGCTCGAC   246     41   205       0
 5 CATCCACAGATGGCGT   237     37   200       0
 6 GAACGGAGTTGCGCAC   232     26   197       9
 7 GCTGCGACAGTAAGCG   195     16   179       0
 8 CGACTTCAGACCTAGG   193     18   174       1
 9 CGGACACAGCGCTTAT   196     22   172       2
10 GCGCCAAAGGCTATCT   190     18   172       0
# … with 361,116 more rows
$P7_3
# A tibble: 381,119 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   325     81   233      11
 2 CACACCTTCAACGCTA   255     19   231       5
 3 GAATAAGCATATGGTC   264     33   221      10
 4 GAACGGAGTTGCGCAC   247     34   205       8
 5 CATCCACAGATGGCGT   219     22   196       1
 6 CGGACACAGCGCTTAT   217     22   195       0
 7 TCAGGTACATAACCTG   195     20   175       0
 8 GCTGCGACAGTAAGCG   188     17   171       0
 9 CGACTTCAGACCTAGG   193     22   170       1
10 GATCTAGTCGAGAACG   210     39   170       1
# … with 381,109 more rows
$P7_4
# A tibble: 315,858 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   303     18   285       0
 2 CGTCCATGTGTTCGAT   199     10   189       0
 3 AGATTGCAGAGCAATT   200     16   184       0
 4 CGATGTACATATGCTG   199     14   184       1
 5 AAATGCCGTGAACCTT   199     15   182       2
 6 AACGTTGTCAACACAC   195     14   181       0
 7 GCTGCGACAGTAAGCG   194     12   180       2
 8 TAAGCGTTCAGAGACG   194     14   180       0
 9 TCGAGGCTCCCTCTTT   193     10   180       3
10 ACACCAACATAAGACA   198     20   178       0
# … with 315,848 more rows
$P7_5
# A tibble: 364,537 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   288     48   239       1
 2 TAAACCGTCTCTGTCG   190     16   174       0
 3 GAACATCTCAGAGACG   199     31   166       2
 4 ACTTACTCAGCTCGAC   178     13   165       0
 5 GGCTGGTGTGTCGCTG   178     15   161       2
 6 TAAACCGCATGTAGTC   181     21   158       2
 7 GTGGGTCCAAACTGTC   190     34   156       0
 8 GCTGCGACAGTAAGCG   166     11   155       0
 9 CCGTACTGTCAGATAA   175     19   154       2
10 CATCCACAGATGGCGT   164     10   151       3
# … with 364,527 more rows
$P7_6
# A tibble: 351,694 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   327     86   231      10
 2 CACACCTTCAACGCTA   224     22   196       6
 3 TAAACCGTCTCTGTCG   188     15   173       0
 4 TGTTCCGAGGCTAGAC   200     23   173       4
 5 GGACATTTCGTAGGAG   180      9   168       3
 6 ACTTACTCAGCTCGAC   183     17   165       1
 7 AGACGTTGTCCGAGTC   185     21   164       0
 8 GGCTGGTGTGTCGCTG   172      9   163       0
 9 CGGACACAGCGCTTAT   199     42   157       0
10 CGACTTCAGACCTAGG   169     14   154       1
# … with 351,684 more rows
$P7_7
# A tibble: 275,155 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   238     16   215       7
 2 GCTGCTTGTCCGAAGA   272     65   200       7
 3 ACTTACTCAGCTCGAC   206     14   191       1
 4 GTGCTTCGTCCCTACT   245     50   187       8
 5 TGAGAGGCAACACGCC   194      6   186       2
 6 CATCCACAGATGGCGT   197     14   182       1
 7 TAAACCGTCTCTGTCG   194     13   180       1
 8 TCTATTGCATAAAGGT   195     15   179       1
 9 CATGCCTTCAATACCG   199     29   170       0
10 TAAACCGCATGTAGTC   197     26   167       4
# … with 275,145 more rows
$P7_8
# A tibble: 272,056 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   478     33   445       0
 2 AAATGCCTCCCAAGTA   418     48   370       0
 3 GCTGCTTGTCCGAAGA   474    152   322       0
 4 ACGCCGAGTCTACCTC   253     19   234       0
 5 ACACCAACATAAGACA   251     22   229       0
 6 GCAAACTAGCTTATCG   247     23   224       0
 7 TTAACTCCAATGGTCT   249     29   220       0
 8 GATCGTATCGGCGCAT   249     43   206       0
 9 ACGCCGAAGATCCGAG   203     12   191       0
10 ATTCTACCAGGACGTA   199      9   190       0
# … with 272,046 more rows
$P7_9
# A tibble: 316,062 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   397     32   362       3
 2 ACTTACTCAGCTCGAC   321     26   295       0
 3 GTAGGCCAGCCGATTT   284     33   251       0
 4 CATCCACAGATGGCGT   271     22   248       1
 5 AAATGCCTCCCAAGTA   275     29   244       2
 6 CGGACACAGCGCTTAT   274     28   244       2
 7 CGGAGTCCATGAGCGA   248     12   236       0
 8 AAACGGGAGGATATAC   269     43   225       1
 9 CGATGTACATATGCTG   245     22   223       0
10 GCAAACTTCGACAGCC   243     18   223       2
# … with 316,052 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 3582.473 sec elapsed
# memory usage
gc()
             used    (Mb)  gc trigger     (Mb)    max used     (Mb)
Ncells    6381580   340.9     9589840    512.2     9589840    512.2
Vcells 8960661518 68364.5 31123741425 237455.4 32387901149 247100.1
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] cowplot_0.9.4      data.table_1.12.2  tictoc_1.0        
 [4] furrr_0.1.0        future_1.13.0      broom_0.5.2       
 [7] matrixStats_0.54.0 forcats_0.4.0      stringr_1.4.0     
[10] dplyr_0.8.1        purrr_0.3.2        readr_1.3.1       
[13] tidyr_0.8.3        tibble_2.1.2       ggplot2_3.1.1     
[16] tidyverse_1.2.1    rhdf5_2.26.2       rmarkdown_1.13    
loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.4            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.2             
[13] BiocGenerics_0.28.0         colorspace_1.4-1           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.1.0                  
[19] rvest_0.3.4                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.19               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.4                 readxl_1.3.1               
[33] rstudioapi_0.10             generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.1                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.4.3              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] zeallot_0.1.0               knitr_1.23                 
[63] pillar_1.4.1                GenomicRanges_1.34.0       
[65] codetools_0.2-16            stats4_3.5.2               
[67] glue_1.3.1                  evaluate_0.14              
[69] modelr_0.1.4                vctrs_0.1.0                
[71] cellranger_1.1.0            gtable_0.3.0               
[73] assertthat_0.2.1            xfun_0.7                   
[75] DropletUtils_1.2.2          viridisLite_0.3.0          
[77] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[79] globals_0.12.4