knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l1
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data", 
                      dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
                                  sprintf("%s_read_counts.rds", 
                                          dataset_name))
results_filepath <- file.path(output_dir, 
                              sprintf("%s_results.rds", 
                                      dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l1/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l1/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3 
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
                           output_dir,
                           read_counts_filepath = NULL,
                           torc = 3,
                           max_r = NULL) {
  tic("Running workflow I")
  tic("Step 1: loading molecule_info files and creating read counts datatable")
  read_counts <- create_joined_counts(input_dir, read_counts_filepath)
  toc()
  sample_names <-
    setdiff(
      colnames(read_counts),
      c("cell", "umi", "gene", "outcome")
    )
  S <- length(sample_names)
  tic("Step 2: creating outcome counts datatable with grouping vars")
  outcome_counts <- create_outcome_counts(read_counts, sample_names)
  toc()
  tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S,
      max_r = max_r
    )
  toc()
  # compute_molecular_complexity_profile
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
  tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the tradoff ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)
  toc()
  tic("Step 6: Purge and save read counts datatable to disk")
  read_counts <-
    left_join(read_counts %>%
      select(outcome, cell, umi, gene, sample_names),
    outcome_counts %>%
      select(c("outcome", "retain", paste0(sample_names, "_hat"))),
    by = c("outcome")
    ) %>%
    select(-outcome)
  purge_and_save_read_counts(
    read_counts,
    dataset_name,
    sample_names,
    output_dir
  )
  toc()
  tic("Step 7: create umi counts matrices")
  umi_counts_cell_gene <-
    create_umi_counts(
      read_counts,
      sample_names
    )
  toc()
  outcome_counts <-
    outcome_counts %>%
    arrange(-qr) %>%
    select(-c(paste0(sample_names, "_hat")))
  read_counts <-
    read_counts %>%
    select(-c("retain", paste0(sample_names, "_hat")))
  summary_stats$sample_names <- sample_names
  data_list <-
    list(
      umi_counts_cell_gene = umi_counts_cell_gene,
      read_counts = read_counts,
      outcome_counts = outcome_counts,
      fit_out = fit_out,
      summary_stats = summary_stats
    )
  toc()
  return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
  tic("Running workflow II")
  tic("Step 7: identify RNA-containing cells")
  called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
  toc()
  tic("Step 9: tallying molecules by cell-barcode")
  umi_counts_cell <- map2(
    called_cells_out$called_cells,
    data_list$umi_counts_cell_gene,
    get_umi_counts_cell
  )
  umi_counts_sample <-
    map(umi_counts_cell,
      map_dfr,
      get_umi_counts_sample,
      .id = "split"
    ) %>%
    bind_rows(.id = "sample")
  data_list$summary_stats <-
    update_summary_stats(
      data_list$summary_stats,
      umi_counts_sample
    )
  toc()
  data_list <-
    c(
      data_list,
      list(
        umi_counts_cell = umi_counts_cell,
        called_cells_tally = called_cells_out$called_cells_tally
      )
    )
  toc()
  return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 498.264 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 60.014 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.138 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 0.522 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 6.362 sec elapsed
Step 6: Purge and save read counts datatable to disk: 722.83 sec elapsed
Step 7: create umi counts matrices: 765.427 sec elapsed
Running workflow I: 2053.622 sec elapsed
data_list$read_counts 
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts 
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
 data_list$summary_stats$summary_estimates
 data_list$summary_stats$conditional
 data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_molcomplexity.pdf")),
       p_read,
       width=9,
       height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fit.pdf")),
       p_fit,
       width=9,
       height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fp_performance.pdf")),
       p_fp,
       width=9,
       height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_tradeoff.pdf")),
       p_tradeoff,
       width=9,
       height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Step 7: identify RNA-containing cells: 1985.566 sec elapsed
Step 9: tallying molecules by cell-barcode: 318.323 sec elapsed
Running workflow II: 2303.89 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$P7_0
# A tibble: 1,315 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   475     40   430       5
 2 AGTGAGGTCTGCCCTA   850    432   417       1
 3 CTAACTTAGTTGTAGA   433     34   399       0
 4 CATGCCTTCAATACCG   785    392   391       2
 5 CTCGAGGGTTTCGCTC   396     27   368       1
 6 AGACGTTGTCCGAGTC   403     36   365       2
 7 GAACATCTCCTTGGTC   369     32   336       1
 8 CGGGTCATCCCAAGTA   369     39   329       1
 9 GTCGTAAGTTAAGATG   367     42   325       0
10 AGCATACCATCATCCC   357     37   320       0
# … with 1,305 more rows
$P7_1
# A tibble: 774 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   555     51   500       4
 2 AGACGTTGTCCGAGTC   518     42   471       5
 3 CTAACTTAGTTGTAGA   492     48   443       1
 4 CGGGTCATCCCAAGTA   486     53   432       1
 5 GAATAAGCATATGGTC   464     48   416       0
 6 TGTTCCGAGGCTAGAC   477     70   407       0
 7 GTGGGTCCAAACTGTC   470     87   382       1
 8 CATGCCTTCAATACCG   434     57   377       0
 9 TAAGAGATCTTGGGTA   480    114   366       0
10 GAACATCTCAGAGACG   424     59   365       0
# … with 764 more rows
$P7_10
# A tibble: 2,460 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   293     39   254       0
 2 CTAACTTAGTTGTAGA   255     21   234       0
 3 GAACATCTCCTTGGTC   227     13   214       0
 4 AGACGTTGTCCGAGTC   234     25   209       0
 5 CGGGTCATCCCAAGTA   224     15   209       0
 6 AGTGAGGTCTGCCCTA   218     21   196       1
 7 CTCGAGGGTTTCGCTC   214     19   195       0
 8 GGACATTTCGTAGGAG   207     17   189       1
 9 GTGCTTCGTCCCTACT   205     40   163       2
10 ACATACGCAGCATGAG 12457  12321   103      33
# … with 2,450 more rows
$P7_11
# A tibble: 3,506 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGTGAGGTCTGCCCTA   394     55   338       1
 2 AGGGATGGTCTAACGT   369     30   334       5
 3 CGGGTCATCCCAAGTA   373     62   310       1
 4 CTCGAGGGTTTCGCTC   323     20   302       1
 5 AGACGTTGTCCGAGTC   304     27   274       3
 6 GAACATCTCCTTGGTC   297     22   272       3
 7 CATGCCTTCAATACCG   316     45   270       1
 8 TGTTCCGAGGCTAGAC   284     23   258       3
 9 GGACATTTCGTAGGAG   271     18   247       6
10 GAACATCTCAGAGACG   268     32   232       4
# … with 3,496 more rows
$P7_12
# A tibble: 3,055 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   368     40   322       6
 2 CTAACTTAGTTGTAGA   339     26   313       0
 3 AGTGAGGTCTGCCCTA   331     29   297       5
 4 AGACGTTGTCCGAGTC   303     37   261       5
 5 TGTTCCGAGGCTAGAC   273     21   251       1
 6 GAACATCTCAGAGACG   315     60   250       5
 7 CTCGAGGGTTTCGCTC   271     24   246       1
 8 GAACATCTCCTTGGTC   275     32   242       1
 9 GGCTGGTGTGTCGCTG   258     30   225       3
10 TAAACCGTCTCTGTCG   252     25   224       3
# … with 3,045 more rows
$P7_13
# A tibble: 3,409 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   381     42   336       3
 2 CTAACTTAGTTGTAGA   312     22   286       4
 3 AGTGAGGTCTGCCCTA   308     18   285       5
 4 AGACGTTGTCCGAGTC   290     31   255       4
 5 GAACATCTCCTTGGTC   276     22   251       3
 6 CGGGTCATCCCAAGTA   271     19   246       6
 7 CTCGAGGGTTTCGCTC   260     25   234       1
 8 GAACATCTCAGAGACG   272     43   223       6
 9 TGTTCCGAGGCTAGAC   257     36   218       3
10 GAATAAGCATATGGTC   249     22   216      11
# … with 3,399 more rows
$P7_14
# A tibble: 2,629 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   371     35   334       2
 2 AGACGTTGTCCGAGTC   320     38   282       0
 3 CTAACTTAGTTGTAGA   311     28   282       1
 4 CGGGTCATCCCAAGTA   315     31   281       3
 5 AGTGAGGTCTGCCCTA   301     20   279       2
 6 GAACATCTCCTTGGTC   293     16   277       0
 7 CTCGAGGGTTTCGCTC   268     17   249       2
 8 GAATAAGCATATGGTC   273     43   230       0
 9 GAACATCTCAGAGACG   261     37   224       0
10 TGAGAGGCAACACGCC   239     17   221       1
# … with 2,619 more rows
$P7_15
# A tibble: 2,761 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   322     28   289       5
 2 CTAACTTAGTTGTAGA   242     17   221       4
 3 CGGGTCATCCCAAGTA   234     23   209       2
 4 AGTGAGGTCTGCCCTA   223     19   204       0
 5 AGACGTTGTCCGAGTC   218     18   200       0
 6 GAATAAGCATATGGTC   227     24   192      11
 7 TGAGAGGCAACACGCC   204     14   190       0
 8 GCGCCAAAGGCTATCT   226     80   146       0
 9 CATATGGGTTGCGTTA  1696   1606    89       1
10 CTCGTCATCAGAGGTG  5133   5047    81       5
# … with 2,751 more rows
$P7_2
# A tibble: 4,058 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   347     38   309       0
 2 AGTGAGGTCTGCCCTA   317     26   278      13
 3 CGGGTCATCCCAAGTA   282     30   251       1
 4 GAACATCTCCTTGGTC   268     19   247       2
 5 CTAACTTAGTTGTAGA   266     24   241       1
 6 CTCGAGGGTTTCGCTC   275     36   238       1
 7 GAACATCTCAGAGACG   288     46   238       4
 8 CATGCCTTCAATACCG   277     35   227      15
 9 AGACGTTGTCCGAGTC   253     35   217       1
10 TGTTCCGAGGCTAGAC   232     19   211       2
# … with 4,048 more rows
$P7_3
# A tibble: 4,000 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   376     77   299       0
 2 AGTGAGGTCTGCCCTA   310     30   276       4
 3 CTAACTTAGTTGTAGA   307     39   268       0
 4 AGACGTTGTCCGAGTC   290     23   265       2
 5 GAACATCTCCTTGGTC   275     29   245       1
 6 CTCGAGGGTTTCGCTC   258     22   235       1
 7 TGTTCCGAGGCTAGAC   254     29   223       2
 8 CGGGTCATCCCAAGTA   249     31   218       0
 9 CATCCACAGATGGCGT   238     22   215       1
10 CATGCCTTCAATACCG   253     31   213       9
# … with 3,990 more rows
$P7_4
# A tibble: 996 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   764    158   606       0
 2 GTGCTTCGTCCCTACT   570     86   483       1
 3 AGGGATGGTCTAACGT   396     41   352       3
 4 TGGGAAGCATTCGACA   358     18   340       0
 5 CTAACTTAGTTGTAGA   324     25   297       2
 6 AGTGAGGTCTGCCCTA   329     33   295       1
 7 AGGCCGTGTGCGATAG   309     16   293       0
 8 AGACGTTGTCCGAGTC   295     26   268       1
 9 AGCGGTCTCTTAGCCC   285     20   265       0
10 CAGATCAAGACAGAGA   289     27   262       0
# … with 986 more rows
$P7_5
# A tibble: 1,981 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   272     27   243       2
 2 CGGGTCATCCCAAGTA   251     26   224       1
 3 CTAACTTAGTTGTAGA   241     22   219       0
 4 AGTGAGGTCTGCCCTA 12662  12418   210      34
 5 GAACATCTCCTTGGTC   224     21   203       0
 6 CTCGAGGGTTTCGCTC   216     15   201       0
 7 AGACGTTGTCCGAGTC   217     18   197       2
 8 TAAACCGTCTCTGTCG   213     20   193       0
 9 GAATAAGCATATGGTC   222     29   183      10
10 GGACATTTCGTAGGAG   257     73   182       2
# … with 1,971 more rows
$P7_6
# A tibble: 7,285 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGTGAGGTCTGCCCTA   266     16   248       2
 2 AGGGATGGTCTAACGT   248     31   217       0
 3 GAACATCTCCTTGGTC   238     22   216       0
 4 CGGGTCATCCCAAGTA   210     19   191       0
 5 CTCGAGGGTTTCGCTC   208     15   191       2
 6 GTGCTTCGTCCCTACT   219     32   182       5
 7 GAATAAGCATATGGTC   208     17   181      10
 8 CTAACTTAGTTGTAGA   201     22   178       1
 9 AAATGCCTCCCAAGTA   208     29   176       3
10 TAAGAGATCTTGGGTA  5376   5228   126      22
# … with 7,275 more rows
$P7_7
# A tibble: 744 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   301     30   269       2
 2 AGTGAGGTCTGCCCTA   258     20   238       0
 3 CTAACTTAGTTGTAGA   247     10   235       2
 4 CTCGAGGGTTTCGCTC   234     17   216       1
 5 GAACATCTCAGAGACG   246     33   213       0
 6 TAAACCGTCTCTGTCG   228     14   211       3
 7 GAACATCTCCTTGGTC   235     25   209       1
 8 AGACGTTGTCCGAGTC   222     17   204       1
 9 GGACATTTCGTAGGAG   219     28   191       0
10 TAAACCGCATGTAGTC   212     22   187       3
# … with 734 more rows
$P7_8
# A tibble: 1,197 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGTGAGGTCTGCCCTA   505     40   465       0
 2 GAATAAGCATATGGTC   468     48   420       0
 3 GAACATCTCCTTGGTC   442     34   408       0
 4 CTCGAGGGTTTCGCTC   410     21   389       0
 5 TGTTCCGAGGCTAGAC   421     44   377       0
 6 GAACATCTCAGAGACG   415     52   363       0
 7 GAACGGAGTTGCGCAC   406     45   361       0
 8 CATGCCTTCAATACCG   405     49   356       0
 9 GTGCTTCGTCCCTACT   425     81   344       0
10 AAATGCCTCCCAAGTA   391     50   341       0
# … with 1,187 more rows
$P7_9
# A tibble: 2,395 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGGATGGTCTAACGT   446     42   403       1
 2 AGTGAGGTCTGCCCTA   409     31   376       2
 3 CGGGTCATCCCAAGTA   411     34   374       3
 4 CTAACTTAGTTGTAGA   385     27   356       2
 5 CTCGAGGGTTTCGCTC   375     26   346       3
 6 AGACGTTGTCCGAGTC   365     26   338       1
 7 GAACATCTCCTTGGTC   359     29   330       0
 8 GAATAAGCATATGGTC   337     34   303       0
 9 GGCTGGTGTGTCGCTG   316     18   292       6
10 GAACATCTCAGAGACG   329     45   284       0
# … with 2,385 more rows
data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$P7_0
# A tibble: 307,850 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   449     30   413       6
 2 GCTGCTTGTCCGAAGA   533    222   302       9
 3 AAATGCCTCCCAAGTA   353     54   296       3
 4 GGACATTTCGTAGGAG   315     34   277       4
 5 TAAACCGCATGTAGTC   278     33   238       7
 6 GCAAACTAGCTTATCG   253     13   237       3
 7 ACACCAACATAAGACA   238      8   227       3
 8 ACGCCGAGTCTACCTC   234     22   210       2
 9 TTAACTCCAATGGTCT   228     18   209       1
10 AACTCAGTCTAACGGT   226     16   208       2
# … with 307,840 more rows
$P7_1
# A tibble: 305,602 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GCTGCTTGTCCGAAGA   546    145   400       1
 2 AGAGTGGTCATGTCCC   199     10   188       1
 3 GACCAATAGTACGACG   200     14   186       0
 4 CTCGTCATCAGAGGTG   200     14   185       1
 5 TGGCTGGAGAAGGTGA   196     12   184       0
 6 AGGGATGAGTACTTGC   197     15   182       0
 7 GCTTCCAGTTCCGGCA   195     11   182       2
 8 CATGGCGGTGCGATAG   198     17   180       1
 9 CCACCTAAGGCCCGTT   199     19   180       0
10 GGCAATTTCTTCTGGC   195     15   180       0
# … with 305,592 more rows
$P7_10
# A tibble: 417,050 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   225     20   205       0
 2 GAATAAGCATATGGTC   227     34   188       5
 3 GATCTAGTCGAGAACG   189     15   174       0
 4 TGTTCCGAGGCTAGAC   195     19   171       5
 5 GAACATCTCAGAGACG   197     31   166       0
 6 GGCTGGTGTGTCGCTG   182     17   165       0
 7 TAAGAGATCTTGGGTA   196     27   164       5
 8 GCTGCTTGTCCGAAGA   243     79   163       1
 9 AAATGCCTCCCAAGTA   183     21   162       0
10 GAACGGAGTTGCGCAC   190     19   162       9
# … with 417,040 more rows
$P7_11
# A tibble: 323,694 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTAACTTAGTTGTAGA   332     26   304       2
 2 CACACCTTCAACGCTA   323     23   298       2
 3 GCTGCTTGTCCGAAGA   403    131   263       9
 4 TAAACCGTCTCTGTCG   272     21   250       1
 5 GAATAAGCATATGGTC   290     24   249      17
 6 AAATGCCTCCCAAGTA   288     36   248       4
 7 CATCCACAGATGGCGT   243     20   218       5
 8 GATCTAGTCGAGAACG   236     19   217       0
 9 GAACGGAGTTGCGCAC   263     39   213      11
10 GCAAACTTCGACAGCC   233     18   213       2
# … with 323,684 more rows
$P7_12
# A tibble: 351,075 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CGGGTCATCCCAAGTA   342     37   302       3
 2 CACACCTTCAACGCTA   292     31   261       0
 3 GCTGCTTGTCCGAAGA   383    110   259      14
 4 GAATAAGCATATGGTC   300     34   255      11
 5 GAACGGAGTTGCGCAC   276     32   232      12
 6 ACTTACTCAGCTCGAC   225     16   206       3
 7 TAAGAGATCTTGGGTA   239     32   202       5
 8 AAACGGGAGGATATAC   219     26   191       2
 9 GGATGTTAGGGAACGG   204     11   191       2
10 CATCCACAGATGGCGT   220     30   188       2
# … with 351,065 more rows
$P7_13
# A tibble: 358,946 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   275     25   245       5
 2 GCTGCTTGTCCGAAGA   311     91   213       7
 3 CGGACACAGCGCTTAT   223     15   206       2
 4 CATCCACAGATGGCGT   215     29   184       2
 5 GAACGGAGTTGCGCAC   225     36   183       6
 6 GCTGCGACAGTAAGCG   190      8   181       1
 7 GGCTGGTGTGTCGCTG   198     16   178       4
 8 ACATACGCAGCATGAG   198     24   174       0
 9 ACTTTCAGTTTGACTG   194     18   174       2
10 CGACTTCAGACCTAGG   194     20   174       0
# … with 358,936 more rows
$P7_14
# A tibble: 308,824 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   306     21   284       1
 2 GAACGGAGTTGCGCAC   257     25   232       0
 3 GCTGCTTGTCCGAAGA   309     77   225       7
 4 GATCTAGTCGAGAACG   240     32   208       0
 5 CATCCACAGATGGCGT   223     21   199       3
 6 GCAAACTTCGACAGCC   213     21   190       2
 7 ACTTACTCAGCTCGAC   206     16   189       1
 8 AAATGCCGTGAACCTT   197     13   184       0
 9 CGACTTCAGACCTAGG   198     15   182       1
10 TGGGAAGTCAACCAAC   198     17   181       0
# … with 308,814 more rows
$P7_15
# A tibble: 296,796 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 TAAACCGTCTCTGTCG   198     11   187       0
 2 GAACGGAGTTGCGCAC   212     24   180       8
 3 TGTTCCGAGGCTAGAC   195     22   173       0
 4 CACACCTTCAACGCTA   193     18   172       3
 5 CTCGAGGGTTTCGCTC   188     18   170       0
 6 GAACATCTCCTTGGTC   180     10   170       0
 7 CATGCCTTCAATACCG   197     31   166       0
 8 GGCTGGTGTGTCGCTG   182     16   166       0
 9 GTGCTTCGTCCCTACT   201     33   165       3
10 ACTTACTCAGCTCGAC   177     13   162       2
# … with 296,786 more rows
$P7_2
# A tibble: 360,298 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   254     18   234       2
 2 GATCTAGTCGAGAACG   219     17   201       1
 3 GAATAAGCATATGGTC   238     32   196      10
 4 ACTTACTCAGCTCGAC   228     36   192       0
 5 GCTGCTTGTCCGAAGA   271     69   191      11
 6 CGGACACAGCGCTTAT   208     26   181       1
 7 GAACGGAGTTGCGCAC   208     20   181       7
 8 GCTGCGACAGTAAGCG   197     17   180       0
 9 AAACGGGAGGATATAC   189     12   176       1
10 CATCCACAGATGGCGT   197     27   170       0
# … with 360,288 more rows
$P7_3
# A tibble: 379,883 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   247     18   223       6
 2 GCTGCTTGTCCGAAGA   306     85   215       6
 3 GAATAAGCATATGGTC   263     43   209      11
 4 TAAACCGTCTCTGTCG   231     26   205       0
 5 AGATTGCAGAGCAATT   200     13   187       0
 6 ACTTTCAGTTTGACTG   198     19   179       0
 7 TCGAGGCTCCCTCTTT   194     16   178       0
 8 GAACGGAGTTGCGCAC   214     29   176       9
 9 CTCAGAACATATGCTG   194     19   170       5
10 CGGACACAGCGCTTAT   188     20   168       0
# … with 379,873 more rows
$P7_4
# A tibble: 314,515 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   320     16   304       0
 2 CGTCTACTCTTGACGA   195      9   186       0
 3 CCTACCACATCGGTTA   197     19   178       0
 4 GACTACAGTTCAACCA   189     11   178       0
 5 GGTGAAGAGGTGATAT   189     12   177       0
 6 ACTTTCAGTTTGACTG   192     14   176       2
 7 AACGTTGTCAACACAC   187     11   175       1
 8 AAATGCCGTGAACCTT   199     25   174       0
 9 TCGAGGCTCGCGCCAA   191     17   174       0
10 TGTTCCGCACTTAAGC   187     13   174       0
# … with 314,505 more rows
$P7_5
# A tibble: 363,944 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   287     40   244       3
 2 GCTGCTTGTCCGAAGA   301     79   217       5
 3 TGAGAGGCAACACGCC   198     15   180       3
 4 AAATGCCTCCCAAGTA   195     22   169       4
 5 TGTTCCGAGGCTAGAC   191     21   167       3
 6 ACTTACTCAGCTCGAC   181     13   166       2
 7 AGATTGCAGAGCAATT   167      8   159       0
 8 CCGTACTGTCAGATAA   178     17   159       2
 9 TAAGAGATCTTGGGTA   182     20   156       6
10 TAAACCGCATGTAGTC   174     17   154       3
# … with 363,934 more rows
$P7_6
# A tibble: 349,966 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   220     11   204       5
 2 GCTGCTTGTCCGAAGA   268     78   187       3
 3 AGACGTTGTCCGAGTC   195     15   180       0
 4 TGTTCCGAGGCTAGAC   195     22   170       3
 5 GGACATTTCGTAGGAG   187     14   168       5
 6 TGAGAGGCAACACGCC   179     16   162       1
 7 AAATGCCGTGAACCTT   167     11   155       1
 8 TCTATTGCATAAAGGT   168     13   155       0
 9 ACTTACTCAGCTCGAC   169     15   153       1
10 CATGCCTTCAATACCG   182     22   152       8
# … with 349,956 more rows
$P7_7
# A tibble: 273,888 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   230     10   219       1
 2 CGGGTCATCCCAAGTA   235     25   210       0
 3 CATCCACAGATGGCGT   196     12   184       0
 4 GTGCTTCGTCCCTACT   235     48   184       3
 5 TGTTCCGAGGCTAGAC   199     21   178       0
 6 GCTGCTTGTCCGAAGA   253     75   173       5
 7 GATCTAGTCGAGAACG   190     20   170       0
 8 GCTGCGACAGTAAGCG   179      9   169       1
 9 AAATGCCTCCCAAGTA   199     30   168       1
10 ACTTACTCAGCTCGAC   181     13   168       0
# … with 273,878 more rows
$P7_8
# A tibble: 270,528 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   470     38   432       0
 2 GCTGCTTGTCCGAAGA   473    137   336       0
 3 GCAAACTAGCTTATCG   299     15   284       0
 4 GAATAAGTCCTAGGGC   304     29   275       0
 5 GTAGGCCAGCCGATTT   309     39   270       0
 6 ACACCAACATAAGACA   247     15   232       0
 7 GCTTCCACACTTCGAA   251     28   223       0
 8 TTAACTCCAATGGTCT   252     30   222       0
 9 GCACTCTAGATACACA   243     25   218       0
10 ACGCCGAGTCTACCTC   233     18   215       0
# … with 270,518 more rows
$P7_9
# A tibble: 314,582 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CACACCTTCAACGCTA   389     27   358       4
 2 GAACGGAGTTGCGCAC   362     44   318       0
 3 AAATGCCTCCCAAGTA   322     29   292       1
 4 ACTTACTCAGCTCGAC   290     33   256       1
 5 CATCCACAGATGGCGT   274     24   249       1
 6 GTAGGCCAGCCGATTT   272     37   235       0
 7 GATCTAGTCGAGAACG   254     19   233       2
 8 CGGACACAGCGCTTAT   259     28   230       1
 9 AAACGGGAGGATATAC   264     35   228       1
10 GGATGTTAGGGAACGG   247     27   219       1
# … with 314,572 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 3599.002 sec elapsed
# memory usage
gc()
             used    (Mb)  gc trigger     (Mb)    max used     (Mb)
Ncells    6380399   340.8    12111514    646.9    12111514    646.9
Vcells 8947572279 68264.6 31057732902 236951.7 32330482593 246662.1
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] cowplot_0.9.4      data.table_1.12.2  tictoc_1.0        
 [4] furrr_0.1.0        future_1.13.0      broom_0.5.2       
 [7] matrixStats_0.54.0 forcats_0.4.0      stringr_1.4.0     
[10] dplyr_0.8.1        purrr_0.3.2        readr_1.3.1       
[13] tidyr_0.8.3        tibble_2.1.2       ggplot2_3.1.1     
[16] tidyverse_1.2.1    rhdf5_2.26.2       rmarkdown_1.13    
loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.4            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.2             
[13] BiocGenerics_0.28.0         colorspace_1.4-1           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.1.0                  
[19] rvest_0.3.4                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.19               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.4                 readxl_1.3.1               
[33] rstudioapi_0.10             generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.1                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.4.3              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] zeallot_0.1.0               knitr_1.23                 
[63] pillar_1.4.1                GenomicRanges_1.34.0       
[65] codetools_0.2-16            stats4_3.5.2               
[67] glue_1.3.1                  evaluate_0.14              
[69] modelr_0.1.4                vctrs_0.1.0                
[71] cellranger_1.1.0            gtable_0.3.0               
[73] assertthat_0.2.1            xfun_0.7                   
[75] DropletUtils_1.2.2          viridisLite_0.3.0          
[77] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[79] globals_0.12.4