knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq2500
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data", 
                      dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
                                  sprintf("%s_read_counts.rds", 
                                          dataset_name))
results_filepath <- file.path(output_dir, 
                              sprintf("%s_results.rds", 
                                      dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq2500/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq2500/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3 
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
                           output_dir,
                           read_counts_filepath = NULL,
                           torc = 3,
                           max_r = NULL) {
  tic("Running workflow I")
  tic("Step 1: loading molecule_info files and creating read counts datatable")
  read_counts <- create_joined_counts(input_dir, read_counts_filepath)
  toc()
  sample_names <-
    setdiff(
      colnames(read_counts),
      c("cell", "umi", "gene", "outcome")
    )
  S <- length(sample_names)
  tic("Step 2: creating outcome counts datatable with grouping vars")
  outcome_counts <- create_outcome_counts(read_counts, sample_names)
  toc()
  tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S,
      max_r = max_r
    )
  toc()
  # compute_molecular_complexity_profile
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
  tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the tradoff ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)
  toc()
  tic("Step 6: Purge and save read counts datatable to disk")
  read_counts <-
    left_join(read_counts %>%
      select(outcome, cell, umi, gene, sample_names),
    outcome_counts %>%
      select(c("outcome", "retain", paste0(sample_names, "_hat"))),
    by = c("outcome")
    ) %>%
    select(-outcome)
  purge_and_save_read_counts(
    read_counts,
    dataset_name,
    sample_names,
    output_dir
  )
  toc()
  tic("Step 7: create umi counts matrices")
  umi_counts_cell_gene <-
    create_umi_counts(
      read_counts,
      sample_names
    )
  toc()
  outcome_counts <-
    outcome_counts %>%
    arrange(-qr) %>%
    select(-c(paste0(sample_names, "_hat")))
  read_counts <-
    read_counts %>%
    select(-c("retain", paste0(sample_names, "_hat")))
  summary_stats$sample_names <- sample_names
  data_list <-
    list(
      umi_counts_cell_gene = umi_counts_cell_gene,
      read_counts = read_counts,
      outcome_counts = outcome_counts,
      fit_out = fit_out,
      summary_stats = summary_stats
    )
  toc()
  return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
  tic("Running workflow II")
  tic("Step 7: identify RNA-containing cells")
  called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
  toc()
  tic("Step 9: tallying molecules by cell-barcode")
  umi_counts_cell <- map2(
    called_cells_out$called_cells,
    data_list$umi_counts_cell_gene,
    get_umi_counts_cell
  )
  umi_counts_sample <-
    map(umi_counts_cell,
      map_dfr,
      get_umi_counts_sample,
      .id = "split"
    ) %>%
    bind_rows(.id = "sample")
  data_list$summary_stats <-
    update_summary_stats(
      data_list$summary_stats,
      umi_counts_sample
    )
  toc()
  data_list <-
    c(
      data_list,
      list(
        umi_counts_cell = umi_counts_cell,
        called_cells_tally = called_cells_out$called_cells_tally
      )
    )
  toc()
  return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 60.875 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 8.44 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.358 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 1.383 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 0.782 sec elapsed
Step 6: Purge and save read counts datatable to disk: 100.268 sec elapsed
Step 7: create umi counts matrices: 53.312 sec elapsed
Running workflow I: 225.432 sec elapsed
data_list$read_counts 
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts 
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
 data_list$summary_stats$summary_estimates
 data_list$summary_stats$conditional
 data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_molcomplexity.pdf")),
       p_read,
       width=9,
       height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fit.pdf")),
       p_fit,
       width=9,
       height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fp_performance.pdf")),
       p_fp,
       width=9,
       height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_tradeoff.pdf")),
       p_tradeoff,
       width=9,
       height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Using defaultDrops cell calling instead.
      EmptyDrops function call failed for purged sample B1
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 7
Step 7: identify RNA-containing cells: 733.182 sec elapsed
Step 9: tallying molecules by cell-barcode: 25.698 sec elapsed
Running workflow II: 758.881 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$A1
# A tibble: 409 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AGGTCCGGTCCGAGTC 37482  37479     3       0
 2 ACGTCAAGTTGCCTCT 31541  31538     2       1
 3 CACCACTTCGGTTCGG 18666  18664     2       0
 4 CGTAGGCCAACACCTA 65340  65338     2       0
 5 CTGCTGTTCCGGGTGT 17297  17294     2       1
 6 TCTCTAAAGATCACGG 21938  21936     2       0
 7 AACCGCGTCATGCTCC 47423  47422     1       0
 8 ACATACGCATTCGACA  3517   3516     1       0
 9 ACGAGGACATCACCCT 15208  15207     1       0
10 ACGGCCACAAGGACAC 49067  49064     1       2
# … with 399 more rows
$A2
# A tibble: 566 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 ACGAGCCCACCGTTGG 43895  43892     3       0
 2 CGTGTAATCTAACTCT 22943  22941     2       0
 3 CTTAGGAAGAAACGCC 19123  19121     2       0
 4 GCTGGGTAGTAGGTGC 55651  55648     2       1
 5 AAAGATGAGGCCCTCA 68719  68718     1       0
 6 AACTCCCCATTCGACA 15035  15034     1       0
 7 ACAGCTAAGGCCCGTT 29916  29914     1       1
 8 ACGGGCTTCGGTCCGA 34499  34498     1       0
 9 ACTTACTCAACACCCG 29456  29455     1       0
10 CAAGATCCAACACGCC 44113  44112     1       0
# … with 556 more rows
$B1
# A tibble: 220 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT   103      9    94       0
 2 TCGTACCGTCCATGAT    74     11    63       0
 3 TGCACCTTCTGGCGAC    74     11    63       0
 4 GTGCAGCCAAGCGAGT    43      5    38       0
 5 CTAGCCTTCCGCGCAA    41      4    37       0
 6 AGCAGCCGTTAAGGGC    28      2    26       0
 7 TGCACCTCATGCCACG    28      3    25       0
 8 TCATTACCAGGCAGTA    19      3    16       0
 9 GATCGTAGTCACCCAG    18      4    14       0
10 GTGCTTCAGGCATTGG    15      1    14       0
# … with 210 more rows
$B2
# A tibble: 16 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 TCGTACCGTCCATGAT 26473  26472     1       0
 2 ACGCCAGAGTACGTAA   417    417     0       0
 3 AGCAGCCGTTAAGGGC  8853   8853     0       0
 4 CAAGGCCGTCCGAGTC   232    232     0       0
 5 CGTGAGCCAAGGACTG  1478   1478     0       0
 6 CTAGCCTTCCGCGCAA 11808  11808     0       0
 7 CTCACACGTCTCTCGT 36274  36274     0       0
 8 CTTGGCTCAAAGTGCG  1034   1034     0       0
 9 GACAGAGTCAATACCG   211    211     0       0
10 GATCGTAGTCACCCAG  5904   5904     0       0
11 GTGCAGCCAAGCGAGT 12113  12113     0       0
12 GTGGGTCCAGGCTGAA  2190   2190     0       0
13 TCATTACCAGGCAGTA  5893   5893     0       0
14 TGCACCTCATGCCACG  5479   5479     0       0
15 TGCACCTTCTGGCGAC 22310  22310     0       0
16 TGGCTGGCATCCGTGG  1397   1397     0       0
$C1
# A tibble: 739 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 ACTATCTTCCTGTACC 33437  33434     2       1
 2 CCCTCCTGTGATGTCT 10899  10897     2       0
 3 GGTGCGTCAGTTAACC 38377  38375     2       0
 4 ACCTTTACAAAGTGCG 19839  19837     1       1
 5 ACGGCCAGTACTCAAC 26169  26168     1       0
 6 AGCGTATGTTCTCATT 15803  15802     1       0
 7 AGGGATGTCGCCGTGA  9354   9353     1       0
 8 AGGTCATGTAAGCACG 24932  24930     1       1
 9 ATCTGCCTCTTGCATT  4779   4777     1       1
10 ATTATCCGTTAAGATG 30075  30072     1       2
# … with 729 more rows
$C2
# A tibble: 1,317 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTTAACTGTGAGTATA  6233   6230     3       0
 2 ATAACGCCACGGTTTA  1584   1583     1       0
 3 GGAACTTTCAAAGTAG  1634   1633     1       0
 4 TACCTATCATCACAAC  6081   6080     1       0
 5 AAACCTGCAGTGGGAT  8200   8200     0       0
 6 AAACCTGGTCGAAAGC  7482   7482     0       0
 7 AAACGGGCAAGTTAAG  9265   9265     0       0
 8 AAACGGGGTTATCACG 13268  13268     0       0
 9 AAACGGGGTTGTCTTT 13437  13437     0       0
10 AAAGATGAGAGGTTGC   795    795     0       0
# … with 1,307 more rows
$D1
# A tibble: 685 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AAAGATGAGGCCGAAT 13329  13326     1       2
 2 AAGGCAGGTTATGTGC  4278   4277     1       0
 3 ACCGTAACAGGACGTA 10930  10929     1       0
 4 AGCTCCTTCTCAAACG 34839  34837     1       1
 5 ATAACGCGTTGTCTTT 11030  11027     1       2
 6 ATCCACCGTAAGGGAA 20527  20526     1       0
 7 ATCGAGTCAGATCTGT 25174  25172     1       1
 8 CATATTCGTACGAAAT 23309  23304     1       4
 9 CATCGGGGTATGAAAC 29755  29753     1       1
10 CCAGCGATCGACCAGC 13789  13785     1       3
# … with 675 more rows
$D2
# A tibble: 160 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 AACCATGTCAGCGACC 18595  18594     1       0
 2 AGATTGCCAGGTTTCA 14970  14969     1       0
 3 CGCGTTTCATGTTGAC 11987  11986     1       0
 4 CGTCAGGAGCTTCGCG 14085  14084     1       0
 5 GCATGTATCCCAGGTG 27178  27177     1       0
 6 GTGTTAGGTAGCGTCC 11840  11839     1       0
 7 TCAGGTATCATAACCG  8832   8831     1       0
 8 TGTCCCAGTCATCCCT  3194   3193     1       0
 9 AAAGCAAGTGAACCTT  9734   9734     0       0
10 AAAGCAATCTCCTATA 11917  11917     0       0
# … with 150 more rows
data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$A1
# A tibble: 125,102 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT    86      4    82       0
 2 CTCACACGTCTCTCGT    99     19    80       0
 3 CATGCCTAGTGCTGCC    92     18    74       0
 4 TGCACCTTCTGGCGAC    57      5    52       0
 5 GGAATAAGTCAGGACA    53      3    50       0
 6 CACACAAAGGCGATAC    67     19    48       0
 7 TCGTACCGTCCATGAT    56     10    46       0
 8 CGGCTAGGTAAACCTC    42      0    42       0
 9 CTTCTCTCAACTGCTA    48      6    42       0
10 TGGTTAGTCCTGCAGG    40      1    39       0
# … with 125,092 more rows
$A2
# A tibble: 139,718 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT    97     15    82       0
 2 CTCACACGTCTCTCGT    93     21    72       0
 3 CATGCCTAGTGCTGCC    84     15    69       0
 4 TCGTACCGTCCATGAT    76     18    58       0
 5 TGCACCTTCTGGCGAC    66     14    52       0
 6 CACACAAAGGCGATAC    55     14    41       0
 7 GTGCAGCCAAGCGAGT    50     12    38       0
 8 AAGCCGCGTGCTCTTC    44      8    36       0
 9 GGAATAAGTCAGGACA    38      5    33       0
10 CTAGCCTTCCGCGCAA    40      8    32       0
# … with 139,708 more rows
$B1
# A tibble: 27,396 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 ACGGCCACAAGGACAC     8      0     8       0
 2 AGATTGCCAGGTTTCA     8      0     8       0
 3 CAGCCGACAGGATCGA     8      0     8       0
 4 GATCGTAAGATACACA     8      0     8       0
 5 ACTGTCCCAGCGAACA     8      1     7       0
 6 ATTATCCGTGTGAATA     8      1     7       0
 7 CAGATCAGTGCTCTTC     7      0     7       0
 8 CTACACCCACGCGAAA     7      0     7       0
 9 TTTGGTTCAGGAACGT     8      1     7       0
10 ACGGGCTTCGGTCCGA     6      0     6       0
# … with 27,386 more rows
$B2
# A tibble: 31,107 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT    96     10    86       0
 2 CATGCCTAGTGCTGCC    85      9    76       0
 3 CACACAAAGGCGATAC    66      8    58       0
 4 AAGCCGCGTGCTCTTC    46      6    40       0
 5 GGAATAAGTCAGGACA    42      3    39       0
 6 ATAAGAGAGTGTGGCA    32      3    29       0
 7 TGCCCATCAGTAAGCG    32      3    29       0
 8 CACACTCCACATAACC    29      3    26       0
 9 CATCGAAAGTCAATAG    28      6    22       0
10 CATATTCGTTGTACAC    14      0    14       0
# … with 31,097 more rows
$C1
# A tibble: 146,091 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT   113     10   103       0
 2 CATGCCTAGTGCTGCC   107      9    98       0
 3 CTCACACGTCTCTCGT    88     16    72       0
 4 CACACAAAGGCGATAC    71     12    59       0
 5 TCGTACCGTCCATGAT    64     10    54       0
 6 TGCACCTTCTGGCGAC    57     12    45       0
 7 GGAATAAGTCAGGACA    50      8    42       0
 8 AAGCCGCGTGCTCTTC    47      7    40       0
 9 CATCGAAAGTCAATAG    42      5    37       0
10 ATAAGAGAGTGTGGCA    38      7    31       0
# … with 146,081 more rows
$C2
# A tibble: 196,286 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CATGCCTAGTGCTGCC   103     13    90       0
 2 GGCTGGTGTCCCTACT   109     20    89       0
 3 CACACAAAGGCGATAC    67      2    65       0
 4 CTCACACGTCTCTCGT    69      8    61       0
 5 TCGTACCGTCCATGAT    67      9    58       0
 6 TGCACCTTCTGGCGAC    54      8    46       0
 7 GGAATAAGTCAGGACA    43      5    38       0
 8 AAGCCGCGTGCTCTTC    44      8    36       0
 9 CATCGAAAGTCAATAG    40      7    33       0
10 CTAGCCTTCCGCGCAA    33      2    31       0
# … with 196,276 more rows
$D1
# A tibble: 99,748 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT    94     20    74       0
 2 GGCTGGTGTCCCTACT    80      8    72       0
 3 CATGCCTAGTGCTGCC    86     15    71       0
 4 TCGTACCGTCCATGAT    62     16    46       0
 5 CACACAAAGGCGATAC    46      5    41       0
 6 TGCACCTTCTGGCGAC    44      7    37       0
 7 AAGCCGCGTGCTCTTC    32      3    29       0
 8 CATCGAAAGTCAATAG    28      4    24       0
 9 CTAGCCTTCCGCGCAA    33      9    24       0
10 GTGCAGCCAAGCGAGT    27      4    23       0
# … with 99,738 more rows
$D2
# A tibble: 62,329 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT   120     19   101       0
 2 CTCACACGTCTCTCGT    90      8    82       0
 3 CATGCCTAGTGCTGCC    93     12    81       0
 4 TCGTACCGTCCATGAT    83     22    60       1
 5 CACACAAAGGCGATAC    66      7    59       0
 6 TGCACCTTCTGGCGAC    62      5    57       0
 7 GTGCAGCCAAGCGAGT    40      5    35       0
 8 ATAAGAGAGTGTGGCA    32      2    30       0
 9 AAGCCGCGTGCTCTTC    28      0    28       0
10 CTAGCCTTCCGCGCAA    33      5    28       0
# … with 62,319 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 289.009 sec elapsed
# memory usage
gc()
            used   (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells   6055524  323.4    9523112   508.6    9523112   508.6
Vcells 705180869 5380.2 2345841278 17897.4 2931643582 22366.7
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] cowplot_0.9.4      data.table_1.12.2  tictoc_1.0        
 [4] furrr_0.1.0        future_1.13.0      broom_0.5.2       
 [7] matrixStats_0.54.0 forcats_0.4.0      stringr_1.4.0     
[10] dplyr_0.8.1        purrr_0.3.2        readr_1.3.1       
[13] tidyr_0.8.3        tibble_2.1.2       ggplot2_3.1.1     
[16] tidyverse_1.2.1    rhdf5_2.26.2       rmarkdown_1.13    
loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.4            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.2             
[13] BiocGenerics_0.28.0         colorspace_1.4-1           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.1.0                  
[19] rvest_0.3.4                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.19               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.4                 readxl_1.3.1               
[33] rstudioapi_0.10             generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.1                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.4.3              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] zeallot_0.1.0               knitr_1.23                 
[63] pillar_1.4.1                GenomicRanges_1.34.0       
[65] codetools_0.2-16            stats4_3.5.2               
[67] glue_1.3.1                  evaluate_0.14              
[69] modelr_0.1.4                vctrs_0.1.0                
[71] cellranger_1.1.0            gtable_0.3.0               
[73] assertthat_0.2.1            xfun_0.7                   
[75] DropletUtils_1.2.2          viridisLite_0.3.0          
[77] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[79] globals_0.12.4