knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
                     fig.width=15,
                     digit=5,
                     scipen=8)
options(digits=5, 
        scipen=8,
        future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: hiseq4000
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
  project_dir <- getwd()
  warning(sprintf("No rstudio project root file  found. 
                  Setting project directory to current workflow.Rmd file location: %s. 
                  Override if needed.",
                  project_dir))
 
}
message(sprintf("Project directory: %s",
                project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data", 
                      dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
                                  sprintf("%s_read_counts.rds", 
                                          dataset_name))
results_filepath <- file.path(output_dir, 
                              sprintf("%s_results.rds", 
                                      dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq4000/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/hiseq4000/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3 
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
                           output_dir,
                           read_counts_filepath = NULL,
                           torc = 3,
                           max_r = NULL) {
  tic("Running workflow I")
  tic("Step 1: loading molecule_info files and creating read counts datatable")
  read_counts <- create_joined_counts(input_dir, read_counts_filepath)
  toc()
  sample_names <-
    setdiff(
      colnames(read_counts),
      c("cell", "umi", "gene", "outcome")
    )
  S <- length(sample_names)
  tic("Step 2: creating outcome counts datatable with grouping vars")
  outcome_counts <- create_outcome_counts(read_counts, sample_names)
  toc()
  tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
  fit_out <-
    estimate_hopping_rate(
      outcome_counts,
      S,
      max_r = max_r
    )
  toc()
  # compute_molecular_complexity_profile
  tic("Step 4: compute molecular complexity profile and other summary statistics")
  summary_stats <-
    compute_summary_stats(
      outcome_counts,
      fit_out$glm_estimates$phat,
      sample_names
    )
  toc()
  tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
  outcome_counts <-
    reassign_reads_and_mark_retained_observations(
      outcome_counts,
      summary_stats,
      sample_names,
      fit_out,
      torc
    )
  # get the tradoff ratio cutoff
  summary_stats <- get_threshold(outcome_counts, summary_stats)
  toc()
  tic("Step 6: Purge and save read counts datatable to disk")
  read_counts <-
    left_join(read_counts %>%
      select(outcome, cell, umi, gene, sample_names),
    outcome_counts %>%
      select(c("outcome", "retain", paste0(sample_names, "_hat"))),
    by = c("outcome")
    ) %>%
    select(-outcome)
  purge_and_save_read_counts(
    read_counts,
    dataset_name,
    sample_names,
    output_dir
  )
  toc()
  tic("Step 7: create umi counts matrices")
  umi_counts_cell_gene <-
    create_umi_counts(
      read_counts,
      sample_names
    )
  toc()
  outcome_counts <-
    outcome_counts %>%
    arrange(-qr) %>%
    select(-c(paste0(sample_names, "_hat")))
  read_counts <-
    read_counts %>%
    select(-c("retain", paste0(sample_names, "_hat")))
  summary_stats$sample_names <- sample_names
  data_list <-
    list(
      umi_counts_cell_gene = umi_counts_cell_gene,
      read_counts = read_counts,
      outcome_counts = outcome_counts,
      fit_out = fit_out,
      summary_stats = summary_stats
    )
  toc()
  return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
  tic("Running workflow II")
  tic("Step 7: identify RNA-containing cells")
  called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
  toc()
  tic("Step 9: tallying molecules by cell-barcode")
  umi_counts_cell <- map2(
    called_cells_out$called_cells,
    data_list$umi_counts_cell_gene,
    get_umi_counts_cell
  )
  umi_counts_sample <-
    map(umi_counts_cell,
      map_dfr,
      get_umi_counts_sample,
      .id = "split"
    ) %>%
    bind_rows(.id = "sample")
  data_list$summary_stats <-
    update_summary_stats(
      data_list$summary_stats,
      umi_counts_sample
    )
  toc()
  data_list <-
    c(
      data_list,
      list(
        umi_counts_cell = umi_counts_cell,
        called_cells_tally = called_cells_out$called_cells_tally
      )
    )
  toc()
  return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 79.727 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 18.495 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.882 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 2.702 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 5.818 sec elapsed
Step 6: Purge and save read counts datatable to disk: 127.33 sec elapsed
Step 7: create umi counts matrices: 57.812 sec elapsed
Running workflow I: 292.864 sec elapsed
data_list$read_counts 
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts 
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
 data_list$summary_stats$summary_estimates
 data_list$summary_stats$conditional
 data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p, 
          p_read$legend,
          ncol=2,
          rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_molcomplexity.pdf")),
       p_read,
       width=9,
       height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
          p_fit$legend,
          ncol=2,
          rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fit.pdf")),
       p_fit,
       width=9,
       height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_fp_performance.pdf")),
       p_fp,
       width=9,
       height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
                 paste0(dataset_name, "_tradeoff.pdf")),
       p_tradeoff,
       width=9,
       height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 11
Warning in smooth.spline(x[new.keep], y[new.keep], df = df, ...): not using
invalid df; must have 1 < df <= n := #{unique x} = 6
Step 7: identify RNA-containing cells: 877.212 sec elapsed
Step 9: tallying molecules by cell-barcode: 34.345 sec elapsed
Running workflow II: 911.558 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>% 
  map(list("called_cells"))
$A1
# A tibble: 538 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  8325   1084  7238       3
 2 GGCTGGTGTCCCTACT  6661    888  5773       0
 3 TCGTACCGTCCATGAT  5765    804  4960       1
 4 CATGCCTAGTGCTGCC  5496    805  4691       0
 5 TGCACCTTCTGGCGAC  4967    676  4290       1
 6 CACACAAAGGCGATAC  3464    542  2922       0
 7 GTGCAGCCAAGCGAGT  3020    370  2650       0
 8 GGAATAAGTCAGGACA  2961    443  2518       0
 9 CTAGCCTTCCGCGCAA  2717    411  2306       0
10 AAGCCGCGTGCTCTTC  2501    363  2137       1
# … with 528 more rows
$A2
# A tibble: 670 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  7834   1194  6637       3
 2 GGCTGGTGTCCCTACT  6399    970  5428       1
 3 TCGTACCGTCCATGAT  5485    882  4602       1
 4 CATGCCTAGTGCTGCC  5421    828  4593       0
 5 TGCACCTTCTGGCGAC  4687    757  3930       0
 6 CACACAAAGGCGATAC  3280    530  2749       1
 7 GTGCAGCCAAGCGAGT  2841    404  2437       0
 8 GGAATAAGTCAGGACA  2869    476  2392       1
 9 CTAGCCTTCCGCGCAA  2628    426  2202       0
10 AAGCCGCGTGCTCTTC  2533    369  2164       0
# … with 660 more rows
$B1
# A tibble: 1,023 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT 23426   1449 21976       1
 2 TCGTACCGTCCATGAT 16849    934 15913       2
 3 TGCACCTTCTGGCGAC 13898    858 13038       2
 4 GTGCAGCCAAGCGAGT  8279    440  7839       0
 5 GAAACTCAGTTCCACA  7856    259  7597       0
 6 CTAGCCTTCCGCGCAA  7648    485  7163       0
 7 AGCAGCCGTTAAGGGC  5905    377  5528       0
 8 TGGCCAGTCATGTCTT  4906    151  4755       0
 9 GCATGTATCCCAGGTG  4308    125  4183       0
10 GTGCTTCAGGCATTGG  4303    137  4166       0
# … with 1,013 more rows
$B2
# A tibble: 905 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GGCTGGTGTCCCTACT 15818   1118 14699       1
 2 CATGCCTAGTGCTGCC 12752   1006 11739       7
 3 CACACAAAGGCGATAC  8024    669  7354       1
 4 GAAACTCAGTTCCACA  7246    258  6988       0
 5 GGAATAAGTCAGGACA  6611    508  6101       2
 6 AAGCCGCGTGCTCTTC  6047    453  5592       2
 7 CATCGAAAGTCAATAG  4913    379  4530       4
 8 TGGCCAGTCATGTCTT  4521    186  4335       0
 9 GTGCTTCAGGCATTGG  4036    120  3916       0
10 GCATGTATCCCAGGTG  3977    130  3847       0
# … with 895 more rows
$C1
# A tibble: 848 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  7983   1100  6880       3
 2 GGCTGGTGTCCCTACT  6334    852  5481       1
 3 TCGTACCGTCCATGAT  5530    718  4810       2
 4 CATGCCTAGTGCTGCC  5396    776  4619       1
 5 TGCACCTTCTGGCGAC  4715    609  4105       1
 6 CACACAAAGGCGATAC  3351    516  2835       0
 7 GTGCAGCCAAGCGAGT  2941    373  2568       0
 8 GGAATAAGTCAGGACA  2852    432  2419       1
 9 CTAGCCTTCCGCGCAA  2597    344  2252       1
10 AAGCCGCGTGCTCTTC  2485    337  2148       0
# … with 838 more rows
$C2
# A tibble: 1,472 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  8207   1285  6916       6
 2 GGCTGGTGTCCCTACT  6761    978  5783       0
 3 TCGTACCGTCCATGAT  5509    838  4668       3
 4 CATGCCTAGTGCTGCC  5478    826  4647       5
 5 TGCACCTTCTGGCGAC  4786    742  4040       4
 6 CACACAAAGGCGATAC  3515    515  3000       0
 7 GTGCAGCCAAGCGAGT  2849    391  2458       0
 8 GGAATAAGTCAGGACA  2877    474  2403       0
 9 AAGCCGCGTGCTCTTC  2588    399  2189       0
10 CTAGCCTTCCGCGCAA  2608    441  2164       3
# … with 1,462 more rows
$D1
# A tibble: 725 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  7496   1036  6460       0
 2 GGCTGGTGTCCCTACT  5953    786  5164       3
 3 TCGTACCGTCCATGAT  5054    699  4354       1
 4 CATGCCTAGTGCTGCC  5060    735  4325       0
 5 TGCACCTTCTGGCGAC  4437    639  3797       1
 6 CACACAAAGGCGATAC  3188    500  2688       0
 7 GTGCAGCCAAGCGAGT  2707    337  2370       0
 8 GGAATAAGTCAGGACA  2657    390  2267       0
 9 AAGCCGCGTGCTCTTC  2311    315  1995       1
10 CTAGCCTTCCGCGCAA  2344    352  1992       0
# … with 715 more rows
$D2
# A tibble: 194 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 CTCACACGTCTCTCGT  8986   1146  7832       8
 2 GGCTGGTGTCCCTACT  7238    911  6326       1
 3 TCGTACCGTCCATGAT  6015    759  5253       3
 4 CATGCCTAGTGCTGCC  6039    842  5194       3
 5 TGCACCTTCTGGCGAC  5328    728  4595       5
 6 CACACAAAGGCGATAC  3797    545  3250       2
 7 GTGCAGCCAAGCGAGT  3315    392  2921       2
 8 GGAATAAGTCAGGACA  3158    467  2690       1
 9 CTAGCCTTCCGCGCAA  2964    395  2564       5
10 AAGCCGCGTGCTCTTC  2936    378  2555       3
# … with 184 more rows
data_list$umi_counts_cell %>% 
  map(list("background_cells"))
$A1
# A tibble: 150,609 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   544     78   466       0
 2 TGCGCAGCACGTGAGA   298     38   260       0
 3 CTACACCTCGCCTGAG   307     48   259       0
 4 TGTTCCGAGTACGTAA   293     44   249       0
 5 CACTCCAAGCTAAACA   285     40   245       0
 6 AAAGCAAGTGAACCTT   256     28   228       0
 7 CCATGTCCACGAGGTA   258     31   227       0
 8 CAGATCAGTGCTCTTC   259     33   226       0
 9 AAACGGGAGTGGGCTA   267     44   223       0
10 TGTGGTATCTCCGGTT   249     31   218       0
# … with 150,599 more rows
$A2
# A tibble: 168,114 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   506     67   439       0
 2 TGAAAGACATCCGTGG   372     43   324       5
 3 ACACCAACAATGCCAT   379     64   309       6
 4 ACGGCCACAAGGACAC   343     50   290       3
 5 CCAGCGATCTGTTTGT   333     44   285       4
 6 AACTCAGTCCGCTGTT   350     64   283       3
 7 ATAGACCAGTACGTAA   325     48   273       4
 8 GCACATAAGTACGTAA   325     54   270       1
 9 TTTACTGAGTAGGTGC   298     36   258       4
10 GAGTCCGAGCGATATA   305     46   254       5
# … with 168,104 more rows
$B1
# A tibble: 110,209 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GTGCTTCGTCCGTCAG  2099     80  2019       0
 2 CGAGCACAGTGAAGTT  1941     82  1859       0
 3 TACACGACAGGCGATA  1918     87  1831       0
 4 TCTGAGACACACCGAC  1813     56  1757       0
 5 GCTCCTATCCTAGAAC  1644     60  1584       0
 6 TTTATGCAGTTAAGTG  1571     55  1516       0
 7 ACATGGTGTCGTTGTA  1499     55  1444       0
 8 ACTGTCCCACCGAAAG  1375     57  1318       0
 9 CGTTCTGTCCCACTTG  1368     53  1315       0
10 TGACTTTGTAGAGCTG  1370     63  1307       0
# … with 110,199 more rows
$B2
# A tibble: 110,504 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 TACACGACAGGCGATA  1785     55  1728       2
 2 CGAGCACAGTGAAGTT  1820     98  1722       0
 3 GAGGTGATCCAAGCCG  1719     79  1640       0
 4 TCTGAGACACACCGAC  1683     70  1612       1
 5 ACTGAACTCCCGACTT  1621     55  1565       1
 6 TTTATGCAGTTAAGTG  1605     58  1546       1
 7 GCTCCTATCCTAGAAC  1536     57  1479       0
 8 GGAACTTCATTTGCTT  1544     66  1478       0
 9 ACATGGTGTCGTTGTA  1519     59  1460       0
10 GTTCATTTCTTGAGAC  1435     55  1380       0
# … with 110,494 more rows
$C1
# A tibble: 180,232 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   581     81   500       0
 2 GGAATAATCTGGGCCA   394     55   339       0
 3 TATCTCAGTCGCGAAA   344     41   303       0
 4 TGAAAGACATCCGTGG   344     38   302       4
 5 AACTCAGTCCGCTGTT   342     40   298       4
 6 CCAGCGATCTGTTTGT   377     81   295       1
 7 ACGGCCACAAGGACAC   346     51   292       3
 8 GTGCTTCGTCCGTCAG   334     40   292       2
 9 TCAATCTTCCAGAGGA   334     49   285       0
10 GCACATAAGTACGTAA   320     42   276       2
# … with 180,222 more rows
$C2
# A tibble: 242,496 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   597    100   497       0
 2 AACCATGTCAGCGACC   412     74   338       0
 3 CTGGTCTTCGGTTCGG   354     29   325       0
 4 TGAAAGACATCCGTGG   376     58   313       5
 5 CCACCTAGTCGATTGT   351     44   307       0
 6 ACGGCCACAAGGACAC   439    130   304       5
 7 ATCACGACATGCTAGT   348     53   295       0
 8 TTCGAAGCAAGGTGTG   337     44   293       0
 9 CAGGTGCTCTGAAAGA   330     43   287       0
10 TATCTCAGTCGCGAAA   328     43   285       0
# … with 242,486 more rows
$D1
# A tibble: 131,097 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   513     88   424       1
 2 GATCGTAAGATACACA   401     48   353       0
 3 TTTCCTCCATCACGAT   375     42   328       5
 4 ACTGATGTCAATCACG   419     98   321       0
 5 GGAGCAATCCGCATCT   358     49   308       1
 6 CAGCAGCCAGAGCCAA   347     46   300       1
 7 TTCGAAGCAACAACCT   348     49   299       0
 8 ACGGCCACAAGGACAC   327     38   285       4
 9 GCACATAAGTACGTAA   330     48   278       4
10 ATAGACCAGTACGTAA   321     43   277       1
# … with 131,087 more rows
$D2
# A tibble: 98,206 x 5
   cell                 m rm_ret    pm rm_disc
   <chr>            <int>  <dbl> <int>   <dbl>
 1 GAGGTGATCCAAGCCG   577     74   502       1
 2 AGAATAGTCATTCACT   525     75   450       0
 3 ACTGATGTCAATCACG   554    127   427       0
 4 CTACACCCACGCGAAA   464     69   394       1
 5 CAGCAGCCAGAGCCAA   446     53   393       0
 6 ACACCAACAATGCCAT   440     52   388       0
 7 ACGGCCACAAGGACAC   420     46   373       1
 8 ATAGACCAGTACGTAA   410     42   368       0
 9 CGTAGGCCAACACCTA   415     49   366       0
10 AACTCAGTCCGCTGTT   415     49   365       1
# … with 98,196 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 332.045 sec elapsed
# memory usage
gc()
            used   (Mb) gc trigger    (Mb)   max used    (Mb)
Ncells   6271619  335.0   12858931   686.8   12858931   686.8
Vcells 834233111 6364.7 2727834242 20811.8 3402985819 25962.8
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
 [1] cowplot_0.9.4      data.table_1.12.2  tictoc_1.0        
 [4] furrr_0.1.0        future_1.13.0      broom_0.5.2       
 [7] matrixStats_0.54.0 forcats_0.4.0      stringr_1.4.0     
[10] dplyr_0.8.1        purrr_0.3.2        readr_1.3.1       
[13] tidyr_0.8.3        tibble_2.1.2       ggplot2_3.1.1     
[16] tidyverse_1.2.1    rhdf5_2.26.2       rmarkdown_1.13    
loaded via a namespace (and not attached):
 [1] nlme_3.1-137                bitops_1.0-6               
 [3] lubridate_1.7.4             httr_1.4.0                 
 [5] rprojroot_1.3-2             GenomeInfoDb_1.18.2        
 [7] tools_3.5.2                 backports_1.1.4            
 [9] utf8_1.1.4                  R6_2.4.0                   
[11] HDF5Array_1.10.1            lazyeval_0.2.2             
[13] BiocGenerics_0.28.0         colorspace_1.4-1           
[15] withr_2.1.2                 tidyselect_0.2.5           
[17] compiler_3.5.2              cli_1.1.0                  
[19] rvest_0.3.4                 Biobase_2.42.0             
[21] xml2_1.2.0                  DelayedArray_0.8.0         
[23] labeling_0.3                scales_1.0.0               
[25] digest_0.6.19               XVector_0.22.0             
[27] base64enc_0.1-3             pkgconfig_2.0.2            
[29] htmltools_0.3.6             limma_3.38.3               
[31] rlang_0.3.4                 readxl_1.3.1               
[33] rstudioapi_0.10             generics_0.0.2             
[35] jsonlite_1.6                BiocParallel_1.16.6        
[37] RCurl_1.95-4.12             magrittr_1.5               
[39] GenomeInfoDbData_1.2.0      Matrix_1.2-15              
[41] Rcpp_1.0.1                  munsell_0.5.0              
[43] S4Vectors_0.20.1            Rhdf5lib_1.4.2             
[45] fansi_0.4.0                 stringi_1.4.3              
[47] yaml_2.2.0                  edgeR_3.24.3               
[49] MASS_7.3-51.1               SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0             plyr_1.8.4                 
[53] grid_3.5.2                  parallel_3.5.2             
[55] listenv_0.7.0               crayon_1.3.4               
[57] lattice_0.20-38             haven_2.1.0                
[59] hms_0.4.2                   locfit_1.5-9.1             
[61] zeallot_0.1.0               knitr_1.23                 
[63] pillar_1.4.1                GenomicRanges_1.34.0       
[65] codetools_0.2-16            stats4_3.5.2               
[67] glue_1.3.1                  evaluate_0.14              
[69] modelr_0.1.4                vctrs_0.1.0                
[71] cellranger_1.1.0            gtable_0.3.0               
[73] assertthat_0.2.1            xfun_0.7                   
[75] DropletUtils_1.2.2          viridisLite_0.3.0          
[77] SingleCellExperiment_1.4.1  IRanges_2.16.0             
[79] globals_0.12.4