knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l2
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data",
dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
sprintf("%s_read_counts.rds",
dataset_name))
results_filepath <- file.path(output_dir,
sprintf("%s_results.rds",
dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l2/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l2/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
output_dir,
read_counts_filepath = NULL,
torc = 3,
max_r = NULL) {
tic("Running workflow I")
tic("Step 1: loading molecule_info files and creating read counts datatable")
read_counts <- create_joined_counts(input_dir, read_counts_filepath)
toc()
sample_names <-
setdiff(
colnames(read_counts),
c("cell", "umi", "gene", "outcome")
)
S <- length(sample_names)
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts, sample_names)
toc()
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S,
max_r = max_r
)
toc()
# compute_molecular_complexity_profile
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the tradoff ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
left_join(read_counts %>%
select(outcome, cell, umi, gene, sample_names),
outcome_counts %>%
select(c("outcome", "retain", paste0(sample_names, "_hat"))),
by = c("outcome")
) %>%
select(-outcome)
purge_and_save_read_counts(
read_counts,
dataset_name,
sample_names,
output_dir
)
toc()
tic("Step 7: create umi counts matrices")
umi_counts_cell_gene <-
create_umi_counts(
read_counts,
sample_names
)
toc()
outcome_counts <-
outcome_counts %>%
arrange(-qr) %>%
select(-c(paste0(sample_names, "_hat")))
read_counts <-
read_counts %>%
select(-c("retain", paste0(sample_names, "_hat")))
summary_stats$sample_names <- sample_names
data_list <-
list(
umi_counts_cell_gene = umi_counts_cell_gene,
read_counts = read_counts,
outcome_counts = outcome_counts,
fit_out = fit_out,
summary_stats = summary_stats
)
toc()
return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
tic("Running workflow II")
tic("Step 7: identify RNA-containing cells")
called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
toc()
tic("Step 9: tallying molecules by cell-barcode")
umi_counts_cell <- map2(
called_cells_out$called_cells,
data_list$umi_counts_cell_gene,
get_umi_counts_cell
)
umi_counts_sample <-
map(umi_counts_cell,
map_dfr,
get_umi_counts_sample,
.id = "split"
) %>%
bind_rows(.id = "sample")
data_list$summary_stats <-
update_summary_stats(
data_list$summary_stats,
umi_counts_sample
)
toc()
data_list <-
c(
data_list,
list(
umi_counts_cell = umi_counts_cell,
called_cells_tally = called_cells_out$called_cells_tally
)
)
toc()
return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 499.752 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 61.48 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.18 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 0.49 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 6.194 sec elapsed
Step 6: Purge and save read counts datatable to disk: 724.377 sec elapsed
Step 7: create umi counts matrices: 762.945 sec elapsed
Running workflow I: 2055.481 sec elapsed
data_list$read_counts
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
data_list$summary_stats$summary_estimates
data_list$summary_stats$conditional
data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p,
p_read$legend,
ncol=2,
rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_molcomplexity.pdf")),
p_read,
width=9,
height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
p_fit$legend,
ncol=2,
rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fit.pdf")),
p_fit,
width=9,
height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fp_performance.pdf")),
p_fp,
width=9,
height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_tradeoff.pdf")),
p_tradeoff,
width=9,
height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Step 7: identify RNA-containing cells: 1989.231 sec elapsed
Step 9: tallying molecules by cell-barcode: 322.964 sec elapsed
Running workflow II: 2312.195 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>%
map(list("called_cells"))
$P7_0
# A tibble: 1,160 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 487 49 433 5
2 AGTGAGGTCTGCCCTA 867 433 430 4
3 GAACATCTCCTTGGTC 445 34 409 2
4 CTAACTTAGTTGTAGA 420 27 391 2
5 CGGGTCATCCCAAGTA 425 33 388 4
6 CATGCCTTCAATACCG 782 409 373 0
7 AGACGTTGTCCGAGTC 405 35 370 0
8 CTCGAGGGTTTCGCTC 378 27 351 0
9 GAATAAGCATATGGTC 395 48 337 10
10 TGAGAGGCAACACGCC 351 44 307 0
# … with 1,150 more rows
$P7_1
# A tibble: 835 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 595 51 540 4
2 AGACGTTGTCCGAGTC 482 36 445 1
3 CGGGTCATCCCAAGTA 493 45 444 4
4 CTAACTTAGTTGTAGA 499 53 444 2
5 GAATAAGCATATGGTC 472 55 417 0
6 TGTTCCGAGGCTAGAC 483 67 416 0
7 GTGGGTCCAAACTGTC 464 79 385 0
8 GAACGGAGTTGCGCAC 425 46 379 0
9 TAAACCGTCTCTGTCG 413 36 374 3
10 GTGCTTCGTCCCTACT 446 87 358 1
# … with 825 more rows
$P7_10
# A tibble: 2,464 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 286 21 265 0
2 AGTGAGGTCTGCCCTA 279 24 253 2
3 CTAACTTAGTTGTAGA 272 20 251 1
4 CGGGTCATCCCAAGTA 263 35 227 1
5 AGACGTTGTCCGAGTC 251 29 222 0
6 GAACATCTCCTTGGTC 234 13 221 0
7 CTCGAGGGTTTCGCTC 203 19 183 1
8 CATGCCTTCAATACCG 222 33 179 10
9 TGTTCCGAGGCTAGAC 201 19 179 3
10 GTGCTTCGTCCCTACT 225 51 173 1
# … with 2,454 more rows
$P7_11
# A tibble: 3,514 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 387 36 347 4
2 GAACATCTCCTTGGTC 351 23 327 1
3 CGGGTCATCCCAAGTA 382 51 325 6
4 AGTGAGGTCTGCCCTA 368 57 310 1
5 AGACGTTGTCCGAGTC 309 25 284 0
6 TAAACCGTCTCTGTCG 296 19 272 5
7 AAATGCCTCCCAAGTA 294 25 262 7
8 TGTTCCGAGGCTAGAC 296 31 260 5
9 CTCGAGGGTTTCGCTC 294 31 259 4
10 TAAGAGATCTTGGGTA 279 26 241 12
# … with 3,504 more rows
$P7_12
# A tibble: 3,062 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 384 39 339 6
2 CTAACTTAGTTGTAGA 336 28 303 5
3 AGTGAGGTCTGCCCTA 318 31 280 7
4 AGACGTTGTCCGAGTC 306 27 278 1
5 GAACATCTCCTTGGTC 313 40 273 0
6 CTCGAGGGTTTCGCTC 281 31 248 2
7 GAATAAGCATATGGTC 295 30 247 18
8 AAATGCCTCCCAAGTA 264 31 230 3
9 GGACATTTCGTAGGAG 253 23 226 4
10 TGTTCCGAGGCTAGAC 249 21 226 2
# … with 3,052 more rows
$P7_13
# A tibble: 3,416 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CGGGTCATCCCAAGTA 343 45 296 2
2 AGTGAGGTCTGCCCTA 325 31 292 2
3 CTAACTTAGTTGTAGA 317 33 283 1
4 GAACATCTCCTTGGTC 305 22 282 1
5 AGGGATGGTCTAACGT 307 33 271 3
6 AGACGTTGTCCGAGTC 293 31 261 1
7 GAATAAGCATATGGTC 287 36 241 10
8 CTCGAGGGTTTCGCTC 264 23 240 1
9 TAAACCGTCTCTGTCG 235 18 215 2
10 GGACATTTCGTAGGAG 249 32 213 4
# … with 3,406 more rows
$P7_14
# A tibble: 2,630 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 380 36 341 3
2 CTAACTTAGTTGTAGA 307 17 288 2
3 AGACGTTGTCCGAGTC 319 35 283 1
4 AGTGAGGTCTGCCCTA 307 32 271 4
5 GAACATCTCCTTGGTC 284 33 251 0
6 CTCGAGGGTTTCGCTC 265 22 242 1
7 TAAACCGTCTCTGTCG 260 18 241 1
8 GAACATCTCAGAGACG 276 38 238 0
9 GTGCTTCGTCCCTACT 299 61 234 4
10 GATCTAGTCGAGAACG 247 21 225 1
# … with 2,620 more rows
$P7_15
# A tibble: 2,765 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 290 26 261 3
2 CTAACTTAGTTGTAGA 248 18 227 3
3 CGGGTCATCCCAAGTA 242 24 217 1
4 AGACGTTGTCCGAGTC 229 17 211 1
5 AGTGAGGTCTGCCCTA 226 22 204 0
6 GAACATCTCCTTGGTC 220 19 201 0
7 CTCGAGGGTTTCGCTC 206 16 189 1
8 GTGCTTCGTCCCTACT 211 43 167 1
9 CTCGTCATCAGAGGTG 5116 5030 78 8
10 CATATGGGTTGCGTTA 1665 1597 66 2
# … with 2,755 more rows
$P7_2
# A tibble: 4,058 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 341 44 297 0
2 AGTGAGGTCTGCCCTA 308 29 275 4
3 CTAACTTAGTTGTAGA 293 26 267 0
4 GAACATCTCCTTGGTC 280 20 259 1
5 CGGGTCATCCCAAGTA 287 35 251 1
6 CTCGAGGGTTTCGCTC 261 29 232 0
7 AGACGTTGTCCGAGTC 262 41 221 0
8 TAAACCGTCTCTGTCG 239 26 213 0
9 TGTTCCGAGGCTAGAC 241 30 208 3
10 AAATGCCTCCCAAGTA 232 28 204 0
# … with 4,048 more rows
$P7_3
# A tibble: 4,006 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 373 82 290 1
2 CTAACTTAGTTGTAGA 311 43 267 1
3 AGTGAGGTCTGCCCTA 292 32 258 2
4 AGACGTTGTCCGAGTC 273 26 247 0
5 TGTTCCGAGGCTAGAC 278 34 244 0
6 CGGGTCATCCCAAGTA 281 41 239 1
7 CTCGAGGGTTTCGCTC 266 25 238 3
8 GAACATCTCCTTGGTC 269 28 237 4
9 AAATGCCTCCCAAGTA 252 27 220 5
10 TAAACCGTCTCTGTCG 246 27 218 1
# … with 3,996 more rows
$P7_4
# A tibble: 999 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 727 147 578 2
2 GTGCTTCGTCCCTACT 583 75 506 2
3 AGGGATGGTCTAACGT 402 32 368 2
4 TGGGAAGCATTCGACA 343 17 326 0
5 AGGCCGTGTGCGATAG 339 18 321 0
6 CGGGTCATCCCAAGTA 326 29 294 3
7 CTAACTTAGTTGTAGA 304 17 287 0
8 AGTGAGGTCTGCCCTA 302 25 277 0
9 AGACGTTGTCCGAGTC 306 28 276 2
10 CAGATCAAGACAGAGA 298 29 269 0
# … with 989 more rows
$P7_5
# A tibble: 2,041 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 307 34 272 1
2 AGACGTTGTCCGAGTC 276 29 247 0
3 AGTGAGGTCTGCCCTA 12731 12472 219 40
4 CTAACTTAGTTGTAGA 242 26 215 1
5 CGGGTCATCCCAAGTA 234 27 207 0
6 GAACATCTCCTTGGTC 224 23 201 0
7 CTCGAGGGTTTCGCTC 219 19 200 0
8 AAATGCCTCCCAAGTA 217 18 196 3
9 CATGCCTTCAATACCG 221 29 192 0
10 TGTTCCGAGGCTAGAC 210 19 190 1
# … with 2,031 more rows
$P7_6
# A tibble: 7,280 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 260 23 234 3
2 AGGGATGGTCTAACGT 243 27 215 1
3 CTAACTTAGTTGTAGA 234 19 214 1
4 GAACATCTCCTTGGTC 227 18 208 1
5 CTCGAGGGTTTCGCTC 217 18 198 1
6 CGGGTCATCCCAAGTA 220 24 195 1
7 AAATGCCTCCCAAGTA 201 21 177 3
8 GTGCTTCGTCCCTACT 212 48 157 7
9 TAAGAGATCTTGGGTA 5407 5268 116 23
10 CTAACTTCATCGATGT 3752 3619 103 30
# … with 7,270 more rows
$P7_7
# A tibble: 743 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 285 29 256 0
2 GAACATCTCCTTGGTC 235 9 225 1
3 AGTGAGGTCTGCCCTA 237 20 216 1
4 CTCGAGGGTTTCGCTC 241 23 216 2
5 CTAACTTAGTTGTAGA 232 22 208 2
6 AGACGTTGTCCGAGTC 222 19 202 1
7 TGTTCCGAGGCTAGAC 208 15 193 0
8 GGACATTTCGTAGGAG 213 18 189 6
9 CGGGTCATCCCAAGTA 211 23 186 2
10 GAACATCTCAGAGACG 201 27 174 0
# … with 733 more rows
$P7_8
# A tibble: 1,209 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 495 38 457 0
2 GAACATCTCCTTGGTC 425 29 396 0
3 GAACATCTCAGAGACG 443 50 393 0
4 GAATAAGCATATGGTC 421 40 381 0
5 CTCGAGGGTTTCGCTC 408 29 379 0
6 TGTTCCGAGGCTAGAC 400 28 372 0
7 TAAGAGATCTTGGGTA 403 39 364 0
8 GAACGGAGTTGCGCAC 398 38 360 0
9 CATGCCTTCAATACCG 398 43 355 0
10 GGCTGGTGTGTCGCTG 365 27 338 0
# … with 1,199 more rows
$P7_9
# A tibble: 2,400 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 446 39 406 1
2 CTAACTTAGTTGTAGA 420 37 381 2
3 AGTGAGGTCTGCCCTA 411 31 378 2
4 AGACGTTGTCCGAGTC 375 24 348 3
5 CGGGTCATCCCAAGTA 380 35 343 2
6 GAACATCTCCTTGGTC 365 25 337 3
7 GAATAAGCATATGGTC 372 38 334 0
8 CTCGAGGGTTTCGCTC 346 27 318 1
9 CATGCCTTCAATACCG 340 36 304 0
10 GGCTGGTGTGTCGCTG 322 17 303 2
# … with 2,390 more rows
data_list$umi_counts_cell %>%
map(list("background_cells"))
$P7_0
# A tibble: 309,061 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 429 21 403 5
2 GCTGCTTGTCCGAAGA 555 220 317 18
3 AAATGCCTCCCAAGTA 370 45 315 10
4 GCAAACTAGCTTATCG 252 20 228 4
5 AACTCAGTCTAACGGT 249 24 222 3
6 TTAACTCCAATGGTCT 234 15 216 3
7 CTTAACTCACTACAGT 220 18 201 1
8 ACGCCGAGTCTACCTC 216 23 191 2
9 ACACCAACATAAGACA 208 18 189 1
10 ACATACGGTAATCGTC 210 20 189 1
# … with 309,051 more rows
$P7_1
# A tibble: 306,030 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 580 162 418 0
2 GAACATCTCAGAGACG 439 63 376 0
3 CAGATCAAGACAGAGA 261 60 201 0
4 CCTAGCTGTTCTGGTA 199 11 186 2
5 TTAGTTCGTTCAGCGC 198 11 186 1
6 GTTTCTACAGTAAGAT 194 7 185 2
7 CTCTAATAGGACATTA 197 14 183 0
8 TCTTTCCCAGACAGGT 195 10 183 2
9 ATCATCTAGTTACGGG 193 11 182 0
10 TAGACCACAAACGCGA 197 15 182 0
# … with 306,020 more rows
$P7_10
# A tibble: 418,600 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GAATAAGCATATGGTC 266 28 230 8
2 CACACCTTCAACGCTA 224 16 207 1
3 GCTGCTTGTCCGAAGA 260 71 186 3
4 GAACGGAGTTGCGCAC 215 28 178 9
5 CGGACACAGCGCTTAT 186 14 172 0
6 TCAGGTACATAACCTG 186 15 171 0
7 TGAGAGGCAACACGCC 180 15 165 0
8 TAAACCGTCTCTGTCG 182 19 163 0
9 AAATGCCTCCCAAGTA 190 29 159 2
10 ACTTACTCAGCTCGAC 176 17 159 0
# … with 418,590 more rows
$P7_11
# A tibble: 325,495 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CTAACTTAGTTGTAGA 345 23 321 1
2 CACACCTTCAACGCTA 317 18 297 2
3 GCTGCTTGTCCGAAGA 374 90 273 11
4 ACTTACTCAGCTCGAC 279 17 262 0
5 GAACGGAGTTGCGCAC 304 37 258 9
6 GAATAAGCATATGGTC 294 44 242 8
7 CGGACACAGCGCTTAT 257 26 230 1
8 CATCCACAGATGGCGT 247 29 215 3
9 AAACGGGAGGATATAC 229 16 210 3
10 GTGCTTCGTCCCTACT 276 60 208 8
# … with 325,485 more rows
$P7_12
# A tibble: 352,841 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 314 30 278 6
2 CGGGTCATCCCAAGTA 298 31 265 2
3 GCTGCTTGTCCGAAGA 347 98 238 11
4 GAACGGAGTTGCGCAC 264 30 223 11
5 CATCCACAGATGGCGT 242 20 221 1
6 CGGACACAGCGCTTAT 222 10 211 1
7 GGATGTTAGGGAACGG 223 20 200 3
8 AAACGGGAGGATATAC 215 16 198 1
9 GCGGGTTTCAGGATCT 199 15 183 1
10 GTATTCTTCACCATAG 190 7 183 0
# … with 352,831 more rows
$P7_13
# A tibble: 360,836 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 251 20 226 5
2 GAACGGAGTTGCGCAC 261 37 209 15
3 CATCCACAGATGGCGT 243 31 208 4
4 GCTGCTTGTCCGAAGA 302 90 205 7
5 ACTTACTCAGCTCGAC 222 27 194 1
6 AAACGGGAGGATATAC 205 13 191 1
7 GGATGTTAGGGAACGG 216 25 188 3
8 GCTGCGACAGTAAGCG 200 19 181 0
9 CCTACCAAGGTAAACT 195 14 178 3
10 CCTACCACATCGGTTA 207 29 177 1
# … with 360,826 more rows
$P7_14
# A tibble: 310,691 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 314 17 295 2
2 CGGGTCATCCCAAGTA 292 26 262 4
3 GAATAAGCATATGGTC 281 35 245 1
4 GCTGCTTGTCCGAAGA 331 82 237 12
5 GAACGGAGTTGCGCAC 252 36 216 0
6 CGGACACAGCGCTTAT 209 18 191 0
7 AAACGGGAGGATATAC 216 24 190 2
8 GCTGCGACAGTAAGCG 200 17 183 0
9 CGACTTCAGACCTAGG 199 16 181 2
10 GTAGGCCAGCCGATTT 203 22 181 0
# … with 310,681 more rows
$P7_15
# A tibble: 298,328 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 197 17 178 2
2 GCTGCGACAGTAAGCG 179 8 171 0
3 TAAACCGTCTCTGTCG 189 18 170 1
4 GAACATCTCAGAGACG 197 25 169 3
5 GATCTAGTCGAGAACG 188 19 167 2
6 ACTTACTCAGCTCGAC 179 13 166 0
7 TCTATTGCATAAAGGT 175 9 166 0
8 TGAGAGGCAACACGCC 178 12 166 0
9 GAATAAGCATATGGTC 193 23 165 5
10 GCAAACTTCGACAGCC 180 15 164 1
# … with 298,318 more rows
$P7_2
# A tibble: 361,126 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 270 23 239 8
2 GCTGCTTGTCCGAAGA 309 75 228 6
3 GAATAAGCATATGGTC 251 31 206 14
4 ACTTACTCAGCTCGAC 246 41 205 0
5 CATCCACAGATGGCGT 237 37 200 0
6 GAACGGAGTTGCGCAC 232 26 197 9
7 GCTGCGACAGTAAGCG 195 16 179 0
8 CGACTTCAGACCTAGG 193 18 174 1
9 CGGACACAGCGCTTAT 196 22 172 2
10 GCGCCAAAGGCTATCT 190 18 172 0
# … with 361,116 more rows
$P7_3
# A tibble: 381,119 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 325 81 233 11
2 CACACCTTCAACGCTA 255 19 231 5
3 GAATAAGCATATGGTC 264 33 221 10
4 GAACGGAGTTGCGCAC 247 34 205 8
5 CATCCACAGATGGCGT 219 22 196 1
6 CGGACACAGCGCTTAT 217 22 195 0
7 TCAGGTACATAACCTG 195 20 175 0
8 GCTGCGACAGTAAGCG 188 17 171 0
9 CGACTTCAGACCTAGG 193 22 170 1
10 GATCTAGTCGAGAACG 210 39 170 1
# … with 381,109 more rows
$P7_4
# A tibble: 315,858 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 303 18 285 0
2 CGTCCATGTGTTCGAT 199 10 189 0
3 AGATTGCAGAGCAATT 200 16 184 0
4 CGATGTACATATGCTG 199 14 184 1
5 AAATGCCGTGAACCTT 199 15 182 2
6 AACGTTGTCAACACAC 195 14 181 0
7 GCTGCGACAGTAAGCG 194 12 180 2
8 TAAGCGTTCAGAGACG 194 14 180 0
9 TCGAGGCTCCCTCTTT 193 10 180 3
10 ACACCAACATAAGACA 198 20 178 0
# … with 315,848 more rows
$P7_5
# A tibble: 364,537 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 288 48 239 1
2 TAAACCGTCTCTGTCG 190 16 174 0
3 GAACATCTCAGAGACG 199 31 166 2
4 ACTTACTCAGCTCGAC 178 13 165 0
5 GGCTGGTGTGTCGCTG 178 15 161 2
6 TAAACCGCATGTAGTC 181 21 158 2
7 GTGGGTCCAAACTGTC 190 34 156 0
8 GCTGCGACAGTAAGCG 166 11 155 0
9 CCGTACTGTCAGATAA 175 19 154 2
10 CATCCACAGATGGCGT 164 10 151 3
# … with 364,527 more rows
$P7_6
# A tibble: 351,694 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 327 86 231 10
2 CACACCTTCAACGCTA 224 22 196 6
3 TAAACCGTCTCTGTCG 188 15 173 0
4 TGTTCCGAGGCTAGAC 200 23 173 4
5 GGACATTTCGTAGGAG 180 9 168 3
6 ACTTACTCAGCTCGAC 183 17 165 1
7 AGACGTTGTCCGAGTC 185 21 164 0
8 GGCTGGTGTGTCGCTG 172 9 163 0
9 CGGACACAGCGCTTAT 199 42 157 0
10 CGACTTCAGACCTAGG 169 14 154 1
# … with 351,684 more rows
$P7_7
# A tibble: 275,155 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 238 16 215 7
2 GCTGCTTGTCCGAAGA 272 65 200 7
3 ACTTACTCAGCTCGAC 206 14 191 1
4 GTGCTTCGTCCCTACT 245 50 187 8
5 TGAGAGGCAACACGCC 194 6 186 2
6 CATCCACAGATGGCGT 197 14 182 1
7 TAAACCGTCTCTGTCG 194 13 180 1
8 TCTATTGCATAAAGGT 195 15 179 1
9 CATGCCTTCAATACCG 199 29 170 0
10 TAAACCGCATGTAGTC 197 26 167 4
# … with 275,145 more rows
$P7_8
# A tibble: 272,056 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 478 33 445 0
2 AAATGCCTCCCAAGTA 418 48 370 0
3 GCTGCTTGTCCGAAGA 474 152 322 0
4 ACGCCGAGTCTACCTC 253 19 234 0
5 ACACCAACATAAGACA 251 22 229 0
6 GCAAACTAGCTTATCG 247 23 224 0
7 TTAACTCCAATGGTCT 249 29 220 0
8 GATCGTATCGGCGCAT 249 43 206 0
9 ACGCCGAAGATCCGAG 203 12 191 0
10 ATTCTACCAGGACGTA 199 9 190 0
# … with 272,046 more rows
$P7_9
# A tibble: 316,062 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 397 32 362 3
2 ACTTACTCAGCTCGAC 321 26 295 0
3 GTAGGCCAGCCGATTT 284 33 251 0
4 CATCCACAGATGGCGT 271 22 248 1
5 AAATGCCTCCCAAGTA 275 29 244 2
6 CGGACACAGCGCTTAT 274 28 244 2
7 CGGAGTCCATGAGCGA 248 12 236 0
8 AAACGGGAGGATATAC 269 43 225 1
9 CGATGTACATATGCTG 245 22 223 0
10 GCAAACTTCGACAGCC 243 18 223 2
# … with 316,052 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 3582.473 sec elapsed
# memory usage
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6381580 340.9 9589840 512.2 9589840 512.2
Vcells 8960661518 68364.5 31123741425 237455.4 32387901149 247100.1
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.9.4 data.table_1.12.2 tictoc_1.0
[4] furrr_0.1.0 future_1.13.0 broom_0.5.2
[7] matrixStats_0.54.0 forcats_0.4.0 stringr_1.4.0
[10] dplyr_0.8.1 purrr_0.3.2 readr_1.3.1
[13] tidyr_0.8.3 tibble_2.1.2 ggplot2_3.1.1
[16] tidyverse_1.2.1 rhdf5_2.26.2 rmarkdown_1.13
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.4
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.2
[13] BiocGenerics_0.28.0 colorspace_1.4-1
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.1.0
[19] rvest_0.3.4 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.19 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.4 readxl_1.3.1
[33] rstudioapi_0.10 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.1 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.4.3
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] zeallot_0.1.0 knitr_1.23
[63] pillar_1.4.1 GenomicRanges_1.34.0
[65] codetools_0.2-16 stats4_3.5.2
[67] glue_1.3.1 evaluate_0.14
[69] modelr_0.1.4 vctrs_0.1.0
[71] cellranger_1.1.0 gtable_0.3.0
[73] assertthat_0.2.1 xfun_0.7
[75] DropletUtils_1.2.2 viridisLite_0.3.0
[77] SingleCellExperiment_1.4.1 IRanges_2.16.0
[79] globals_0.12.4