knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
dataset_name <- commandArgs(trailingOnly=T)[1]
#dataset_name <-"hiseq2500"
message(sprintf("Dataset name: %s", dataset_name))
Dataset name: novaseq_l1
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /project/6007998/rfarouni/index_hopping
Each sample’s molecule_info.h5 file should be renamed to {sample_name}.h5 and placed in ../project_dir/data/{dataset_name}/input/. The purged UMI count matrices and other output files are saved to ../project_dir/data/{dataset_name}/output/.
code_dir <- file.path(project_dir, "code")
data_dir <- file.path(project_dir, "data",
dataset_name)
input_dir <- file.path(data_dir, "input")
output_dir <- file.path(data_dir, "output")
figures_dir <- file.path(output_dir, "figures")
read_counts_filepath <- file.path(output_dir,
sprintf("%s_read_counts.rds",
dataset_name))
results_filepath <- file.path(output_dir,
sprintf("%s_results.rds",
dataset_name))
Create directories if they don’t exist.
dir.create(output_dir)
Warning in dir.create(output_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l1/output' already exists
dir.create(figures_dir)
Warning in dir.create(figures_dir): '/project/6007998/rfarouni/
index_hopping/data/novaseq_l1/output/figures' already exists
Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.
torc <- 3
library(rhdf5)
#library(DropletUtils) # install but not load
library(tidyverse)
library(matrixStats)
library(broom)
library(furrr)
library(tictoc)
library(data.table)
library(cowplot)
plan(multiprocess)
source(file.path(code_dir, "1_create_joined_counts_table.R"))
source(file.path(code_dir, "2_create_counts_by_outcome_table.R"))
source(file.path(code_dir, "3_estimate_sample_index_hopping_rate.R"))
source(file.path(code_dir, "4_compute_summary_statistics.R"))
source(file.path(code_dir, "5_reassign_hopped_reads.R"))
source(file.path(code_dir, "6_purge_phantom_molecules.R"))
source(file.path(code_dir, "7_call_cells.R"))
source(file.path(code_dir, "8_summarize_purge.R"))
source(file.path(code_dir, "9_plotting_functions.R"))
purge_phantoms <- function(input_dir,
output_dir,
read_counts_filepath = NULL,
torc = 3,
max_r = NULL) {
tic("Running workflow I")
tic("Step 1: loading molecule_info files and creating read counts datatable")
read_counts <- create_joined_counts(input_dir, read_counts_filepath)
toc()
sample_names <-
setdiff(
colnames(read_counts),
c("cell", "umi", "gene", "outcome")
)
S <- length(sample_names)
tic("Step 2: creating outcome counts datatable with grouping vars")
outcome_counts <- create_outcome_counts(read_counts, sample_names)
toc()
tic("Step 3: creating a chimera counts datatable and estimating hopping rate")
fit_out <-
estimate_hopping_rate(
outcome_counts,
S,
max_r = max_r
)
toc()
# compute_molecular_complexity_profile
tic("Step 4: compute molecular complexity profile and other summary statistics")
summary_stats <-
compute_summary_stats(
outcome_counts,
fit_out$glm_estimates$phat,
sample_names
)
toc()
tic("Step 5: reassign read counts, determine cutoff, and mark retained observations")
outcome_counts <-
reassign_reads_and_mark_retained_observations(
outcome_counts,
summary_stats,
sample_names,
fit_out,
torc
)
# get the tradoff ratio cutoff
summary_stats <- get_threshold(outcome_counts, summary_stats)
toc()
tic("Step 6: Purge and save read counts datatable to disk")
read_counts <-
left_join(read_counts %>%
select(outcome, cell, umi, gene, sample_names),
outcome_counts %>%
select(c("outcome", "retain", paste0(sample_names, "_hat"))),
by = c("outcome")
) %>%
select(-outcome)
purge_and_save_read_counts(
read_counts,
dataset_name,
sample_names,
output_dir
)
toc()
tic("Step 7: create umi counts matrices")
umi_counts_cell_gene <-
create_umi_counts(
read_counts,
sample_names
)
toc()
outcome_counts <-
outcome_counts %>%
arrange(-qr) %>%
select(-c(paste0(sample_names, "_hat")))
read_counts <-
read_counts %>%
select(-c("retain", paste0(sample_names, "_hat")))
summary_stats$sample_names <- sample_names
data_list <-
list(
umi_counts_cell_gene = umi_counts_cell_gene,
read_counts = read_counts,
outcome_counts = outcome_counts,
fit_out = fit_out,
summary_stats = summary_stats
)
toc()
return(data_list)
}
identify_rna_cells <- function(data_list, output_dir) {
tic("Running workflow II")
tic("Step 7: identify RNA-containing cells")
called_cells_out <- call_cells_all_samples(data_list$umi_counts_cell_gene, output_dir)
toc()
tic("Step 9: tallying molecules by cell-barcode")
umi_counts_cell <- map2(
called_cells_out$called_cells,
data_list$umi_counts_cell_gene,
get_umi_counts_cell
)
umi_counts_sample <-
map(umi_counts_cell,
map_dfr,
get_umi_counts_sample,
.id = "split"
) %>%
bind_rows(.id = "sample")
data_list$summary_stats <-
update_summary_stats(
data_list$summary_stats,
umi_counts_sample
)
toc()
data_list <-
c(
data_list,
list(
umi_counts_cell = umi_counts_cell,
called_cells_tally = called_cells_out$called_cells_tally
)
)
toc()
return(data_list)
}
Estimate the sample index hopping probability, infer the true sample of origin, and reassign reads.
data_list <- purge_phantoms(input_dir, output_dir, read_counts_filepath, torc=torc)
Step 1: loading molecule_info files and creating read counts datatable: 498.264 sec elapsed
Step 2: creating outcome counts datatable with grouping vars: 60.014 sec elapsed
Step 3: creating a chimera counts datatable and estimating hopping rate: 0.138 sec elapsed
Step 4: compute molecular complexity profile and other summary statistics: 0.522 sec elapsed
Step 5: reassign read counts, determine cutoff, and mark retained observations: 6.362 sec elapsed
Step 6: Purge and save read counts datatable to disk: 722.83 sec elapsed
Step 7: create umi counts matrices: 765.427 sec elapsed
Running workflow I: 2053.622 sec elapsed
data_list$read_counts
The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.
data_list$outcome_counts
p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).
data_list$summary_stats$summary_estimates
data_list$summary_stats$conditional
data_list$summary_stats$pi_r_hat
Plot
p_read <- plot_molecules_distributions(data_list, dataset_name, x_lim=250)
p_read <- plot_grid(p_read$p,
p_read$legend,
ncol=2,
rel_widths=c(1, 0.1))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_molcomplexity.pdf")),
p_read,
width=9,
height=6)
p_read
data_list$fit_out$glm_estimates
p_fit <- plot_fit(data_list, dataset_name, x_lim=250)
p_fit <-plot_grid(p_fit$p,
p_fit$legend,
ncol=2,
rel_widths=c(1, 0.2))
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fit.pdf")),
p_fit,
width=9,
height=6)
p_fit
data_list$summary_stats$cutoff_dt
The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.
p_fp <- plot_fp_reduction(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_fp_performance.pdf")),
p_fp,
width=9,
height=6)
p_fp
p_tradeoff <- plot_fp_tradoff(data_list, dataset_name)
ggsave(file.path(figures_dir,
paste0(dataset_name, "_tradeoff.pdf")),
p_tradeoff,
width=9,
height=6)
p_tradeoff
data_list <- identify_rna_cells(data_list, output_dir)
Step 7: identify RNA-containing cells: 1985.566 sec elapsed
Step 9: tallying molecules by cell-barcode: 318.323 sec elapsed
Running workflow II: 2303.89 sec elapsed
Here we examine the extent of the effects of index hopping on individual samples and then on cell-barcodes.
Here m is the number of total molecules in millions; rm_ret is the number of predicted real molecules and prm_ret is the proportion; rm_disc is the number of discarded real molecules; and pm is the number of predicted phantom molecules.
data_list$summary_stats$marginal
The called cells were determined from the unpurged data in order to show the level of contamination by phantom molecules if data were not purged.
data_list$summary_stats$marginal_called_cells
The rows corresponding to consensus_background and consensus_cell refer to the number of barcodes that were categorized as background cells or rna-containing cells, respectively, no matter whether the data was purged or not. In contrast, transition_background and transition_cell refer to the number of barcodes that were recatgorized as background and cell, respectively. phantom_background and phantom_cell are phantom cells that disappear once phantom molecules are purged.
data_list$called_cells_tally
data_list$umi_counts_cell %>%
map(list("called_cells"))
$P7_0
# A tibble: 1,315 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 475 40 430 5
2 AGTGAGGTCTGCCCTA 850 432 417 1
3 CTAACTTAGTTGTAGA 433 34 399 0
4 CATGCCTTCAATACCG 785 392 391 2
5 CTCGAGGGTTTCGCTC 396 27 368 1
6 AGACGTTGTCCGAGTC 403 36 365 2
7 GAACATCTCCTTGGTC 369 32 336 1
8 CGGGTCATCCCAAGTA 369 39 329 1
9 GTCGTAAGTTAAGATG 367 42 325 0
10 AGCATACCATCATCCC 357 37 320 0
# … with 1,305 more rows
$P7_1
# A tibble: 774 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 555 51 500 4
2 AGACGTTGTCCGAGTC 518 42 471 5
3 CTAACTTAGTTGTAGA 492 48 443 1
4 CGGGTCATCCCAAGTA 486 53 432 1
5 GAATAAGCATATGGTC 464 48 416 0
6 TGTTCCGAGGCTAGAC 477 70 407 0
7 GTGGGTCCAAACTGTC 470 87 382 1
8 CATGCCTTCAATACCG 434 57 377 0
9 TAAGAGATCTTGGGTA 480 114 366 0
10 GAACATCTCAGAGACG 424 59 365 0
# … with 764 more rows
$P7_10
# A tibble: 2,460 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 293 39 254 0
2 CTAACTTAGTTGTAGA 255 21 234 0
3 GAACATCTCCTTGGTC 227 13 214 0
4 AGACGTTGTCCGAGTC 234 25 209 0
5 CGGGTCATCCCAAGTA 224 15 209 0
6 AGTGAGGTCTGCCCTA 218 21 196 1
7 CTCGAGGGTTTCGCTC 214 19 195 0
8 GGACATTTCGTAGGAG 207 17 189 1
9 GTGCTTCGTCCCTACT 205 40 163 2
10 ACATACGCAGCATGAG 12457 12321 103 33
# … with 2,450 more rows
$P7_11
# A tibble: 3,506 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 394 55 338 1
2 AGGGATGGTCTAACGT 369 30 334 5
3 CGGGTCATCCCAAGTA 373 62 310 1
4 CTCGAGGGTTTCGCTC 323 20 302 1
5 AGACGTTGTCCGAGTC 304 27 274 3
6 GAACATCTCCTTGGTC 297 22 272 3
7 CATGCCTTCAATACCG 316 45 270 1
8 TGTTCCGAGGCTAGAC 284 23 258 3
9 GGACATTTCGTAGGAG 271 18 247 6
10 GAACATCTCAGAGACG 268 32 232 4
# … with 3,496 more rows
$P7_12
# A tibble: 3,055 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 368 40 322 6
2 CTAACTTAGTTGTAGA 339 26 313 0
3 AGTGAGGTCTGCCCTA 331 29 297 5
4 AGACGTTGTCCGAGTC 303 37 261 5
5 TGTTCCGAGGCTAGAC 273 21 251 1
6 GAACATCTCAGAGACG 315 60 250 5
7 CTCGAGGGTTTCGCTC 271 24 246 1
8 GAACATCTCCTTGGTC 275 32 242 1
9 GGCTGGTGTGTCGCTG 258 30 225 3
10 TAAACCGTCTCTGTCG 252 25 224 3
# … with 3,045 more rows
$P7_13
# A tibble: 3,409 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 381 42 336 3
2 CTAACTTAGTTGTAGA 312 22 286 4
3 AGTGAGGTCTGCCCTA 308 18 285 5
4 AGACGTTGTCCGAGTC 290 31 255 4
5 GAACATCTCCTTGGTC 276 22 251 3
6 CGGGTCATCCCAAGTA 271 19 246 6
7 CTCGAGGGTTTCGCTC 260 25 234 1
8 GAACATCTCAGAGACG 272 43 223 6
9 TGTTCCGAGGCTAGAC 257 36 218 3
10 GAATAAGCATATGGTC 249 22 216 11
# … with 3,399 more rows
$P7_14
# A tibble: 2,629 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 371 35 334 2
2 AGACGTTGTCCGAGTC 320 38 282 0
3 CTAACTTAGTTGTAGA 311 28 282 1
4 CGGGTCATCCCAAGTA 315 31 281 3
5 AGTGAGGTCTGCCCTA 301 20 279 2
6 GAACATCTCCTTGGTC 293 16 277 0
7 CTCGAGGGTTTCGCTC 268 17 249 2
8 GAATAAGCATATGGTC 273 43 230 0
9 GAACATCTCAGAGACG 261 37 224 0
10 TGAGAGGCAACACGCC 239 17 221 1
# … with 2,619 more rows
$P7_15
# A tibble: 2,761 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 322 28 289 5
2 CTAACTTAGTTGTAGA 242 17 221 4
3 CGGGTCATCCCAAGTA 234 23 209 2
4 AGTGAGGTCTGCCCTA 223 19 204 0
5 AGACGTTGTCCGAGTC 218 18 200 0
6 GAATAAGCATATGGTC 227 24 192 11
7 TGAGAGGCAACACGCC 204 14 190 0
8 GCGCCAAAGGCTATCT 226 80 146 0
9 CATATGGGTTGCGTTA 1696 1606 89 1
10 CTCGTCATCAGAGGTG 5133 5047 81 5
# … with 2,751 more rows
$P7_2
# A tibble: 4,058 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 347 38 309 0
2 AGTGAGGTCTGCCCTA 317 26 278 13
3 CGGGTCATCCCAAGTA 282 30 251 1
4 GAACATCTCCTTGGTC 268 19 247 2
5 CTAACTTAGTTGTAGA 266 24 241 1
6 CTCGAGGGTTTCGCTC 275 36 238 1
7 GAACATCTCAGAGACG 288 46 238 4
8 CATGCCTTCAATACCG 277 35 227 15
9 AGACGTTGTCCGAGTC 253 35 217 1
10 TGTTCCGAGGCTAGAC 232 19 211 2
# … with 4,048 more rows
$P7_3
# A tibble: 4,000 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 376 77 299 0
2 AGTGAGGTCTGCCCTA 310 30 276 4
3 CTAACTTAGTTGTAGA 307 39 268 0
4 AGACGTTGTCCGAGTC 290 23 265 2
5 GAACATCTCCTTGGTC 275 29 245 1
6 CTCGAGGGTTTCGCTC 258 22 235 1
7 TGTTCCGAGGCTAGAC 254 29 223 2
8 CGGGTCATCCCAAGTA 249 31 218 0
9 CATCCACAGATGGCGT 238 22 215 1
10 CATGCCTTCAATACCG 253 31 213 9
# … with 3,990 more rows
$P7_4
# A tibble: 996 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 764 158 606 0
2 GTGCTTCGTCCCTACT 570 86 483 1
3 AGGGATGGTCTAACGT 396 41 352 3
4 TGGGAAGCATTCGACA 358 18 340 0
5 CTAACTTAGTTGTAGA 324 25 297 2
6 AGTGAGGTCTGCCCTA 329 33 295 1
7 AGGCCGTGTGCGATAG 309 16 293 0
8 AGACGTTGTCCGAGTC 295 26 268 1
9 AGCGGTCTCTTAGCCC 285 20 265 0
10 CAGATCAAGACAGAGA 289 27 262 0
# … with 986 more rows
$P7_5
# A tibble: 1,981 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 272 27 243 2
2 CGGGTCATCCCAAGTA 251 26 224 1
3 CTAACTTAGTTGTAGA 241 22 219 0
4 AGTGAGGTCTGCCCTA 12662 12418 210 34
5 GAACATCTCCTTGGTC 224 21 203 0
6 CTCGAGGGTTTCGCTC 216 15 201 0
7 AGACGTTGTCCGAGTC 217 18 197 2
8 TAAACCGTCTCTGTCG 213 20 193 0
9 GAATAAGCATATGGTC 222 29 183 10
10 GGACATTTCGTAGGAG 257 73 182 2
# … with 1,971 more rows
$P7_6
# A tibble: 7,285 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 266 16 248 2
2 AGGGATGGTCTAACGT 248 31 217 0
3 GAACATCTCCTTGGTC 238 22 216 0
4 CGGGTCATCCCAAGTA 210 19 191 0
5 CTCGAGGGTTTCGCTC 208 15 191 2
6 GTGCTTCGTCCCTACT 219 32 182 5
7 GAATAAGCATATGGTC 208 17 181 10
8 CTAACTTAGTTGTAGA 201 22 178 1
9 AAATGCCTCCCAAGTA 208 29 176 3
10 TAAGAGATCTTGGGTA 5376 5228 126 22
# … with 7,275 more rows
$P7_7
# A tibble: 744 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 301 30 269 2
2 AGTGAGGTCTGCCCTA 258 20 238 0
3 CTAACTTAGTTGTAGA 247 10 235 2
4 CTCGAGGGTTTCGCTC 234 17 216 1
5 GAACATCTCAGAGACG 246 33 213 0
6 TAAACCGTCTCTGTCG 228 14 211 3
7 GAACATCTCCTTGGTC 235 25 209 1
8 AGACGTTGTCCGAGTC 222 17 204 1
9 GGACATTTCGTAGGAG 219 28 191 0
10 TAAACCGCATGTAGTC 212 22 187 3
# … with 734 more rows
$P7_8
# A tibble: 1,197 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGTGAGGTCTGCCCTA 505 40 465 0
2 GAATAAGCATATGGTC 468 48 420 0
3 GAACATCTCCTTGGTC 442 34 408 0
4 CTCGAGGGTTTCGCTC 410 21 389 0
5 TGTTCCGAGGCTAGAC 421 44 377 0
6 GAACATCTCAGAGACG 415 52 363 0
7 GAACGGAGTTGCGCAC 406 45 361 0
8 CATGCCTTCAATACCG 405 49 356 0
9 GTGCTTCGTCCCTACT 425 81 344 0
10 AAATGCCTCCCAAGTA 391 50 341 0
# … with 1,187 more rows
$P7_9
# A tibble: 2,395 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 AGGGATGGTCTAACGT 446 42 403 1
2 AGTGAGGTCTGCCCTA 409 31 376 2
3 CGGGTCATCCCAAGTA 411 34 374 3
4 CTAACTTAGTTGTAGA 385 27 356 2
5 CTCGAGGGTTTCGCTC 375 26 346 3
6 AGACGTTGTCCGAGTC 365 26 338 1
7 GAACATCTCCTTGGTC 359 29 330 0
8 GAATAAGCATATGGTC 337 34 303 0
9 GGCTGGTGTGTCGCTG 316 18 292 6
10 GAACATCTCAGAGACG 329 45 284 0
# … with 2,385 more rows
data_list$umi_counts_cell %>%
map(list("background_cells"))
$P7_0
# A tibble: 307,850 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 449 30 413 6
2 GCTGCTTGTCCGAAGA 533 222 302 9
3 AAATGCCTCCCAAGTA 353 54 296 3
4 GGACATTTCGTAGGAG 315 34 277 4
5 TAAACCGCATGTAGTC 278 33 238 7
6 GCAAACTAGCTTATCG 253 13 237 3
7 ACACCAACATAAGACA 238 8 227 3
8 ACGCCGAGTCTACCTC 234 22 210 2
9 TTAACTCCAATGGTCT 228 18 209 1
10 AACTCAGTCTAACGGT 226 16 208 2
# … with 307,840 more rows
$P7_1
# A tibble: 305,602 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 GCTGCTTGTCCGAAGA 546 145 400 1
2 AGAGTGGTCATGTCCC 199 10 188 1
3 GACCAATAGTACGACG 200 14 186 0
4 CTCGTCATCAGAGGTG 200 14 185 1
5 TGGCTGGAGAAGGTGA 196 12 184 0
6 AGGGATGAGTACTTGC 197 15 182 0
7 GCTTCCAGTTCCGGCA 195 11 182 2
8 CATGGCGGTGCGATAG 198 17 180 1
9 CCACCTAAGGCCCGTT 199 19 180 0
10 GGCAATTTCTTCTGGC 195 15 180 0
# … with 305,592 more rows
$P7_10
# A tibble: 417,050 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 225 20 205 0
2 GAATAAGCATATGGTC 227 34 188 5
3 GATCTAGTCGAGAACG 189 15 174 0
4 TGTTCCGAGGCTAGAC 195 19 171 5
5 GAACATCTCAGAGACG 197 31 166 0
6 GGCTGGTGTGTCGCTG 182 17 165 0
7 TAAGAGATCTTGGGTA 196 27 164 5
8 GCTGCTTGTCCGAAGA 243 79 163 1
9 AAATGCCTCCCAAGTA 183 21 162 0
10 GAACGGAGTTGCGCAC 190 19 162 9
# … with 417,040 more rows
$P7_11
# A tibble: 323,694 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CTAACTTAGTTGTAGA 332 26 304 2
2 CACACCTTCAACGCTA 323 23 298 2
3 GCTGCTTGTCCGAAGA 403 131 263 9
4 TAAACCGTCTCTGTCG 272 21 250 1
5 GAATAAGCATATGGTC 290 24 249 17
6 AAATGCCTCCCAAGTA 288 36 248 4
7 CATCCACAGATGGCGT 243 20 218 5
8 GATCTAGTCGAGAACG 236 19 217 0
9 GAACGGAGTTGCGCAC 263 39 213 11
10 GCAAACTTCGACAGCC 233 18 213 2
# … with 323,684 more rows
$P7_12
# A tibble: 351,075 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CGGGTCATCCCAAGTA 342 37 302 3
2 CACACCTTCAACGCTA 292 31 261 0
3 GCTGCTTGTCCGAAGA 383 110 259 14
4 GAATAAGCATATGGTC 300 34 255 11
5 GAACGGAGTTGCGCAC 276 32 232 12
6 ACTTACTCAGCTCGAC 225 16 206 3
7 TAAGAGATCTTGGGTA 239 32 202 5
8 AAACGGGAGGATATAC 219 26 191 2
9 GGATGTTAGGGAACGG 204 11 191 2
10 CATCCACAGATGGCGT 220 30 188 2
# … with 351,065 more rows
$P7_13
# A tibble: 358,946 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 275 25 245 5
2 GCTGCTTGTCCGAAGA 311 91 213 7
3 CGGACACAGCGCTTAT 223 15 206 2
4 CATCCACAGATGGCGT 215 29 184 2
5 GAACGGAGTTGCGCAC 225 36 183 6
6 GCTGCGACAGTAAGCG 190 8 181 1
7 GGCTGGTGTGTCGCTG 198 16 178 4
8 ACATACGCAGCATGAG 198 24 174 0
9 ACTTTCAGTTTGACTG 194 18 174 2
10 CGACTTCAGACCTAGG 194 20 174 0
# … with 358,936 more rows
$P7_14
# A tibble: 308,824 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 306 21 284 1
2 GAACGGAGTTGCGCAC 257 25 232 0
3 GCTGCTTGTCCGAAGA 309 77 225 7
4 GATCTAGTCGAGAACG 240 32 208 0
5 CATCCACAGATGGCGT 223 21 199 3
6 GCAAACTTCGACAGCC 213 21 190 2
7 ACTTACTCAGCTCGAC 206 16 189 1
8 AAATGCCGTGAACCTT 197 13 184 0
9 CGACTTCAGACCTAGG 198 15 182 1
10 TGGGAAGTCAACCAAC 198 17 181 0
# … with 308,814 more rows
$P7_15
# A tibble: 296,796 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 TAAACCGTCTCTGTCG 198 11 187 0
2 GAACGGAGTTGCGCAC 212 24 180 8
3 TGTTCCGAGGCTAGAC 195 22 173 0
4 CACACCTTCAACGCTA 193 18 172 3
5 CTCGAGGGTTTCGCTC 188 18 170 0
6 GAACATCTCCTTGGTC 180 10 170 0
7 CATGCCTTCAATACCG 197 31 166 0
8 GGCTGGTGTGTCGCTG 182 16 166 0
9 GTGCTTCGTCCCTACT 201 33 165 3
10 ACTTACTCAGCTCGAC 177 13 162 2
# … with 296,786 more rows
$P7_2
# A tibble: 360,298 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 254 18 234 2
2 GATCTAGTCGAGAACG 219 17 201 1
3 GAATAAGCATATGGTC 238 32 196 10
4 ACTTACTCAGCTCGAC 228 36 192 0
5 GCTGCTTGTCCGAAGA 271 69 191 11
6 CGGACACAGCGCTTAT 208 26 181 1
7 GAACGGAGTTGCGCAC 208 20 181 7
8 GCTGCGACAGTAAGCG 197 17 180 0
9 AAACGGGAGGATATAC 189 12 176 1
10 CATCCACAGATGGCGT 197 27 170 0
# … with 360,288 more rows
$P7_3
# A tibble: 379,883 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 247 18 223 6
2 GCTGCTTGTCCGAAGA 306 85 215 6
3 GAATAAGCATATGGTC 263 43 209 11
4 TAAACCGTCTCTGTCG 231 26 205 0
5 AGATTGCAGAGCAATT 200 13 187 0
6 ACTTTCAGTTTGACTG 198 19 179 0
7 TCGAGGCTCCCTCTTT 194 16 178 0
8 GAACGGAGTTGCGCAC 214 29 176 9
9 CTCAGAACATATGCTG 194 19 170 5
10 CGGACACAGCGCTTAT 188 20 168 0
# … with 379,873 more rows
$P7_4
# A tibble: 314,515 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 320 16 304 0
2 CGTCTACTCTTGACGA 195 9 186 0
3 CCTACCACATCGGTTA 197 19 178 0
4 GACTACAGTTCAACCA 189 11 178 0
5 GGTGAAGAGGTGATAT 189 12 177 0
6 ACTTTCAGTTTGACTG 192 14 176 2
7 AACGTTGTCAACACAC 187 11 175 1
8 AAATGCCGTGAACCTT 199 25 174 0
9 TCGAGGCTCGCGCCAA 191 17 174 0
10 TGTTCCGCACTTAAGC 187 13 174 0
# … with 314,505 more rows
$P7_5
# A tibble: 363,944 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 287 40 244 3
2 GCTGCTTGTCCGAAGA 301 79 217 5
3 TGAGAGGCAACACGCC 198 15 180 3
4 AAATGCCTCCCAAGTA 195 22 169 4
5 TGTTCCGAGGCTAGAC 191 21 167 3
6 ACTTACTCAGCTCGAC 181 13 166 2
7 AGATTGCAGAGCAATT 167 8 159 0
8 CCGTACTGTCAGATAA 178 17 159 2
9 TAAGAGATCTTGGGTA 182 20 156 6
10 TAAACCGCATGTAGTC 174 17 154 3
# … with 363,934 more rows
$P7_6
# A tibble: 349,966 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 220 11 204 5
2 GCTGCTTGTCCGAAGA 268 78 187 3
3 AGACGTTGTCCGAGTC 195 15 180 0
4 TGTTCCGAGGCTAGAC 195 22 170 3
5 GGACATTTCGTAGGAG 187 14 168 5
6 TGAGAGGCAACACGCC 179 16 162 1
7 AAATGCCGTGAACCTT 167 11 155 1
8 TCTATTGCATAAAGGT 168 13 155 0
9 ACTTACTCAGCTCGAC 169 15 153 1
10 CATGCCTTCAATACCG 182 22 152 8
# … with 349,956 more rows
$P7_7
# A tibble: 273,888 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 230 10 219 1
2 CGGGTCATCCCAAGTA 235 25 210 0
3 CATCCACAGATGGCGT 196 12 184 0
4 GTGCTTCGTCCCTACT 235 48 184 3
5 TGTTCCGAGGCTAGAC 199 21 178 0
6 GCTGCTTGTCCGAAGA 253 75 173 5
7 GATCTAGTCGAGAACG 190 20 170 0
8 GCTGCGACAGTAAGCG 179 9 169 1
9 AAATGCCTCCCAAGTA 199 30 168 1
10 ACTTACTCAGCTCGAC 181 13 168 0
# … with 273,878 more rows
$P7_8
# A tibble: 270,528 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 470 38 432 0
2 GCTGCTTGTCCGAAGA 473 137 336 0
3 GCAAACTAGCTTATCG 299 15 284 0
4 GAATAAGTCCTAGGGC 304 29 275 0
5 GTAGGCCAGCCGATTT 309 39 270 0
6 ACACCAACATAAGACA 247 15 232 0
7 GCTTCCACACTTCGAA 251 28 223 0
8 TTAACTCCAATGGTCT 252 30 222 0
9 GCACTCTAGATACACA 243 25 218 0
10 ACGCCGAGTCTACCTC 233 18 215 0
# … with 270,518 more rows
$P7_9
# A tibble: 314,582 x 5
cell m rm_ret pm rm_disc
<chr> <int> <dbl> <int> <dbl>
1 CACACCTTCAACGCTA 389 27 358 4
2 GAACGGAGTTGCGCAC 362 44 318 0
3 AAATGCCTCCCAAGTA 322 29 292 1
4 ACTTACTCAGCTCGAC 290 33 256 1
5 CATCCACAGATGGCGT 274 24 249 1
6 GTAGGCCAGCCGATTT 272 37 235 0
7 GATCTAGTCGAGAACG 254 19 233 2
8 CGGACACAGCGCTTAT 259 28 230 1
9 AAACGGGAGGATATAC 264 35 228 1
10 GGATGTTAGGGAACGG 247 27 219 1
# … with 314,572 more rows
tic("saving output")
data_list$read_counts <- NULL
saveRDS(data_list, results_filepath)
toc()
saving output: 3599.002 sec elapsed
# memory usage
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 6380399 340.8 12111514 646.9 12111514 646.9
Vcells 8947572279 68264.6 31057732902 236951.7 32330482593 246662.1
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /cvmfs/soft.computecanada.ca/easybuild/software/2017/Core/imkl/2018.3.222/compilers_and_libraries_2018.3.222/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.9.4 data.table_1.12.2 tictoc_1.0
[4] furrr_0.1.0 future_1.13.0 broom_0.5.2
[7] matrixStats_0.54.0 forcats_0.4.0 stringr_1.4.0
[10] dplyr_0.8.1 purrr_0.3.2 readr_1.3.1
[13] tidyr_0.8.3 tibble_2.1.2 ggplot2_3.1.1
[16] tidyverse_1.2.1 rhdf5_2.26.2 rmarkdown_1.13
loaded via a namespace (and not attached):
[1] nlme_3.1-137 bitops_1.0-6
[3] lubridate_1.7.4 httr_1.4.0
[5] rprojroot_1.3-2 GenomeInfoDb_1.18.2
[7] tools_3.5.2 backports_1.1.4
[9] utf8_1.1.4 R6_2.4.0
[11] HDF5Array_1.10.1 lazyeval_0.2.2
[13] BiocGenerics_0.28.0 colorspace_1.4-1
[15] withr_2.1.2 tidyselect_0.2.5
[17] compiler_3.5.2 cli_1.1.0
[19] rvest_0.3.4 Biobase_2.42.0
[21] xml2_1.2.0 DelayedArray_0.8.0
[23] labeling_0.3 scales_1.0.0
[25] digest_0.6.19 XVector_0.22.0
[27] base64enc_0.1-3 pkgconfig_2.0.2
[29] htmltools_0.3.6 limma_3.38.3
[31] rlang_0.3.4 readxl_1.3.1
[33] rstudioapi_0.10 generics_0.0.2
[35] jsonlite_1.6 BiocParallel_1.16.6
[37] RCurl_1.95-4.12 magrittr_1.5
[39] GenomeInfoDbData_1.2.0 Matrix_1.2-15
[41] Rcpp_1.0.1 munsell_0.5.0
[43] S4Vectors_0.20.1 Rhdf5lib_1.4.2
[45] fansi_0.4.0 stringi_1.4.3
[47] yaml_2.2.0 edgeR_3.24.3
[49] MASS_7.3-51.1 SummarizedExperiment_1.12.0
[51] zlibbioc_1.28.0 plyr_1.8.4
[53] grid_3.5.2 parallel_3.5.2
[55] listenv_0.7.0 crayon_1.3.4
[57] lattice_0.20-38 haven_2.1.0
[59] hms_0.4.2 locfit_1.5-9.1
[61] zeallot_0.1.0 knitr_1.23
[63] pillar_1.4.1 GenomicRanges_1.34.0
[65] codetools_0.2-16 stats4_3.5.2
[67] glue_1.3.1 evaluate_0.14
[69] modelr_0.1.4 vctrs_0.1.0
[71] cellranger_1.1.0 gtable_0.3.0
[73] assertthat_0.2.1 xfun_0.7
[75] DropletUtils_1.2.2 viridisLite_0.3.0
[77] SingleCellExperiment_1.4.1 IRanges_2.16.0
[79] globals_0.12.4