library(PhantomPurgeR)

Read filepaths of all h5 files found in the data input folder

The data directory input_dir is the filepath of the folder that contains all the molecule_info.h5 files of all the samples that were multiplexed on the same lane. The files can be renamed by sample names (e.g. sample_name.h5) before being loaded or after as shown below.

input_dir <- "~/Documents/hiseq4000"
samples_filepaths <- get_h5_filenames(input_dir)
samples_filepaths
##                                         A1 
## "/home/rfarouni/Documents/hiseq4000/A1.h5" 
##                                         A2 
## "/home/rfarouni/Documents/hiseq4000/A2.h5" 
##                                         B1 
## "/home/rfarouni/Documents/hiseq4000/B1.h5" 
##                                         B2 
## "/home/rfarouni/Documents/hiseq4000/B2.h5" 
##                                         C1 
## "/home/rfarouni/Documents/hiseq4000/C1.h5" 
##                                         C2 
## "/home/rfarouni/Documents/hiseq4000/C2.h5" 
##                                         D1 
## "/home/rfarouni/Documents/hiseq4000/D1.h5" 
##                                         D2 
## "/home/rfarouni/Documents/hiseq4000/D2.h5"

Change the names of the samples if the filenames are too long.

#names(samples_filepaths) <- paste0("s", 1:length(samples_filepaths))

Run PhantomPurgeR workflow

The workflow can be run in four stages or alternatively all steps can all be run at once using the phantom_purger function.

#out <- phantom_purger(samples, torc=3, return_readcounts=FALSE, return_discarded=FALSE)

(1) Read data

out <- read10xMolInfoSamples(samples_filepaths)
## Loading samples data from molecule_info.h5 files produced by 10X Genomics CellRanger software: 31.511 sec elapsed

(2) Join and merge data

out <- join_data(out)
## Joining data. Step 1: creating read counts datatables from lists: 0.009 sec elapsed
## Joining data. Step 2: joining and merging datatables for all samples keyed by cell, umi, and gene: 50.894 sec elapsed
## Joining data. Step 3: creating an outcome variable for each row: 212.291 sec elapsed

Read counts datatable

out$read_counts
##                       cell  gene     umi A1 A2 B1 B2 C1 C2 D1 D2
##        1: AAACCTGAGAAACCAT  9004  468967 21  0  0  0  0  0  0  0
##        2: AAACCTGAGAAACCAT 22877  410924  1  0  0  0  0  0  0  0
##        3: AAACCTGAGAAACCAT  9800   43591  1  0  0  0  0  0  0  0
##        4: AAACCTGAGAAACCAT  3756 1012503 13  0  0  0  0  0  0  0
##        5: AAACCTGAGAAACCAT  7009  621442 12  0  0  0  0  0  0  0
##       ---                                                       
## 53636691: TTTGTCATCTTGTCAT 22857 1022717  0  0  0  0  0  0  0  1
## 53636692: TTTGTCATCTTGTCAT 19753  241763  0  0  0  0  0  0  0 20
## 53636693: TTTGTCATCTTGTTTG 20921  676602  0  0  0  0  0  0  0  1
## 53636694: TTTGTCATCTTGTTTG 19719  717054  0  0  0  0  0  0  0  1
## 53636695: TTTGTCATCTTGTTTG 19546  171242  0  0  0  0  0  0  0  3
##                    outcome
##        1: 21,0,0,0,0,0,0,0
##        2:  1,0,0,0,0,0,0,0
##        3:  1,0,0,0,0,0,0,0
##        4: 13,0,0,0,0,0,0,0
##        5: 12,0,0,0,0,0,0,0
##       ---                 
## 53636691:  0,0,0,0,0,0,0,1
## 53636692: 0,0,0,0,0,0,0,20
## 53636693:  0,0,0,0,0,0,0,1
## 53636694:  0,0,0,0,0,0,0,1
## 53636695:  0,0,0,0,0,0,0,3

(3) Estimate sample index hopping rate

out <- estimate_hopping_rate(out)
## Estimating SIHR. Step 1: creating outcome counts datatable: 12.977 sec elapsed
## Estimating SIHR. Step 2: creating chimera counts datatable: 0.053 sec elapsed
## Estimating SIHR. Step 3: fitting GLM: 0.056 sec elapsed
## Estimating SIHR. Step 4: computing summary statistics: 1.111 sec elapsed

Estimates

out$glm_estimates
## # A tibble: 1 x 6
##   max_r  phat phat_low phat_high    SIHR   SBIHR
##   <int> <dbl>    <dbl>     <dbl>   <dbl>   <dbl>
## 1  3744 0.991    0.991     0.991 0.00866 0.00958

Summary statistics of the joined read counts datatable

out$summary_stats$summary_estimates
## # A tibble: 1 x 7
##     n_reads   n_cugs    n_pm      u       g   RMR p_chimeras
##       <int>    <dbl>   <dbl>  <dbl>   <dbl> <dbl>      <dbl>
## 1 626168518 53636695 4460074 0.0817 0.00147  11.7     0.0710

p_chimeras is the proportion CUGs that are chimeric. g is the estimated proportion of fugue molecules and u is the molecule inflation factor such that n_cugs x u would give the number of non-fugue phantom molecules. The estimated total number of phantom molecules present in the dataset is given by n_pm=n_cugs x (u+g).

Marginal summary statistics

out$summary_stats$marginal
## # A tibble: 8 x 6
##   sample  n_reads prop_reads n_molecs p_molecs   RMR
##   <chr>     <dbl>      <dbl>    <int>    <dbl> <dbl>
## 1 A1     81735236      0.131  7026743   0.121  11.6 
## 2 A2     82030289      0.131  9592019   0.165   8.55
## 3 B1     80278882      0.128  1820293   0.0314 44.1 
## 4 B2     87690082      0.140  1751340   0.0302 50.1 
## 5 C1     76298833      0.122 10456528   0.180   7.30
## 6 C2     75477112      0.121 18646495   0.321   4.05
## 7 D1     66843761      0.107  6564850   0.113  10.2 
## 8 D2     75814323      0.121  2159758   0.0372 35.1

(4) Purge phantom molecules

Call garbage collection first

gc()
##             used   (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells   6540555  349.4   11033891   589.3   11033891   589.3
## Vcells 605436388 4619.2 1923031191 14671.6 1923031191 14671.6

Set the trade-off ratio cost cutoff (torc). The parameter torc represents the number of real molecules one is willing to incorrectly discard in order to correctly purge one phantom molecule. Since discarding a large proportion of the data is undesirable, reasonable values of torc are expected to be within the range of 1-5.

out <- purge_phantoms(out,
                      torc=3,
                      return_readcounts=FALSE,
                      return_discarded=FALSE)
## Purging phantom molecules. Step 1: reassigning hopped reads: 6.666 sec elapsed
## Purging phantom molecules. Step 2: getting observed tor threshold below user-provided cutoff: 0.066 sec elapsed
## Purging phantom molecules. Step 3: marking retained observations in read counts table: 5.887 sec elapsed
## Purging phantom molecules. Step 4: creating sparse count matrices of cleaned and discarded data: 18.456 sec elapsed
out$summary_stats$cutoff_dt
## # A tibble: 5 x 13
##   approach outcome     s     qr     n    tor     o     FP     FN     TP     TN
##   <chr>    <chr>   <int>  <dbl> <int>  <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 torc_be… 0,1,2,…     3  0.107     1   6.24 0.998 6.92e4 114007 5.34e7 4.39e6
## 2 discard… 0,0,1,…     3  0.111 99899   2.96 1.000 8.03e4  25186 5.35e7 4.38e6
## 3 torc_af… 0,1,0,…     8  0.111     1   2.96 1.000 8.03e4  25185 5.35e7 4.38e6
## 4 no_disc… 0,1,0,…     5  0.563     6 NaN    1.    8.63e4   7546 5.36e7 4.37e6
## 5 no_purg… <NA>       NA NA        NA  NA    1.08  4.46e6      0 5.36e7 0.    
## # … with 2 more variables: FPm <dbl>, FNm <dbl>

The row discard_torc shows the outcome whose qr value is the maximum allowed. Reads corresponding to outcomes with greater qr values are discarded. no_discarding corresponds to retaining all reassigned reads and no_purging corresponds to keeping the data as it is.

out$summary_stats$purge_summary
## # A tibble: 8 x 6
##   sample umi_total retained purged_real purged_phantom phantom_prop
##   <chr>      <int>    <int>       <int>          <int>        <dbl>
## 1 A1       7026743  6789262         536         236945       0.0337
## 2 A2       9592019  9376805        2272         212942       0.0222
## 3 B1       1820293   246010        1009        1573274       0.864 
## 4 B2       1751340   307244        1118        1442978       0.824 
## 5 C1      10456528 10227585        3989         224954       0.0215
## 6 C2      18646495 18410983       11803         223709       0.0120
## 7 D1       6564850  6357450        1584         205816       0.0314
## 8 D2       2159758  1897746        1299         260713       0.121
str(out$umi_counts$retained$A1)
## Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   ..@ i       : int [1:1948968] 3755 4003 6120 7008 8996 9003 9422 9799 10531 12313 ...
##   ..@ p       : int [1:151148] 0 16 17 22 29 37 38 53 54 56 ...
##   ..@ Dim     : int [1:2] 27998 151147
##   ..@ Dimnames:List of 2
##   .. ..$ : chr [1:27998(1d)] "ENSMUSG00000051951" "ENSMUSG00000089699" "ENSMUSG00000102343" "ENSMUSG00000025900" ...
##   .. ..$ : chr [1:151147] "AAACCTGAGAAACCAT" "AAACCTGAGAAACGAG" "AAACCTGAGAACTCGG" "AAACCTGAGAAGCCCA" ...
##   ..@ x       : num [1:1948968] 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ factors : list()

Outcome counts datatable

The datatable is ordered in descending order of qr, the posterior probability of incorrectly assigning s as the inferred sample of origin. n is the number of CUGs with the corresponding outcome and p_outcome is the observed marginal probability of that outcome.

out$outcome_counts
## # A tibble: 187,567 x 17
##    outcome     s    qr     r     n     o    FP    FN     TP     TN   FPm   FNm
##    <chr>   <int> <dbl> <int> <int> <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl>
##  1 0,1,0,…     5 0.563     3     6 1.    86289  7546 5.36e7 4.37e6     0     0
##  2 1,1,0,…     5 0.556     3     2 1.000 86286  7548 5.36e7 4.37e6     3     2
##  3 0,0,1,…     4 0.549     2   290 1.000 86285  7549 5.36e7 4.37e6     4     3
##  4 0,1,2,…     4 0.543     6     1 1.000 86125  7680 5.36e7 4.37e6   164   134
##  5 1,1,1,…     2 0.540     5     1 1.000 86125  7680 5.36e7 4.37e6   164   134
##  6 1,1,0,…     2 0.535     3     1 1.000 86124  7681 5.36e7 4.37e6   165   135
##  7 0,0,2,…     4 0.527     5     1 1.000 86124  7681 5.36e7 4.37e6   165   135
##  8 1,1,2,…     4 0.513     6     1 1.000 86123  7682 5.36e7 4.37e6   166   136
##  9 1,0,0,…     7 0.512     3     3 1.000 86123  7682 5.36e7 4.37e6   166   136
## 10 2,0,0,…     1 0.499     5     1 1.000 86121  7684 5.36e7 4.37e6   168   138
## # … with 187,557 more rows, and 5 more variables: tor <dbl>, retain <lgl>,
## #   qs <dbl>, FPp <dbl>, p_outcome <dbl>

Make diagnostic plots

dataset_name <- "Hiseq4000"
plots <- make_plots(out, dataset_name, x_lim = 150, legend_rel_width=0.2)
plots 
## $p_read

## 
## $p_fit

## 
## $p_tor

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] purrr_0.3.3          PhantomPurgeR_0.99.0 knitr_1.26          
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.46.0              edgeR_3.28.0               
##  [3] tidyr_1.0.0                 viridisLite_0.3.0          
##  [5] R.utils_2.9.0               assertthat_0.2.1           
##  [7] stats4_3.6.2                dqrng_0.2.1                
##  [9] GenomeInfoDbData_1.2.2      yaml_2.2.0                 
## [11] globals_0.12.4              pillar_1.4.2               
## [13] backports_1.1.5             lattice_0.20-38            
## [15] glue_1.3.1                  limma_3.42.0               
## [17] digest_0.6.23               GenomicRanges_1.38.0       
## [19] XVector_0.26.0              colorspace_1.4-1           
## [21] cowplot_1.0.0               htmltools_0.4.0            
## [23] Matrix_1.2-18               R.oo_1.23.0                
## [25] pkgconfig_2.0.3             broom_0.5.2                
## [27] listenv_0.7.0               zlibbioc_1.32.0            
## [29] scales_1.1.0                HDF5Array_1.14.0           
## [31] BiocParallel_1.20.0         tibble_2.1.3               
## [33] farver_2.0.1                generics_0.0.2             
## [35] IRanges_2.20.1              tictoc_1.0                 
## [37] ggplot2_3.2.1               ellipsis_0.3.0             
## [39] SummarizedExperiment_1.16.0 furrr_0.1.0                
## [41] BiocGenerics_0.32.0         lazyeval_0.2.2             
## [43] cli_2.0.0                   magrittr_1.5               
## [45] crayon_1.3.4                evaluate_0.14              
## [47] R.methodsS3_1.7.1           future_1.15.1              
## [49] fansi_0.4.0                 nlme_3.1-143               
## [51] MASS_7.3-51.5               tools_3.6.2                
## [53] data.table_1.12.6           lifecycle_0.1.0            
## [55] matrixStats_0.55.0          stringr_1.4.0              
## [57] Rhdf5lib_1.8.0              DropletUtils_1.6.1         
## [59] S4Vectors_0.24.0            munsell_0.5.0              
## [61] locfit_1.5-9.1              DelayedArray_0.12.0        
## [63] compiler_3.6.2              GenomeInfoDb_1.22.0        
## [65] rlang_0.4.2                 rhdf5_2.30.0               
## [67] grid_3.6.2                  RCurl_1.95-4.12            
## [69] SingleCellExperiment_1.8.0  labeling_0.3               
## [71] bitops_1.0-6                rmarkdown_1.18             
## [73] gtable_0.3.0                codetools_0.2-16           
## [75] R6_2.4.1                    dplyr_0.8.3                
## [77] utf8_1.1.4                  zeallot_0.1.0              
## [79] stringi_1.4.3               parallel_3.6.2             
## [81] Rcpp_1.0.3                  vctrs_0.2.1                
## [83] tidyselect_0.2.5            xfun_0.11