library(tidyverse)
library(rhdf5)
#library(DropletUtils) # install but do not load
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
datasets_names <- c("hiseq4000_nonplexed", "hiseq4000_plexed")
names(datasets_names) <- datasets_names
datasets_names
hiseq4000_nonplexed hiseq4000_plexed
"hiseq4000_nonplexed" "hiseq4000_plexed"
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
dir.create(validation_output_dir)
'/home/rfarouni/Documents/index_hopping/data/hiseq4000_validation' already exists
figures_dir <- file.path(validation_output_dir, "figures")
dir.create(figures_dir)
'/home/rfarouni/Documents/index_hopping/data/hiseq4000_validation/figures' already exists
get_read_counts <- function(dataset_name) {
data_dir <- file.path(project_dir, "data", dataset_name)
output_dir <- file.path(data_dir, "output")
input_dir <- file.path(data_dir, "input")
read_counts_filepath <- file.path(
output_dir,
paste0(
dataset_name,
"_read_counts.rds"
)
)
read_counts <-
create_joined_counts(
input_dir,
read_counts_filepath
)
return(read_counts)
}
read_counts <- map(datasets_names, get_read_counts)
data_fj <-
full_join(read_counts$hiseq4000_nonplexed %>%
select(-outcome),
read_counts$hiseq4000_plexed,
by = c("cell", "gene", "umi"),
suffix = c("_nonplexed", "_plexed")
)
data_fj
data_fj <-
data_fj %>%
select(cell, gene, umi, everything()) %>%
#select(-gene_new) %>%
mutate_if(is.double, as.integer) %>%
set_names(c("cell", "gene", "umi", "s1_nonplexed", "s2_nonplexed", "s1_plexed", "s2_plexed", "outcome"))
data_fj
write_tsv(data_fj,
file.path(validation_output_dir, "hiseq4000_joined_datatable_plexed_nonplexed_hg38_ensembl95.txt"))
inner join
data <-
data_fj %>%
drop_na()
data
data <-
data %>%
mutate(label= case_when(
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed!=0~ "r,r",
s1_nonplexed == 0 & s1_plexed == 0 ~ "0,r",
s2_nonplexed == 0 & s2_plexed == 0 ~ "r,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed == 0 ~ "f,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed != 0 ~ "f,r",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed == 0 ~ "0,f",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed != 0 ~ "r,f",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed==0 & s2_plexed!=0~ "0,r",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed==0~ "r,0",
TRUE ~ "NA"
))
data
label_tally <-
data %>%
group_by(label) %>%
tally() %>%
add_row(label = "TOTAL", n=sum(.$n))
label_tally
There are only 52 (real, real) colliding CUGs out of 9252147 so assumption I is validated. The collision rate is 0.00001.
data <-
data %>%
filter(label!="r,r")
write_tsv(data, file.path(validation_output_dir,"hiseq4000_inner_joined_with_labels_hg38_ensembl95.txt"))
number of reads (in millions)
data_fj %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
data %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6)))
number of molecules (in millions)
data_fj %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
data %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
p1 <- ggplot(data, aes(x = s1_nonplexed,
y = s1_plexed))
p1 + geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
p2 <- ggplot(data,
aes(x = s2_nonplexed,
y = s2_plexed))
p2 +
geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
# geom_point()
data_reads_cell_gene <-
data %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_reads_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
## Gene expression UMI counts with same cell-gene label
data_molec_cell_gene <-
data %>%
mutate(purged= case_when(
label=="r,0"~ "1,0",
label=="r,f"~ "1,0",
label=="0,r"~ "0,1",
label=="f,r"~ "0,1",
label=="f,0"~ "0,0",
label=="0,f"~ "0,0"
)) %>%
separate(purged, c("s1_purged", "s2_purged"), sep=",", convert=TRUE) %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
data_molec_cell_gene
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_molec_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
data_molec_cell_gene <-
data_molec_cell_gene %>%
mutate(source= case_when(s1_purged==0 ~ "s2",
s2_purged==0 ~ "s1",
TRUE ~"s1s2"))
p_hopping <-
ggplot(data_molec_cell_gene ) +
geom_point(
aes(x = s1_plexed,
y = s2_plexed,
color=source),
alpha=0.4, size=0.4) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell and gene (Sample 1 Multiplexed)",
y="UMI count by cell and gene (Sample 2 Multiplexed)") +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
expand_limits(x = c(0, 650000), y = c(0, 110000))
ggsave(file.path(figures_dir, "samples_multiplexed_hopping.jpeg"), p_hopping, width=9, height=6, dpi=320)
p_hopping
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
p_purged <-
ggplot(data_molec_cell_gene) +
geom_point( aes(x = s1_purged,
y = s2_purged,
color=source),
size=1.2) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell and gene (Sample 1 Multiplexed)",
y="UMI count by cell and gene (Sample 2 Multiplexed)") +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
expand_limits(x = c(0, 650000), y = c(0, 110000))
ggsave(file.path(figures_dir, "samples_multiplexed_purged.jpeg"), p_purged, width=9, height=6)
p_purged
data_molec_cell <-
data_molec_cell_gene %>%
select(-gene, -source) %>%
group_by(cell) %>%
summarize_all(sum)
data_molec_cell <-
data_molec_cell %>%
mutate(source= case_when(s1_purged==0 ~ "s2",
s2_purged==0 ~ "s1",
TRUE ~"s1s2"))
p_phantomcells <-
ggplot(data_molec_cell) +
geom_point(
aes(x = s1_plexed,
y = s2_plexed,
color=source),
alpha=0.4, size=0.3) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell (Sample 1 Multiplexed)",
y="UMI count by cell (Sample 2 Multiplexed)") +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
expand_limits(x = c(0, 1000000), y = c(0, 300000))
ggsave(file.path(figures_dir, "samples_multiplexed_phantomcells.jpeg"), p_phantomcells, width=9, height=6, dpi=320)
p_phantomcells
p_cells <-
ggplot(data_molec_cell) +
geom_point(
aes(x = s1_purged,
y = s2_purged,
color=source),
alpha=0.5, size=0.6) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell (Sample 1 Multiplexed)",
y="UMI count by cell (Sample 2 Multiplexed)") +
theme_bw() +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank()) +
expand_limits(x = c(0, 1000000), y = c(0, 300000))
ggsave(file.path(figures_dir, "samples_multiplexed_cells.jpeg"), p_cells, width=9, height=6, dpi=320)
p_cells
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] rhdf5_2.30.1 forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3 readr_1.3.1
[7] tidyr_1.0.2 tibble_3.0.0 ggplot2_3.3.0 tidyverse_1.3.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4 lubridate_1.7.8 lattice_0.20-40 prettyunits_1.1.1 ps_1.3.2
[6] utf8_1.1.4 digest_0.6.25 assertthat_0.2.1 rprojroot_1.3-2 packrat_0.5.0
[11] R6_2.4.1 cellranger_1.1.0 backports_1.1.6 reprex_0.3.0 stats4_3.6.3
[16] evaluate_0.14 httr_1.4.1 pillar_1.4.3 rlang_0.4.5 readxl_1.3.1
[21] rstudioapi_0.11 hexbin_1.28.1 callr_3.4.3 rmarkdown_2.1 labeling_0.3
[26] loo_2.2.0 munsell_0.5.0 broom_0.5.5 compiler_3.6.3 modelr_0.1.6
[31] xfun_0.12 rstan_2.19.3 base64enc_0.1-3 pkgconfig_2.0.3 pkgbuild_1.0.6
[36] htmltools_0.4.0 tidyselect_1.0.0 gridExtra_2.3 matrixStats_0.56.0 fansi_0.4.1
[41] crayon_1.3.4 dbplyr_1.4.2 withr_2.1.2 grid_3.6.3 nlme_3.1-144
[46] jsonlite_1.6.1 gtable_0.3.0 lifecycle_0.2.0 DBI_1.1.0 magrittr_1.5
[51] StanHeaders_2.19.2 scales_1.1.0 cli_2.0.2 stringi_1.4.6 farver_2.0.3
[56] fs_1.4.1 xml2_1.3.0 ellipsis_0.3.0 generics_0.0.2 vctrs_0.2.4
[61] Rhdf5lib_1.8.0 tools_3.6.3 glue_1.4.0 hms_0.5.3 yaml_2.2.1
[66] processx_3.4.2 parallel_3.6.3 inline_0.3.15 colorspace_1.4-1 rvest_0.3.5
[71] knitr_1.28 haven_2.2.0