library(tidyverse)
library(rhdf5)
#library(DropletUtils) # install but do not load
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file(),
fig.width=15,
digit=5,
scipen=8)
options(digits=5,
scipen=8,
future.globals.maxSize = +Inf)
project_dir <- rprojroot::find_rstudio_root_file()
if(is.null(project_dir)){
project_dir <- getwd()
warning(sprintf("No rstudio project root file found.
Setting project directory to current workflow.Rmd file location: %s.
Override if needed.",
project_dir))
}
message(sprintf("Project directory: %s",
project_dir))
Project directory: /home/rfarouni/Documents/index_hopping
code_dir <- file.path(project_dir, "code")
source(file.path(code_dir, "1_create_joined_counts_table.R"))
datasets_names <- c("hiseq4000_nonplexed", "hiseq4000_plexed")
names(datasets_names) <- datasets_names
datasets_names
hiseq4000_nonplexed hiseq4000_plexed
"hiseq4000_nonplexed" "hiseq4000_plexed"
validation_output_dir <- file.path(project_dir, "data", "hiseq4000_validation")
dir.create(validation_output_dir)
figures_dir <- file.path(validation_output_dir, "figures")
dir.create(figures_dir)
get_read_counts <- function(dataset_name) {
data_dir <- file.path(project_dir, "data", dataset_name)
output_dir <- file.path(data_dir, "output")
input_dir <- file.path(data_dir, "input")
read_counts_filepath <- file.path(
output_dir,
paste0(
dataset_name,
"_read_counts.rds"
)
)
read_counts <-
create_joined_counts(
input_dir,
read_counts_filepath
)
return(read_counts)
}
read_counts <- map(datasets_names, get_read_counts)
data_fj <-
full_join(read_counts$hiseq4000_nonplexed %>%
select(-outcome),
read_counts$hiseq4000_plexed,
by = c("cell", "gene", "umi"),
suffix = c("_nonplexed", "_plexed")
)
create keys
old_key <- unique(data_fj$gene)
new_key <-
sample.int(
n = length(old_key),
size = length(old_key)
)
names(new_key) <- old_key
new_key <-
enframe(new_key,
name = "gene",
value = "gene_new"
)
recode
data_fj <-
left_join(data_fj, new_key, by = "gene") %>%
mutate(
gene = NULL,
gene = gene_new
) %>%
select(cell, gene, umi, everything()) %>%
select(-gene_new) %>%
mutate_if(is.double, as.integer) %>%
set_names(c("cell", "gene", "umi", "s1_nonplexed", "s2_nonplexed", "s1_plexed", "s2_plexed", "outcome"))
data_fj
write_tsv(data_fj,
file.path(validation_output_dir, "hiseq4000_joined_datatable_plexed_nonplexed.txt"))
inner join
data <-
data_fj %>%
drop_na()
data
data <-
data %>%
mutate(label= case_when(
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed!=0~ "r,r",
s1_nonplexed == 0 & s1_plexed == 0 ~ "0,r",
s2_nonplexed == 0 & s2_plexed == 0 ~ "r,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed == 0 ~ "f,0",
s1_nonplexed == 0 & s1_plexed != 0 & s2_plexed != 0 ~ "f,r",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed == 0 ~ "0,f",
s2_nonplexed == 0 & s2_plexed != 0 & s1_plexed != 0 ~ "r,f",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed==0 & s2_plexed!=0~ "0,r",
s1_nonplexed != 0 & s2_nonplexed != 0 & s1_plexed!=0 & s2_plexed==0~ "r,0",
TRUE ~ "NA"
))
data
label_tally <-
data %>%
group_by(label) %>%
tally() %>%
add_row(label = "TOTAL", n=sum(.$n))
label_tally
There are only 52 (real, real) colliding CUGs out of 9252147 so assumption I is validated. The collision rate is 0.00001.
data <-
data %>%
filter(label!="r,r")
write_tsv(data, file.path(validation_output_dir,"hiseq4000_inner_joined_with_labels.txt"))
number of reads (in millions)
data_fj %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
data %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6)))
number of molecules (in millions)
data_fj %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
data %>%
mutate_at(vars(matches("^s")), list(~ as.integer(.!=0))) %>%
ungroup() %>%
summarize_at(vars(matches("^s")), list(~ sum(./10^6, na.rm = TRUE)))
p1 <- ggplot(data, aes(x = s1_nonplexed,
y = s1_plexed))
p1 + geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
p2 <- ggplot(data,
aes(x = s2_nonplexed,
y = s2_plexed))
p2 +
geom_hex(bins = 500)+
geom_abline(slope=.5,intercept=0)
# geom_point()
data_reads_cell_gene <-
data %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_reads_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_reads_cell_gene,
aes(x = s1_plexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
## Gene expression UMI counts with same cell-gene label
data_molec_cell_gene <-
data %>%
mutate(purged= case_when(
label=="r,0"~ "1,0",
label=="r,f"~ "1,0",
label=="0,r"~ "0,1",
label=="f,r"~ "0,1",
label=="f,0"~ "0,0",
label=="0,f"~ "0,0"
)) %>%
separate(purged, c("s1_purged", "s2_purged"), sep=",", convert=TRUE) %>%
group_by(cell, gene) %>%
summarize_at(vars(matches("^s")),
list(~ sum(.)))
data_molec_cell_gene
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s1_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
ggplot(data_molec_cell_gene,
aes(x = s2_nonplexed,
y = s2_plexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
ggplot(data_molec_cell_gene,
aes(x = s1_nonplexed,
y = s2_nonplexed)) +
geom_hex(bins = 400) +
geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10()
NA
p_hopping <-
ggplot(data_molec_cell_gene,
aes(x = s1_plexed,
y = s2_plexed)) +
#geom_point(alpha=0.4, size=0.4) +
geom_hex(bins = 450) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell and gene (Sample 1 Multiplexed)",
y="UMI count by cell and gene (Sample 2 Multiplexed)")
ggsave(file.path(figures_dir, "samples_multiplexed_hopping.pdf"), p_hopping, width=9, height=6)
p_hopping
p_purged <-
ggplot(data_molec_cell_gene,
aes(x = s1_purged,
y = s2_purged)) +
#geom_point(alpha=0.4, size=0.4) +
geom_hex(bins = 450) +
# geom_abline(slope=1,intercept=0)+
scale_x_log10() +
scale_y_log10() +
labs(x="UMI count by cell and gene (Sample 1 Multiplexed)",
y="UMI count by cell and gene (Sample 2 Multiplexed)")
ggsave(file.path(figures_dir, "samples_multiplexed_purged.pdf"), p_purged, width=9, height=6)
p_purged
sessionInfo()
R version 3.6.0 (2019-04-26)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] hexbin_1.27.3 rhdf5_2.28.0 forcats_0.4.0 stringr_1.4.0 dplyr_0.8.1 purrr_0.3.2 readr_1.3.1 tidyr_0.8.3 tibble_2.1.2
[10] ggplot2_3.1.1 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] tidyselect_0.2.5 xfun_0.7 haven_2.1.0 lattice_0.20-38 colorspace_1.4-1 generics_0.0.2 htmltools_0.3.6 yaml_2.2.0 base64enc_0.1-3
[10] rlang_0.3.4 pillar_1.4.1 glue_1.3.1 withr_2.1.2 modelr_0.1.4 readxl_1.3.1 plyr_1.8.4 munsell_0.5.0 gtable_0.3.0
[19] cellranger_1.1.0 rvest_0.3.4 evaluate_0.14 labeling_0.3 knitr_1.23 broom_0.5.2 Rcpp_1.0.1 scales_1.0.0 backports_1.1.4
[28] jsonlite_1.6 hms_0.4.2 digest_0.6.19 stringi_1.4.3 grid_3.6.0 rprojroot_1.3-2 cli_1.1.0 tools_3.6.0 magrittr_1.5
[37] lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.2 xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.1 rmarkdown_1.13 httr_1.4.0 rstudioapi_0.10
[46] Rhdf5lib_1.6.0 R6_2.4.0 nlme_3.1-140 compiler_3.6.0