Prepare analysis workflow

# knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
# options(
#   readr.show_progress = FALSE,
#   digits = 2,
#   scipen = 8,
#   future.globals.maxSize = +Inf
# )
knitr::opts_knit$set(root.dir = getwd())
library(tidyverse)
library(tximport)
library(DESeq2)
library(vsn)
library(cowplot)
library(glmpca)
library(apeglm)
library(rhdf5)
library(EnhancedVolcano)
library(pheatmap)
library(ensembldb)
library(scater)
library(PoiClaClu)
library(RColorBrewer)
require(edgeR)
theme_set(theme_bw())

Auxiliary functions

save_counts_table <- function(counts, gene_metadata, file_name) {
  counts <-
    counts %>%
    as_tibble(rownames = "gene_id")

  counts <-
    left_join(counts,
      gene_metadata %>%
        dplyr::select(gene_name, gene_id),
      by = "gene_id"
    ) %>%
    relocate(gene_name, .after = gene_id)

  write_tsv(counts, file_name)
}

filter_genes <- function(dds, meta_dt, max_mean_condition_lognorm_count = 3, frac_zeros_low_threshold = 0.2) {
  counts_normalized <-
    counts(dds, normalized = T) %>%
    as_tibble(rownames = "gene_id")

  counts_normalized_long <-
    counts_normalized %>%
    pivot_longer(cols = -gene_id, names_to = "sample", values_to = "normalized_counts") %>%
    left_join(., meta_dt %>% dplyr::select(sample, condition), by = "sample") %>%
    mutate(log_normalized_counts = log1p(normalized_counts))

  summary_counts_normalized_long <-
    counts_normalized_long %>%
    group_by(gene_id, condition) %>%
    summarize(
      mean_condition = mean(log_normalized_counts),
      frac_zeros = mean(normalized_counts == 0)
    ) %>%
    group_by(gene_id) %>%
    mutate(
      n_conditions_frac_zeros_low = sum(frac_zeros <= frac_zeros_low_threshold),
      max_mean = max(mean_condition)
    ) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    dplyr::filter(n_conditions_frac_zeros_low >= 2) %>%
    dplyr::filter(max_mean > max_mean_condition_lognorm_count)

  target_genes <-
    summary_counts_normalized_long %>%
    pull(gene_id) %>%
    unique()
  keep <- rownames(counts(dds)) %in% target_genes
  dds <- dds[keep, ]
  return(dds)
}



get_deseq_data <- function(target_tissue, design_matrix, gene_metadata, meta_dt, data_dir, tx2knownGene) {
  scaled_tpm_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_scaled_tpm.tsv")
    )
  normalized_counts_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_counts_normalized.tsv")
    )
  normalized_counts_filtered_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_counts_normalized_filtered.tsv")
    )


  samples_dir <- file.path(
    data_dir,
    meta_dt %>%
      pull(name)
  )
  cdna_dir <- file.path(samples_dir)
  cdna_files <- file.path(cdna_dir, "abundance.h5")
  names(cdna_files) <-
    meta_dt %>%
    pull(sample)

  # bias corrected counts without an offset.  Corrects for potential differential isoform usage

  txi_scaledTPM <- tximport(cdna_files,
    type = "kallisto",
    tx2gene = tx2knownGene,
    countsFromAbundance = c("scaledTPM")
  )


  scaled_tpm <-
    txi_scaledTPM$counts %>%
    as_tibble(rownames = "gene_id")


  scaled_tpm <-
    left_join(scaled_tpm,
      gene_metadata %>%
        dplyr::select(gene_name, gene_id),
      by = "gene_id"
    ) %>%
    relocate(gene_name, .after = gene_id)




  txi <- tximport(cdna_files,
    type = "kallisto",
    tx2gene = tx2knownGene,
    countsFromAbundance = c("no")
  )


  dds <-
    DESeqDataSetFromTximport(
      txi,
      meta_dt %>%
        column_to_rownames("sample"),
      design = design_matrix
    )

  dds <- estimateSizeFactors(dds, type = "poscounts")


  save_counts_table(
    txi_scaledTPM$counts,
    gene_metadata,
    scaled_tpm_filepath
  )

  save_counts_table(
    counts(dds, normalized = T),
    gene_metadata,
    normalized_counts_filepath
  )

  dds <- filter_genes(dds, meta_dt)

  save_counts_table(
    counts(dds, normalized = T),
    gene_metadata,
    normalized_counts_filtered_filepath
  )

  design_matrix_reduced <-
    model.matrix(
      ~1,
      meta_dt %>%
        column_to_rownames("sample")
    )
  colnames(design_matrix_reduced) <- c("intercept")
  dds <- DESeq(dds, test = "LRT", reduced = design_matrix_reduced)

  return(dds)
}

# We use apeglm shrinkage to preserve the size of large LFC and compute s-values
# (the estimated rate of false sign). Shrinkage is useful for ranking and visualization,
# without the need for arbitrary filters on low count genes. https://doi.org/10.1093/bioinformatics/bty895

# The local false sign rate FSR is defined as the posterior probability that the posterior mode (MAP)
# of beta (log2FC) is of a biologically significant effect size (i.e. abs(beta) > lfcThreshold )

get_lfc <- function(dds, coef_name, lfc_threshold, svalue_threshold) {
  lfc <-
    lfcShrink(dds,
      coef = coef_name,
      type = "apeglm",
      svalue = T,
      lfcThreshold = lfc_threshold
    )

  # plotMA(lfc, ylim = c(-5, 5))
  # abline(h = c(-1, 1), col = "dodgerblue", lwd = 2)

  lfc <-
    lfc %>%
    as_tibble() %>%
    dplyr::select(-baseMean) %>%
    mutate(
      keep = ((abs(log2FoldChange) > lfc_threshold) & (svalue < svalue_threshold))
    ) %>%
    set_names(c(
      paste0(
        c("lfc_", "lfc_se_", "svalue_", "keep_"),
        coef_name
      )
    ))

  return(lfc)
}

get_lfc_estimates <- function(dds, gene_metadata, coef_names, lfc_threshold, svalue_threshold, batch_threshold) {
  lfcs <-
    map_dfc(coef_names,
      get_lfc,
      dds = dds,
      lfc_threshold = lfc_threshold,
      svalue_threshold = svalue_threshold
    )

  # coef_names_batch <- c(paste0(c("intercept", coef_names), "_batch"), paste0("SE_", c("intercept", coef_names), "_batch"))

  lfcs <-
    bind_cols(
      mcols(dds)[, c("baseMean", "baseVar", "dispOutlier", "maxCooks", "intercept", "SE_intercept", coef_names)] %>%
        as_tibble(rownames = "gene_id") %>%
        dplyr::select(gene_id, everything()),
      lfcs
    )



  lfcs <-
    lfcs %>%
    dplyr::select(
      gene_id, baseMean,
      intercept, paste0("lfc_", coef_names),
      paste0("svalue_", coef_names), everything()
    )

  lfcs <-
    lfcs %>%
    left_join(., gene_metadata, by = "gene_id") %>%
    dplyr::select(gene_name, everything()) %>%
    mutate(
      gene_name =
        scater::uniquifyFeatureNames(
          ID = gene_id,
          names = gene_name
        )
    )

  lfcs <-
    lfcs %>%
    mutate(
      total_batch = abs(intercept_batch) + abs(niche_batch) + abs(age_batch) + abs(engraf_batch),
      keep_engraf = case_when(
        keep_engraf & engraf >= 0 & (engraf + engraf_batch > batch_threshold) ~ "U",
        keep_engraf & engraf < 0 & (engraf + engraf_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      keep_age = case_when(
        keep_age & age >= 0 & (age + age_batch > batch_threshold) ~ "U",
        keep_age & age < 0 & (age + age_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      keep_niche = case_when(
        keep_niche & niche >= 0 & (niche + niche_batch > batch_threshold) ~ "U",
        keep_niche & niche < 0 & (niche + niche_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      EAN = paste0(
        keep_engraf,
        keep_age,
        keep_niche
      )
    ) %>%
    dplyr::select(-keep_intercept_batch, -keep_engraf_batch, -keep_age_batch, -keep_niche_batch) %>%
    dplyr::select(
      gene_name, EAN, intercept:lfc_niche, intercept_batch:niche_batch,
      svalue_engraf:svalue_niche, keep_engraf, keep_age, keep_niche, total_batch, everything()
    )

  lfcs_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      "leg_engraftment_moderated_lfc_1_svalue_05_marked.tsv"
    )
  write_tsv(lfcs, lfcs_filepath)


  return(lfcs)
}


plot_volcano <- function(data, xvar, yvar, title, lfc_threshold, svalue_thresh, figures_dir) {
  gv <-
    EnhancedVolcano(data,
      lab = data %>% pull(gene_name),
      xlab = bquote(~ italic(Moderated) ~ Log[2] ~ FC),
      ylab = bquote(~ -Log[10] ~ italic(svalue)),
      x = xvar,
      y = yvar,
      col = c("grey30", "forestgreen", "red2", "royalblue"),
      pCutoff = svalue_thresh,
      FCcutoff = lfc_threshold,
      #  legend = c("NS", "Log2FC", "svalue", "svalue & Log2FC"),
      legendLabels = c(
        "NS",
        expression(Log[2] ~ FC),
        "svalue",
        expression(s - value ~ and ~ log[2] ~ FC)
      )
    )

  ggsave(file.path(figures_dir, paste0(title, "_volcano.pdf")), gv)
  return(gv)
}

plot_heatmap <- function(target_genes, title, show_genes = F, figures_dir) {
  
  counts_normalized_subset <-
    counts_normalized%>%
    mutate_if(is.numeric, ~ log1p(.)) %>%
    dplyr::filter(gene_id %in% target_genes)%>%
  mutate(
    gene_name =
      scater::uniquifyFeatureNames(
        ID = gene_id,
        names = gene_name
      )
  ) %>%
  dplyr::select(-gene_id) %>%
  column_to_rownames("gene_name") %>%
  as.matrix()
  
  
  gh <-
    pheatmap(counts_normalized_subset,
      color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "Spectral")))(100),
      scale = "row",
      cluster_rows = T,
      show_rownames = show_genes,
      treeheight_row = 0,
      fontsize_col = 4,
      fontsize_row = 6,
      angle_col = "45",
      cluster_cols = FALSE,
      annotation_col = meta_df
    )
  ggsave(file.path(figures_dir, paste0(title, "_heatmap_expression.pdf")), gh)

  return(gh)
}


plot_gene <- function(gene, figures_dir) {
  counts_normalized_long_subset <-
    counts_normalized_long %>%
    dplyr::filter(gene_id == gene)

  lfcs_subset <-
    lfcs %>%
    dplyr::filter(gene_id == gene)

  gene_name <- paste(counts_normalized_long_subset$gene_name[1], lfcs_subset$EAN[1], sep = "_")

  lfcs_label <-
    lfcs_subset %>%
    dplyr::select(lfc_engraf:niche_batch) %>%
    mutate(across(everything(), ~ round(.x, 2))) %>%
    relocate(intercept_batch, .after = last_col()) %>%
    unite("value", sep = ",") %>%
    pull(value)

  title_lab <- paste0(gene_name, ":", lfcs_label)
  p <- ggplot(
    counts_normalized_long_subset,
    aes(x = time, y = lognorm_counts)
  ) +
    geom_point(aes(color = donor)) +
    geom_line(aes(group = donor)) +
    labs(title = title_lab) +
    facet_grid(batch ~ age)
  ggsave(file.path(figures_dir, paste0(gene_name, "_lognorm_counts.pdf")), p)
  return(p)
}

Set filepaths

data_dir <- "./data/kallisto_out"
gene_metadata_filepath <- "./data/metadata/mm10_ens97_gene_meta.txt"
metadata_filepath <- "./data/metadata/samples_metadata_leg.tsv"

Get tx id versions

suppressPackageStartupMessages(library(AnnotationHub))
ah <- AnnotationHub()
DEPRECATION: As of AnnotationHub (>2.23.2), default caching location has changed.
  Problematic cache: /home/rick/.cache/AnnotationHub
  See https://bioconductor.org/packages/devel/bioc/vignettes/AnnotationHub/inst/doc/TroubleshootingTheCache.html#default-caching-location-update
snapshotDate(): 2021-05-18
edb <- ah[["AH73905"]]
loading from cache
seqlevelsStyle(edb) <- "UCSC"
# standard_chroms <- standardChromosomes(edb, species = organism(edb))
tx_meta <-
  transcripts(edb,
    columns = c(
      "tx_id_version",
      "gene_id",
      "gene_name",
      "gene_id_version"
    ),
    return.type = "DataFrame"
  )
tx2knownGene <-
  tx_meta %>%
  as_tibble() %>%
  dplyr::select(tx_id_version, gene_id)
meta_dt <-
  read_tsv(file.path(metadata_filepath)) %>%
  # arrange(tissue, age, time) %>%
  mutate(
    condition = paste(age, time, sep = "_"),
    age = factor(age, levels = c("young", "aged")),
    time = factor(time, levels = c("T0", "T21")),
    batch = factor(batch, levels = c(1, 2))
  )
Rows: 21 Columns: 8── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (7): name, sample, time, tissue, age, donor, type
dbl (1): batch
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
meta_dt
gene_metadata <-
  read_csv(gene_metadata_filepath) %>%
  dplyr::select(-gene_id_version)
Rows: 55487 Columns: 8── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): gene_name, gene_id, seqnames, gene_biotype, gene_id_version, description
dbl (2): gene_len, perc_gene_gc
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gene_metadata <-
  left_join(tx_meta %>% as_tibble() %>%
    group_by(gene_id) %>%
    dplyr::count(gene_name) %>%
    dplyr::select(-n),
  gene_metadata %>%
    dplyr::select(-gene_name),
  by = "gene_id"
  ) %>%
  mutate(
    gene_name =
      scater::uniquifyFeatureNames(
        ID = gene_id,
        names = gene_name
      )
  )
gene_metadata
design_matrix <-
  model.matrix(
    ~ time * age * batch,
    meta_dt %>%
      column_to_rownames("sample")
  )
colnames(design_matrix) <- c(
  "intercept",
  "engraf",
  "age",
  "intercept_batch",
  "niche",
  "engraf_batch",
  "age_batch",
  "niche_batch"
)

design_matrix <- design_matrix[, c(1, 2, 3, 5, 4, 6, 7, 8)]
design_matrix
            intercept engraf age niche intercept_batch engraf_batch age_batch niche_batch
yng_t0_01           1      0   0     0               0            0         0           0
yng_t0_02           1      0   0     0               0            0         0           0
yng_t0_03           1      0   0     0               0            0         0           0
yng_t0_04           1      0   0     0               1            0         0           0
yng_t0_05           1      0   0     0               1            0         0           0
yng_t0_06           1      0   0     0               1            0         0           0
yng_t21_01          1      1   0     0               0            0         0           0
yng_t21_02          1      1   0     0               0            0         0           0
yng_t21_04          1      1   0     0               1            1         0           0
yng_t21_05          1      1   0     0               1            1         0           0
yng_t21_06          1      1   0     0               1            1         0           0
aged_t0_01          1      0   1     0               0            0         0           0
aged_t0_02          1      0   1     0               0            0         0           0
aged_t0_03          1      0   1     0               1            0         1           0
aged_t0_04          1      0   1     0               1            0         1           0
aged_t0_05          1      0   1     0               1            0         1           0
aged_t21_01         1      1   1     1               0            0         0           0
aged_t21_02         1      1   1     1               0            0         0           0
aged_t21_03         1      1   1     1               1            1         1           1
aged_t21_04         1      1   1     1               1            1         1           1
aged_t21_05         1      1   1     1               1            1         1           1

Load data

dds <- get_deseq_data("leg", design_matrix, gene_metadata, meta_dt, data_dir, tx2knownGene)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
summarizing abundance
summarizing counts
summarizing length
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 
summarizing abundance
summarizing counts
summarizing length
using counts and average transcript lengths from tximport
using 'avgTxLength' from assays(dds), correcting for library size
`summarise()` has grouped output by 'gene_id'. You can override using the `.groups` argument.using supplied model matrix
using pre-existing normalization factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
dds
class: DESeqDataSet 
dim: 17133 21 
metadata(1): version
assays(6): counts avgTxLength ... H cooks
rownames(17133): ENSMUSG00000000001 ENSMUSG00000000028 ... ENSMUSG00000118458 ENSMUSG00000118461
rowData names(33): baseMean baseVar ... deviance maxCooks
colnames(21): yng_t0_01 yng_t0_02 ... aged_t21_04 aged_t21_05
colData names(8): name time ... batch condition
summary_dt <-
  colSums(assay(dds)) %>%
  tibble::enframe(name = "sample", value = "counts")
summary_dt
plotDispEsts(dds)

res <- results(dds)
W <- res$stat
maxCooks <- apply(assays(dds)[["cooks"]], 1, max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx],
  xlab = "rank of Wald statistic",
  ylab = "maximum Cook's distance per gene",
  ylim = c(0, 5), cex = .4, col = rgb(0, 0, 0, .3)
)
m <- ncol(dds)
p <- 3
abline(h = qf(.99, p, m - p))

plot(res$baseMean + 1, -log10(res$pvalue),
  log = "x", xlab = "mean of normalized counts",
  ylab = expression(-log[10](pvalue)),
  ylim = c(0, 30),
  cex = .4, col = rgb(0, 0, 0, .3)
)

lfc_threshold <- 1
svalue_thresh <- 0.05
batch_threshold <- 1
coef_names <- colnames(design_matrix)[-1]
lfcs <-
  get_lfc_estimates(
    dds,
    gene_metadata,
    coef_names,
    lfc_threshold,
    svalue_thresh,
    batch_threshold
  )
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
computing FSOS 'false sign or small' s-values (T=1)
lfcs
table(lfcs$EAN)

  DDN   DDU   DNN   DNU   NDN   NDU   NND   NNN   NNU   NUD   NUN   UND   UNN   UNU   UUD   UUN 
   35    22    75    12    93    55   341 15557   163    58    64    91   496     1    50    20 
lfcs_hits <-
  lfcs %>%
  dplyr::filter(lfcs$EAN != "NNN")
lfcs_hits
lfcs_aging_up <-
  lfcs_hits %>%
  dplyr::filter(keep_age == "U")
lfcs_aging_up
lfcs_aging_down <-
  lfcs_hits %>%
  dplyr::filter(keep_age == "D")
lfcs_aging_down
lfcs_niche_up <-
  lfcs_hits %>%
  dplyr::filter(keep_niche == "U")
lfcs_niche_up
lfcs_niche_down <-
  lfcs_hits %>%
  dplyr::filter(keep_niche == "D")
lfcs_niche_down

Gene Expression plots

normalized_counts_filtered_filepath <-
  file.path(
    "./tables/bulk_rnaseq",
    "leg_engraftment_counts_normalized_filtered.tsv"
  )

counts_normalized <- read_tsv(normalized_counts_filtered_filepath)

counts_normalized_long <-
  counts_normalized %>%
  pivot_longer(cols = yng_t0_01:aged_t21_05, names_to = "sample", values_to = "counts") %>%
  left_join(., meta_dt %>% dplyr::select(sample, time, age, donor, batch, condition), by = "sample") %>%
  mutate(lognorm_counts = log1p(counts))
genes_to_plot <-
  lfcs_hits%>%
  arrange(total_batch) %>%
  pull(gene_id)
pp[1:10]
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]

[[6]]

[[7]]

[[8]]

[[9]]

[[10]]

Heatmap plot

meta_df <-
  as.data.frame(colData(dds)[, c("age", "time", "batch")])

aging_up_genes <-
  lfcs_aging_up%>%
  arrange(total_batch) %>%
  pull(gene_id)

aging_down_genes <-
  lfcs_aging_down%>%
  arrange(total_batch) %>%
  pull(gene_id)


niche_up_genes <-
  lfcs_niche_up%>%
  arrange(total_batch) %>%
  pull(gene_id)

niche_down_genes <-
  lfcs_niche_down%>%
  arrange(total_batch) %>%
  pull(gene_id)
figures_dir <- "./figures/bulk_rnaseq/heatmaps"
plot_heatmap(aging_up_genes, title = "lognorm_counts_aging_up", show_genes = T, figures_dir)

plot_heatmap(aging_down_genes, title = "lognorm_counts_aging_down", show_genes = T, figures_dir)

plot_heatmap(niche_down_genes, title = "lognorm_counts_niche_down", show_genes = T, figures_dir)

plot_heatmap(niche_up_genes, title = "lognorm_counts_niche_up", show_genes = T, figures_dir)

Volcano Plots

svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_niche", "svalue_niche", "niche", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

plot_volcano(lfcs, "lfc_age", "svalue_age", "age", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

plot_volcano(lfcs, "lfc_engraf", "svalue_engraf", "engraf", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_niche_batch", "svalue_niche_batch", "niche_batch", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_age_batch", "svalue_age_batch", "age_batch", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_engraf_batch", "svalue_engraf_batch", "engraf_batch", lfc_threshold, svalue_thresh, figures_dir)
Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_intercept_batch", "svalue_intercept_batch", "intercept_batch", lfc_threshold, svalue_thresh, figures_dir)
One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...Ignoring unknown parameters: xlim, ylimSaving 7 x 7 in image

sessionInfo()
R version 4.1.3 (2022-03-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] AnnotationHub_3.0.2         BiocFileCache_2.0.0         dbplyr_2.1.1                edgeR_3.34.1                limma_3.48.3                RColorBrewer_1.1-2          PoiClaClu_1.0.2.1           scater_1.20.1              
 [9] scuttle_1.2.1               SingleCellExperiment_1.14.1 ensembldb_2.16.4            AnnotationFilter_1.16.0     GenomicFeatures_1.44.2      AnnotationDbi_1.54.1        pheatmap_1.0.12             EnhancedVolcano_1.10.0     
[17] ggrepel_0.9.1               rhdf5_2.36.0                apeglm_1.14.0               glmpca_0.2.0                cowplot_1.1.1               vsn_3.60.0                  DESeq2_1.32.0               SummarizedExperiment_1.22.0
[25] Biobase_2.52.0              MatrixGenerics_1.4.3        matrixStats_0.61.0          GenomicRanges_1.44.0        GenomeInfoDb_1.28.4         IRanges_2.26.0              S4Vectors_0.30.2            BiocGenerics_0.38.0        
[33] tximport_1.20.0             forcats_0.5.1               stringr_1.4.0               dplyr_1.0.8                 purrr_0.3.4                 readr_2.1.2                 tidyr_1.2.0                 tibble_3.1.6               
[41] ggplot2_3.3.5               tidyverse_1.3.1            

loaded via a namespace (and not attached):
  [1] utf8_1.2.2                    tidyselect_1.1.2              RSQLite_2.2.10                grid_4.1.3                    BiocParallel_1.26.2           munsell_0.5.0                 ScaledMatrix_1.0.0            ragg_1.2.2                   
  [9] preprocessCore_1.54.0         withr_2.5.0                   colorspace_2.0-3              filelock_1.0.2                ggalt_0.4.0                   knitr_1.37                    rstudioapi_0.13               Rttf2pt1_1.3.10              
 [17] labeling_0.4.2                bbmle_1.0.24                  GenomeInfoDbData_1.2.6        farver_2.1.0                  bit64_4.0.5                   coda_0.19-4                   vctrs_0.3.8                   generics_0.1.2               
 [25] xfun_0.30                     R6_2.5.1                      ggbeeswarm_0.6.0              rsvd_1.0.5                    locfit_1.5-9.5                bitops_1.0-7                  rhdf5filters_1.4.0            cachem_1.0.6                 
 [33] DelayedArray_0.18.0           assertthat_0.2.1              vroom_1.5.7                   promises_1.2.0.1              BiocIO_1.2.0                  scales_1.1.1                  beeswarm_0.4.0                gtable_0.3.0                 
 [41] beachmat_2.8.1                ash_1.0-15                    affy_1.70.0                   rlang_1.0.2                   genefilter_1.74.1             systemfonts_1.0.4             splines_4.1.3                 lazyeval_0.2.2               
 [49] rtracklayer_1.52.1            extrafontdb_1.0               broom_0.7.12                  BiocManager_1.30.16           yaml_2.3.5                    modelr_0.1.8                  backports_1.4.1               httpuv_1.6.5                 
 [57] extrafont_0.17                tools_4.1.3                   affyio_1.62.0                 ellipsis_0.3.2                jquerylib_0.1.4               Rcpp_1.0.8.3                  plyr_1.8.6                    sparseMatrixStats_1.4.2      
 [65] progress_1.2.2                zlibbioc_1.38.0               RCurl_1.98-1.6                prettyunits_1.1.1             viridis_0.6.2                 haven_2.4.3                   fs_1.5.2                      magrittr_2.0.2               
 [73] reprex_2.0.1                  mvtnorm_1.1-3                 ProtGenerics_1.24.0           mime_0.12                     hms_1.1.1                     evaluate_0.15                 xtable_1.8-4                  XML_3.99-0.9                 
 [81] emdbook_1.3.12                readxl_1.3.1                  gridExtra_2.3                 compiler_4.1.3                biomaRt_2.48.3                bdsmatrix_1.3-4               maps_3.4.0                    KernSmooth_2.23-20           
 [89] crayon_1.5.0                  htmltools_0.5.2               later_1.3.0                   tzdb_0.2.0                    geneplotter_1.70.0            lubridate_1.8.0               DBI_1.1.2                     proj4_1.0-11                 
 [97] MASS_7.3-55                   rappdirs_0.3.3                Matrix_1.4-0                  cli_3.2.0                     pkgconfig_2.0.3               GenomicAlignments_1.28.0      numDeriv_2016.8-1.1           xml2_1.3.3                   
[105] annotate_1.70.0               vipor_0.4.5                   bslib_0.3.1                   XVector_0.32.0                rvest_1.0.2                   digest_0.6.29                 Biostrings_2.60.2             rmarkdown_2.13               
[113] cellranger_1.1.0              DelayedMatrixStats_1.14.3     restfulr_0.0.13               curl_4.3.2                    shiny_1.7.1                   Rsamtools_2.8.0               rjson_0.2.21                  lifecycle_1.0.1              
[121] jsonlite_1.8.0                Rhdf5lib_1.14.2               BiocNeighbors_1.10.0          viridisLite_0.4.0             fansi_1.0.2                   pillar_1.7.0                  lattice_0.20-45               ggrastr_1.0.1                
[129] KEGGREST_1.32.0               fastmap_1.1.0                 httr_1.4.2                    survival_3.2-13               interactiveDisplayBase_1.30.0 glue_1.6.2                    png_0.1-7                     BiocVersion_3.13.1           
[137] bit_4.0.4                     stringi_1.7.6                 sass_0.4.0                    blob_1.2.2                    textshaping_0.3.6             BiocSingular_1.8.1            memoise_2.0.1                 irlba_2.3.5                  
---
title: "Mouse muscle stem cell engraftment: Bulk RNA-seq gene differential analysis"
author: 
- name: Rick Farouni
  affiliation:
  - &cruk Génome Québec Innovation Centre, McGill University, Montreal, Canada
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---

# Prepare analysis workflow


```{r setup}
# knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
# options(
#   readr.show_progress = FALSE,
#   digits = 2,
#   scipen = 8,
#   future.globals.maxSize = +Inf
# )
knitr::opts_knit$set(root.dir = getwd())
```

```{r message=FALSE, warning=FALSE}
library(tidyverse)
library(tximport)
library(DESeq2)
library(vsn)
library(cowplot)
library(glmpca)
library(apeglm)
library(rhdf5)
library(EnhancedVolcano)
library(pheatmap)
library(ensembldb)
library(scater)
library(PoiClaClu)
library(RColorBrewer)
require(edgeR)
theme_set(theme_bw())
```


## Auxiliary functions



```{r}
save_counts_table <- function(counts, gene_metadata, file_name) {
  counts <-
    counts %>%
    as_tibble(rownames = "gene_id")

  counts <-
    left_join(counts,
      gene_metadata %>%
        dplyr::select(gene_name, gene_id),
      by = "gene_id"
    ) %>%
    relocate(gene_name, .after = gene_id)

  write_tsv(counts, file_name)
}

filter_genes <- function(dds, meta_dt, max_mean_condition_lognorm_count = 3, frac_zeros_low_threshold = 0.2) {
  counts_normalized <-
    counts(dds, normalized = T) %>%
    as_tibble(rownames = "gene_id")

  counts_normalized_long <-
    counts_normalized %>%
    pivot_longer(cols = -gene_id, names_to = "sample", values_to = "normalized_counts") %>%
    left_join(., meta_dt %>% dplyr::select(sample, condition), by = "sample") %>%
    mutate(log_normalized_counts = log1p(normalized_counts))

  summary_counts_normalized_long <-
    counts_normalized_long %>%
    group_by(gene_id, condition) %>%
    summarize(
      mean_condition = mean(log_normalized_counts),
      frac_zeros = mean(normalized_counts == 0)
    ) %>%
    group_by(gene_id) %>%
    mutate(
      n_conditions_frac_zeros_low = sum(frac_zeros <= frac_zeros_low_threshold),
      max_mean = max(mean_condition)
    ) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    dplyr::filter(n_conditions_frac_zeros_low >= 2) %>%
    dplyr::filter(max_mean > max_mean_condition_lognorm_count)

  target_genes <-
    summary_counts_normalized_long %>%
    pull(gene_id) %>%
    unique()
  keep <- rownames(counts(dds)) %in% target_genes
  dds <- dds[keep, ]
  return(dds)
}



get_deseq_data <- function(target_tissue, design_matrix, gene_metadata, meta_dt, data_dir, tx2knownGene) {
  scaled_tpm_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_scaled_tpm.tsv")
    )
  normalized_counts_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_counts_normalized.tsv")
    )
  normalized_counts_filtered_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      paste0(target_tissue, "_engraftment_counts_normalized_filtered.tsv")
    )


  samples_dir <- file.path(
    data_dir,
    meta_dt %>%
      pull(name)
  )
  cdna_dir <- file.path(samples_dir)
  cdna_files <- file.path(cdna_dir, "abundance.h5")
  names(cdna_files) <-
    meta_dt %>%
    pull(sample)

  # bias corrected counts without an offset.  Corrects for potential differential isoform usage

  txi_scaledTPM <- tximport(cdna_files,
    type = "kallisto",
    tx2gene = tx2knownGene,
    countsFromAbundance = c("scaledTPM")
  )


  scaled_tpm <-
    txi_scaledTPM$counts %>%
    as_tibble(rownames = "gene_id")


  scaled_tpm <-
    left_join(scaled_tpm,
      gene_metadata %>%
        dplyr::select(gene_name, gene_id),
      by = "gene_id"
    ) %>%
    relocate(gene_name, .after = gene_id)




  txi <- tximport(cdna_files,
    type = "kallisto",
    tx2gene = tx2knownGene,
    countsFromAbundance = c("no")
  )


  dds <-
    DESeqDataSetFromTximport(
      txi,
      meta_dt %>%
        column_to_rownames("sample"),
      design = design_matrix
    )

  dds <- estimateSizeFactors(dds, type = "poscounts")


  save_counts_table(
    txi_scaledTPM$counts,
    gene_metadata,
    scaled_tpm_filepath
  )

  save_counts_table(
    counts(dds, normalized = T),
    gene_metadata,
    normalized_counts_filepath
  )

  dds <- filter_genes(dds, meta_dt)

  save_counts_table(
    counts(dds, normalized = T),
    gene_metadata,
    normalized_counts_filtered_filepath
  )

  design_matrix_reduced <-
    model.matrix(
      ~1,
      meta_dt %>%
        column_to_rownames("sample")
    )
  colnames(design_matrix_reduced) <- c("intercept")
  dds <- DESeq(dds, test = "LRT", reduced = design_matrix_reduced)

  return(dds)
}

# We use apeglm shrinkage to preserve the size of large LFC and compute s-values
# (the estimated rate of false sign). Shrinkage is useful for ranking and visualization,
# without the need for arbitrary filters on low count genes. https://doi.org/10.1093/bioinformatics/bty895

# The local false sign rate FSR is defined as the posterior probability that the posterior mode (MAP)
# of beta (log2FC) is of a biologically significant effect size (i.e. abs(beta) > lfcThreshold )

get_lfc <- function(dds, coef_name, lfc_threshold, svalue_threshold) {
  lfc <-
    lfcShrink(dds,
      coef = coef_name,
      type = "apeglm",
      svalue = T,
      lfcThreshold = lfc_threshold
    )

  # plotMA(lfc, ylim = c(-5, 5))
  # abline(h = c(-1, 1), col = "dodgerblue", lwd = 2)

  lfc <-
    lfc %>%
    as_tibble() %>%
    dplyr::select(-baseMean) %>%
    mutate(
      keep = ((abs(log2FoldChange) > lfc_threshold) & (svalue < svalue_threshold))
    ) %>%
    set_names(c(
      paste0(
        c("lfc_", "lfc_se_", "svalue_", "keep_"),
        coef_name
      )
    ))

  return(lfc)
}

get_lfc_estimates <- function(dds, gene_metadata, coef_names, lfc_threshold, svalue_threshold, batch_threshold) {
  lfcs <-
    map_dfc(coef_names,
      get_lfc,
      dds = dds,
      lfc_threshold = lfc_threshold,
      svalue_threshold = svalue_threshold
    )

  # coef_names_batch <- c(paste0(c("intercept", coef_names), "_batch"), paste0("SE_", c("intercept", coef_names), "_batch"))

  lfcs <-
    bind_cols(
      mcols(dds)[, c("baseMean", "baseVar", "dispOutlier", "maxCooks", "intercept", "SE_intercept", coef_names)] %>%
        as_tibble(rownames = "gene_id") %>%
        dplyr::select(gene_id, everything()),
      lfcs
    )



  lfcs <-
    lfcs %>%
    dplyr::select(
      gene_id, baseMean,
      intercept, paste0("lfc_", coef_names),
      paste0("svalue_", coef_names), everything()
    )

  lfcs <-
    lfcs %>%
    left_join(., gene_metadata, by = "gene_id") %>%
    dplyr::select(gene_name, everything()) %>%
    mutate(
      gene_name =
        scater::uniquifyFeatureNames(
          ID = gene_id,
          names = gene_name
        )
    )

  lfcs <-
    lfcs %>%
    mutate(
      total_batch = abs(intercept_batch) + abs(niche_batch) + abs(age_batch) + abs(engraf_batch),
      keep_engraf = case_when(
        keep_engraf & engraf >= 0 & (engraf + engraf_batch > batch_threshold) ~ "U",
        keep_engraf & engraf < 0 & (engraf + engraf_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      keep_age = case_when(
        keep_age & age >= 0 & (age + age_batch > batch_threshold) ~ "U",
        keep_age & age < 0 & (age + age_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      keep_niche = case_when(
        keep_niche & niche >= 0 & (niche + niche_batch > batch_threshold) ~ "U",
        keep_niche & niche < 0 & (niche + niche_batch < -batch_threshold) ~ "D",
        TRUE ~ "N"
      ),
      EAN = paste0(
        keep_engraf,
        keep_age,
        keep_niche
      )
    ) %>%
    dplyr::select(-keep_intercept_batch, -keep_engraf_batch, -keep_age_batch, -keep_niche_batch) %>%
    dplyr::select(
      gene_name, EAN, intercept:lfc_niche, intercept_batch:niche_batch,
      svalue_engraf:svalue_niche, keep_engraf, keep_age, keep_niche, total_batch, everything()
    )

  lfcs_filepath <-
    file.path(
      "./tables/bulk_rnaseq",
      "leg_engraftment_moderated_lfc_1_svalue_05_marked.tsv"
    )
  write_tsv(lfcs, lfcs_filepath)


  return(lfcs)
}


plot_volcano <- function(data, xvar, yvar, title, lfc_threshold, svalue_thresh, figures_dir) {
  gv <-
    EnhancedVolcano(data,
      lab = data %>% pull(gene_name),
      xlab = bquote(~ italic(Moderated) ~ Log[2] ~ FC),
      ylab = bquote(~ -Log[10] ~ italic(svalue)),
      x = xvar,
      y = yvar,
      col = c("grey30", "forestgreen", "red2", "royalblue"),
      pCutoff = svalue_thresh,
      FCcutoff = lfc_threshold,
      #  legend = c("NS", "Log2FC", "svalue", "svalue & Log2FC"),
      legendLabels = c(
        "NS",
        expression(Log[2] ~ FC),
        "svalue",
        expression(s - value ~ and ~ log[2] ~ FC)
      )
    )

  ggsave(file.path(figures_dir, paste0(title, "_volcano.pdf")), gv)
  return(gv)
}

plot_heatmap <- function(target_genes, title, show_genes = F, figures_dir) {
  
  counts_normalized_subset <-
    counts_normalized%>%
    mutate_if(is.numeric, ~ log1p(.)) %>%
    dplyr::filter(gene_id %in% target_genes)%>%
  mutate(
    gene_name =
      scater::uniquifyFeatureNames(
        ID = gene_id,
        names = gene_name
      )
  ) %>%
  dplyr::select(-gene_id) %>%
  column_to_rownames("gene_name") %>%
  as.matrix()
  
  
  gh <-
    pheatmap(counts_normalized_subset,
      color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 11, name = "Spectral")))(100),
      scale = "row",
      cluster_rows = T,
      show_rownames = show_genes,
      treeheight_row = 0,
      fontsize_col = 4,
      fontsize_row = 6,
      angle_col = "45",
      cluster_cols = FALSE,
      annotation_col = meta_df
    )
  ggsave(file.path(figures_dir, paste0(title, "_heatmap_expression.pdf")), gh)

  return(gh)
}


plot_gene <- function(gene, figures_dir) {
  counts_normalized_long_subset <-
    counts_normalized_long %>%
    dplyr::filter(gene_id == gene)

  lfcs_subset <-
    lfcs %>%
    dplyr::filter(gene_id == gene)

  gene_name <- paste(counts_normalized_long_subset$gene_name[1], lfcs_subset$EAN[1], sep = "_")

  lfcs_label <-
    lfcs_subset %>%
    dplyr::select(lfc_engraf:niche_batch) %>%
    mutate(across(everything(), ~ round(.x, 2))) %>%
    relocate(intercept_batch, .after = last_col()) %>%
    unite("value", sep = ",") %>%
    pull(value)

  title_lab <- paste0(gene_name, ":", lfcs_label)
  p <- ggplot(
    counts_normalized_long_subset,
    aes(x = time, y = lognorm_counts)
  ) +
    geom_point(aes(color = donor)) +
    geom_line(aes(group = donor)) +
    labs(title = title_lab) +
    facet_grid(batch ~ age)
  ggsave(file.path(figures_dir, paste0(gene_name, "_lognorm_counts.pdf")), p)
  return(p)
}
```  


## Set filepaths

```{r}
data_dir <- "./data/kallisto_out"
gene_metadata_filepath <- "./data/metadata/mm10_ens97_gene_meta.txt"
metadata_filepath <- "./data/metadata/samples_metadata_leg.tsv"
```



## Get tx id versions

```{r}
suppressPackageStartupMessages(library(AnnotationHub))
ah <- AnnotationHub()
edb <- ah[["AH73905"]]
seqlevelsStyle(edb) <- "UCSC"
# standard_chroms <- standardChromosomes(edb, species = organism(edb))
tx_meta <-
  transcripts(edb,
    columns = c(
      "tx_id_version",
      "gene_id",
      "gene_name",
      "gene_id_version"
    ),
    return.type = "DataFrame"
  )
tx2knownGene <-
  tx_meta %>%
  as_tibble() %>%
  dplyr::select(tx_id_version, gene_id)
```


```{r}
meta_dt <-
  read_tsv(file.path(metadata_filepath)) %>%
  # arrange(tissue, age, time) %>%
  mutate(
    condition = paste(age, time, sep = "_"),
    age = factor(age, levels = c("young", "aged")),
    time = factor(time, levels = c("T0", "T21")),
    batch = factor(batch, levels = c(1, 2))
  )
meta_dt
```


```{r}
gene_metadata <-
  read_csv(gene_metadata_filepath) %>%
  dplyr::select(-gene_id_version)

gene_metadata <-
  left_join(tx_meta %>% as_tibble() %>%
    group_by(gene_id) %>%
    dplyr::count(gene_name) %>%
    dplyr::select(-n),
  gene_metadata %>%
    dplyr::select(-gene_name),
  by = "gene_id"
  ) %>%
  mutate(
    gene_name =
      scater::uniquifyFeatureNames(
        ID = gene_id,
        names = gene_name
      )
  )
gene_metadata
```


```{r}
design_matrix <-
  model.matrix(
    ~ time * age * batch,
    meta_dt %>%
      column_to_rownames("sample")
  )
colnames(design_matrix) <- c(
  "intercept",
  "engraf",
  "age",
  "intercept_batch",
  "niche",
  "engraf_batch",
  "age_batch",
  "niche_batch"
)

design_matrix <- design_matrix[, c(1, 2, 3, 5, 4, 6, 7, 8)]
design_matrix
```


## Load data
  
```{r}
dds <- get_deseq_data("leg", design_matrix, gene_metadata, meta_dt, data_dir, tx2knownGene)
dds
```

```{r}
summary_dt <-
  colSums(assay(dds)) %>%
  tibble::enframe(name = "sample", value = "counts")
summary_dt
```


```{r}
plotDispEsts(dds)
```


```{r}
res <- results(dds)
W <- res$stat
maxCooks <- apply(assays(dds)[["cooks"]], 1, max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx],
  xlab = "rank of Wald statistic",
  ylab = "maximum Cook's distance per gene",
  ylim = c(0, 5), cex = .4, col = rgb(0, 0, 0, .3)
)
m <- ncol(dds)
p <- 3
abline(h = qf(.99, p, m - p))
```

```{r}
plot(res$baseMean + 1, -log10(res$pvalue),
  log = "x", xlab = "mean of normalized counts",
  ylab = expression(-log[10](pvalue)),
  ylim = c(0, 30),
  cex = .4, col = rgb(0, 0, 0, .3)
)
```


```{r}
lfc_threshold <- 1
svalue_thresh <- 0.05
batch_threshold <- 1
coef_names <- colnames(design_matrix)[-1]
lfcs <-
  get_lfc_estimates(
    dds,
    gene_metadata,
    coef_names,
    lfc_threshold,
    svalue_thresh,
    batch_threshold
  )
lfcs
```


```{r}
table(lfcs$EAN)
```

```{r}
lfcs_hits <-
  lfcs %>%
  dplyr::filter(EAN != "NNN")
lfcs_hits
```
```{r}
lfcs_aging_up <-
  lfcs_hits %>%
  dplyr::filter(keep_age == "U")
lfcs_aging_up
```
```{r}
lfcs_aging_down <-
  lfcs_hits %>%
  dplyr::filter(keep_age == "D")
lfcs_aging_down
```

```{r}
lfcs_niche_up <-
  lfcs_hits %>%
  dplyr::filter(keep_niche == "U")
lfcs_niche_up
```

```{r}
lfcs_niche_down <-
  lfcs_hits %>%
  dplyr::filter(keep_niche == "D")
lfcs_niche_down
```
## Gene Expression plots

```{r}
normalized_counts_filtered_filepath <-
  file.path(
    "./tables/bulk_rnaseq",
    "leg_engraftment_counts_normalized_filtered.tsv"
  )

counts_normalized <- read_tsv(normalized_counts_filtered_filepath)

counts_normalized_long <-
  counts_normalized %>%
  pivot_longer(cols = yng_t0_01:aged_t21_05, names_to = "sample", values_to = "counts") %>%
  left_join(., meta_dt %>% dplyr::select(sample, time, age, donor, batch, condition), by = "sample") %>%
  mutate(lognorm_counts = log1p(counts))
```



```{r}
genes_to_plot <-
  lfcs_hits%>%
  arrange(total_batch) %>%
  pull(gene_id)
```


```{r fig.width=12}
figures_dir <- file.path("./figures/bulk_rnaseq/genes")
pp <- map(genes_to_plot, plot_gene, figures_dir)
pp[1:10]
```



### Heatmap plot

```{r}
meta_df <-
  as.data.frame(colData(dds)[, c("age", "time", "batch")])

aging_up_genes <-
  lfcs_aging_up%>%
  arrange(total_batch) %>%
  pull(gene_id)

aging_down_genes <-
  lfcs_aging_down%>%
  arrange(total_batch) %>%
  pull(gene_id)


niche_up_genes <-
  lfcs_niche_up%>%
  arrange(total_batch) %>%
  pull(gene_id)

niche_down_genes <-
  lfcs_niche_down%>%
  arrange(total_batch) %>%
  pull(gene_id)

```






```{r fig.height=30, fig.width=14}
figures_dir <- "./figures/bulk_rnaseq/heatmaps"
plot_heatmap(aging_up_genes, title = "lognorm_counts_aging_up", show_genes = T, figures_dir)
```

```{r fig.height=30, fig.width=14}
plot_heatmap(aging_down_genes, title = "lognorm_counts_aging_down", show_genes = T, figures_dir)
```
```{r fig.height=30, fig.width=14}
plot_heatmap(niche_down_genes, title = "lognorm_counts_niche_down", show_genes = T, figures_dir)
```


```{r fig.height=30, fig.width=14}
plot_heatmap(niche_up_genes, title = "lognorm_counts_niche_up", show_genes = T, figures_dir)
```

## Volcano Plots

```{r fig.width=12}
svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_niche", "svalue_niche", "niche", lfc_threshold, svalue_thresh, figures_dir)
```


```{r fig.width=12}
plot_volcano(lfcs, "lfc_age", "svalue_age", "age", lfc_threshold, svalue_thresh, figures_dir)
```


```{r fig.width=12}
plot_volcano(lfcs, "lfc_engraf", "svalue_engraf", "engraf", lfc_threshold, svalue_thresh, figures_dir)
```



```{r fig.width=12}
svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_niche_batch", "svalue_niche_batch", "niche_batch", lfc_threshold, svalue_thresh, figures_dir)
```

```{r fig.width=12}
svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_age_batch", "svalue_age_batch", "age_batch", lfc_threshold, svalue_thresh, figures_dir)
```
```{r fig.width=12}
svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_engraf_batch", "svalue_engraf_batch", "engraf_batch", lfc_threshold, svalue_thresh, figures_dir)
```
```{r fig.width=12}
svalue_thresh <- 0.05
figures_dir <- "./figures/volcanos"
plot_volcano(lfcs, "lfc_intercept_batch", "svalue_intercept_batch", "intercept_batch", lfc_threshold, svalue_thresh, figures_dir)
```





```{r}
sessionInfo()
```
