Prepare analysis workflow

Set filepaths and parameters

set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
  readr.show_progress = FALSE,
  digits = 2
)

Load packages

suppressPackageStartupMessages({
  library(scater)
  library(scran)
  library(tidyverse)
  library(uwot) # For umap
  library(dbscan)
  library(batchelor)
  library(BiocSingular)
})
theme_set(theme_bw())

Define file paths

data_dir <- "./data"
figures_dir <- file.path("./figures", "qc_cells")

Load data

sce10x <-
  readRDS(file.path(
    data_dir,
    "preprocessed",
    "sce10x_filtered_cells.rds"
  ))
mitotic_dt<-
  perCellQCMetrics(
      sce10x,
      subsets = list(mitotic = which(rowData(sce10x)$gene_name %in% c("Ccna2", "Ccnb1", "Ccnb2"))),
      percent_top =NULL,
      exprs_values = c("counts"),
      flatten = TRUE
    ) %>%
    as_tibble() %>%
    dplyr::select(subsets_mitotic_sum, subsets_mitotic_detected, subsets_mitotic_percent)
mitotic_dt
colData(sce10x) <-
  cbind(colData(sce10x), mitotic_dt)

Cluster cell types

Cluster young cells

sce10x_yng <- sce10x[, colData(sce10x)$condition == "yng"]
n_exprs_genes_yng <-
  nexprs(sce10x_yng,
    detection_limit = 5,
    byrow = TRUE
  )

table(n_exprs_genes_yng)[1:20]
n_exprs_genes_yng
    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17 
24696  1436   778   467   333   269   236   200   175   125   150   129   111   111   112    84    71    88 
   18    19 
   68    74 
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_yng_tumap <-
  tumap(t(assay(sce10x_yng[n_exprs_genes_yng >= 3, ], "counts") %>%
    as.matrix()),
  metric = "cosine",
  ret_nn = TRUE
  )
set.seed(42)
sce10x_umap_yng <-
  umap(t(assay(
    sce10x_yng[n_exprs_genes_yng >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  nn_method = sce10x_yng_tumap$nn,
  a = .6,
  b = .55
  )
colnames(sce10x_umap_yng) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_yng, "umap") <- sce10x_umap_yng
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = markers
  ),
  size = 3,
  alpha = 1
  )
p2

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
set.seed(42)
g <- buildSNNGraph(sce10x_yng, k = 20, use.dimred = "umap")
clusters_out <- igraph::cluster_walktrap(g)
# colLabels(sce10x) <- factor(igraph::cluster_louvain(g)$membership)
table(clusters_out$membership)

  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27 
147  72 354 370 147 434  88  90  93  87 272 236 250 176 214 404 146 295 493 110 238 151 172  98  69  96 233 
 28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
162  78  74  81  63 133 144  62  67 102  61  55  83  64  42  41  38  24  26  29  46  23  35  45  21  38  31 
 55 
 32 
igraph_clusters <- igraph::cut_at(clusters_out, n = 16)
table(igraph_clusters)
igraph_clusters
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
 404  109 1535  173 1056 1848  890  122  483  291   88   87   41   45   31   32 
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_yng)$markers)
cluster_by_marker
               
igraph_clusters  fap macrophage none stem
             1     0        402    0    2
             2     0        109    0    0
             3  1457          3   64   11
             4     6         17  126   24
             5     0          3   58  995
             6     5          8  146 1689
             7     7          3   68  812
             8    15          0   99    8
             9     0        480    2    1
             10    1          0   18  272
             11    1          1   25   61
             12    0          1    8   78
             13    4          5   22   10
             14    2          3   39    1
             15    2          2   27    0
             16    0          1   24    7
labels <- apply(cluster_by_marker, 1, which.max)
level_key_1 <- colnames(cluster_by_marker)[labels]
names(level_key_1) <- names(labels)
level_key_1
           1            2            3            4            5            6            7            8 
"macrophage" "macrophage"        "fap"       "none"       "stem"       "stem"       "stem"       "none" 
           9           10           11           12           13           14           15           16 
"macrophage"       "stem"       "stem"       "stem"       "none"       "none"       "none"       "none" 
table(level_key_1)
level_key_1
       fap macrophage       none       stem 
         1          3          6          6 
level_key_2 <- paste(level_key_1, names(level_key_1), sep = "_")
names(level_key_2) <- names(labels)
colData(sce10x_yng)$clusters_level1 <- recode(igraph_clusters, !!!level_key_1)
colData(sce10x_yng)$clusters_level2 <- recode(igraph_clusters, !!!level_key_2)
table(colData(sce10x_yng)$clusters_level1, colData(sce10x_yng)$sample)
            
             yng1 yng2 yng3
  fap         502  393  640
  macrophage  289  289  418
  none        185  158  101
  stem       1261 1936 1063
table(colData(sce10x_yng)$clusters_level2, colData(sce10x_yng)$sample)
              
               yng1 yng2 yng3
  fap_3         502  393  640
  macrophage_1  107  146  151
  macrophage_2    9   31   69
  macrophage_9  173  112  198
  none_13        15    8   18
  none_14        14   22    9
  none_15        23    3    5
  none_16         0    0   32
  none_4         96   74    3
  none_8         37   51   34
  stem_10        16  274    1
  stem_11         0    0   88
  stem_12        60   17   10
  stem_5        111  865   80
  stem_6       1066  766   16
  stem_7          8   14  868
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = subsets_mitotic_detected >1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = 3,
    alpha = 1
  )
p3 + facet_wrap(~clusters_level2)

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )
colData(sce10x_yng)$clusters_level1[colData(sce10x_yng)$clusters_level2 =="stem_12"] <- "stem_mitotic"
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = clusters_level1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = 3,
    alpha = 1
  )
p3 

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )

Cluster aged cells

sce10x_aged <- sce10x[, colData(sce10x)$condition == "aged"]
n_exprs_genes_aged <-
  nexprs(sce10x_aged,
    detection_limit = 5,
    byrow = TRUE
  )

table(n_exprs_genes_aged)[1:20]
n_exprs_genes_aged
    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17 
25860  1574   714   507   365   253   220   195   175   129   137   108   122    90    80    71    71    83 
   18    19 
   66    71 
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_aged_tumap <-
  tumap(t(assay(
    sce10x_aged[n_exprs_genes_aged >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  ret_nn = TRUE
  )
set.seed(42)
sce10x_umap_aged <-
  umap(t(assay(
    sce10x_aged[n_exprs_genes_aged >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  nn_method = sce10x_aged_tumap$nn,
  a = .6,
  b = .6
  )
colnames(sce10x_umap_aged) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_aged, "umap") <- sce10x_umap_aged
dt <-
  bind_cols(
    colData(sce10x_aged) %>%
      as_tibble(),
    sce10x_umap_aged %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = markers
  ),
  size = 1,
  alpha = 1
  )
p2

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
set.seed(42)
g <- buildSNNGraph(sce10x_aged, k = 20, use.dimred = "umap")
clusters_out <- igraph::cluster_walktrap(g)
# colLabels(sce10x) <- factor(igraph::cluster_louvain(g)$membership)
table(clusters_out$membership)

  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27 
788 319 135 715  84 321 239 407 740  58 353 157 191 474 837 286 122 105 325 506 422 200 236 123 279 116 160 
 28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
 69 203 187  92  49  54  97  57 106  93  64 180  59  48  67  33  55  68  34  65  41  28  40  56  45  44  37 
 55 
 24 
igraph_clusters <- igraph::cut_at(clusters_out, n = 9)
table(igraph_clusters)
igraph_clusters
   1    2    3    4    5    6    7    8    9 
6639  131  343  674 2335  157   82  295   37 
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_aged)$markers)
cluster_by_marker
               
igraph_clusters  fap macrophage none stem
              1 5865         37  710   27
              2    0        127    3    1
              3    6          3  332    2
              4    6        664    3    1
              5   14         20  694 1607
              6   10          6  136    5
              7    3          2   74    3
              8   63          5  224    3
              9   32          0    5    0
labels <- apply(cluster_by_marker, 1, which.max)
level_key_1 <- colnames(cluster_by_marker)[labels]
names(level_key_1) <- names(labels)
level_key_1
           1            2            3            4            5            6            7            8 
       "fap" "macrophage"       "none" "macrophage"       "stem"       "none"       "none"       "none" 
           9 
       "fap" 
table(level_key_1)
level_key_1
       fap macrophage       none       stem 
         2          2          4          1 
level_key_2 <- paste(level_key_1, names(level_key_1), sep = "_")
names(level_key_2) <- names(labels)
level_key_2
             1              2              3              4              5              6              7 
       "fap_1" "macrophage_2"       "none_3" "macrophage_4"       "stem_5"       "none_6"       "none_7" 
             8              9 
      "none_8"        "fap_9" 
colData(sce10x_aged)$clusters_level1 <- recode(igraph_clusters, !!!level_key_1)
colData(sce10x_aged)$clusters_level2 <- recode(igraph_clusters, !!!level_key_2)
table(colData(sce10x_aged)$clusters_level1, colData(sce10x_aged)$sample)
            
             aged1 aged2 aged3 aged4
  fap         2027  2022  1583  1044
  macrophage    65   398   149   193
  none         168   345   143   221
  stem         545   692   337   761
table(colData(sce10x_aged)$clusters_level2, colData(sce10x_aged)$sample)
              
               aged1 aged2 aged3 aged4
  fap_1         1993  2021  1582  1043
  fap_9           34     1     1     1
  macrophage_2     0   128     0     3
  macrophage_4    65   270   149   190
  none_3          70    99   100    74
  none_6          36    85    10    26
  none_7          11    27     8    36
  none_8          51   134    25    85
  stem_5         545   692   337   761
dt <-
  bind_cols(
    colData(sce10x_aged) %>%
      as_tibble(),
    sce10x_umap_aged %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = clusters_level1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = .7,
    alpha = 1
  )
p3 #+facet_wrap(~clusters_level2)

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )

Save data

meta_dt <-
  rbind(colData(sce10x_aged), colData(sce10x_yng))
colData(sce10x)$cluster_level2 <- paste(meta_dt$condition, meta_dt$clusters_level1, sep="_")
colData(sce10x) <- meta_dt
saveRDS(sce10x,
        file.path(
  data_dir,
  "preprocessed",
  "sce10x_filtered_cells.rds"))

Normalize and correct for batch effects

Discard non-marked and mitotic cells

sce10x <- sce10x[, colData(sce10x)$clusters_level1 %in% c("macrophage", "fap", "stem") ]
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 0,
    byrow = TRUE
  )
table(n_exprs_genes)[1:20]
n_exprs_genes
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
 981 2716 1434 1055  759  591  471  444  357  357  336  333  285  252  226  239  231  190  209  167 
sce10x <- sce10x[n_exprs_genes >0,]
sce10x
class: SingleCellExperiment 
dim: 33578 16520 
metadata(42): tximetaInfo quantInfo ... txomeInfo txdbInfo
assays(3): counts spliced unspliced
rownames(33578): Gm1992 Gm6123 ... mt-Cytb mt-Nd6
rowData names(9): ensembl_gene_id_version external_gene_name ... nexprs_l1_s keep
colnames(16520): GACTATGTCCGGCTTT_d1 ATGGTTGGTTGTAAAG_d1 ... GTCCCTCTCCCTCTAG_g3
  AAACGCTGTCGTTTAC_g3
colData names(22): sum detected ... clusters_level1 clusters_level2
reducedDimNames(0):
altExpNames(0):
colData(sce10x)$clusters_level2 <- 
  paste(colData(sce10x)$condition, 
        colData(sce10x)$clusters_level1, sep="_")

Normalize data across samples

sce10x <-
    multiBatchNorm(sce10x,
      batch = colData(sce10x)$sample
    )
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 5,
    byrow = TRUE
  )
table(n_exprs_genes)[1:20]
n_exprs_genes
    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17 
23438  1596   722   483   357   280   220   191   166   164   128   134    95    88   121    68    73    80 
   18    19 
   67    71 

Run fast mutual nearest neighbors correction

sce10x_corrected <-
  fastMNN(sce10x,
    batch = colData(sce10x)$sample,
    k = 20,
    d = 200,
    subset.row = names(n_exprs_genes)[n_exprs_genes >= 3]
  )
reducedDim(sce10x, "mnn_corrected") <- reducedDim(sce10x_corrected, "corrected")

Run UMAP

set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_tumap <-
  tumap(reducedDim(sce10x, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
set.seed(42)
sce10x_umap <-
  umap(reducedDim(sce10x,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_tumap$nn,
    a = .8,
    b = .55
  )
colnames(sce10x_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x, "mnn_umap") <- sce10x_umap
dt <-
  bind_cols(
    colData(sce10x) %>%
      as_tibble(),
    sce10x_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level1
  ),
  size = .6,
  alpha = 1
  ) #+ facet_wrap(~condition)
p2 

ggsave(file.path(figures_dir, "mnn_umap.pdf"), p2 )
Saving 14 x 8 in image

Cluster stem cells

set.seed(42)
g <- buildSNNGraph(sce10x, k = 100, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_stem_tumap <-
  tumap(reducedDim(sce10x_stem, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
set.seed(42)
sce10x_stem_umap <-
  umap(reducedDim(sce10x_stem,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_stem_tumap$nn,
    a = .85,
    b = .43
  )
colnames(sce10x_stem_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_stem, "mnn_umap") <- sce10x_stem_umap
dt <-
  bind_cols(
    colData(sce10x_stem) %>%
      as_tibble(),
    sce10x_stem_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
set.seed(42)
g <- buildSNNGraph(sce10x_stem, k = 100, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
igraph_clusters
  1   2   3   4   5   6   7   8   9  10  11  12  13  14 
433 711 773 430 912 852 464 386 234 433 188 157 290 245 
igraph_clusters <- igraph::cut_at(clusters_out, n = 5)
table(igraph_clusters)
igraph_clusters
   1    2    3    4    5 
1931  621 2043 1061  852 
discard_cell <- sce10x_stem_umap[,1] < -10 | sce10x_stem_umap[,1] > 22  | (igraph_clusters ==2 & colData(sce10x_stem)$condition=="aged")
table(discard_cell)
discard_cell
FALSE  TRUE 
 6429    79 
igraph_clusters[discard_cell] <- 6
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_stem)$sample)
cluster_by_marker
               
igraph_clusters aged1 aged2 aged3 aged4 yng1 yng2 yng3
              1    20    86    52    85  495  902  291
              2     0     0     0     0  250  220  126
              3    98   178    79   196  356  606  530
              4   135   214   102   255   68  141   92
              5   268   204    93   203   30   44   10
              6    24    10    11    22    2    6    4
colData(sce10x_stem)$clusters <- factor(igraph_clusters)
sce10x_stem_subset <-
  sce10x_stem[, colData(sce10x_stem)$clusters!=6]
dt <-
  bind_cols(
    colData(sce10x_stem_subset) %>%
      as_tibble(),
    reducedDim(sce10x_stem_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)

ggsave(file.path(figures_dir, "mnn_umap_stem.pdf"), p2 )
Saving 14 x 10 in image

Cluster fap cells

sce10x_fap <- sce10x[,colData(sce10x)$clusters_level1 =="fap"]
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_fap_tumap <-
  tumap(reducedDim(sce10x_fap, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
set.seed(42)
sce10x_fap_umap <-
  umap(reducedDim(sce10x_fap,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_fap_tumap$nn,
    a = .9,
    b = .78
  )
colnames(sce10x_fap_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_fap, "mnn_umap") <- sce10x_fap_umap
dt <-
  bind_cols(
    colData(sce10x_fap) %>%
      as_tibble(),
    sce10x_fap_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
set.seed(42)
g <- buildSNNGraph(sce10x_fap, k = 100, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
igraph_clusters
   1    2    3    4    5    6    7    8    9   10 
1217  889 1246  664 1710 1383  413  304  239  146 
igraph_clusters <- igraph::cut_at(clusters_out, n = 3)
table(igraph_clusters)
igraph_clusters
   1    2    3 
6102 1963  146 
discard_cell <-  sce10x_fap_umap[,1] > 5
table(discard_cell)
discard_cell
FALSE  TRUE 
 8149    62 
igraph_clusters[discard_cell] <- 4
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_fap)$sample)
cluster_by_marker
               
igraph_clusters aged1 aged2 aged3 aged4 yng1 yng2 yng3
              1  1608  1562  1120   824  289  273  364
              2   369   419   434   189  196  107  249
              3    43    28    14    14   13   10   24
              4     7    13    15    17    4    3    3
colData(sce10x_fap)$clusters <- factor(igraph_clusters)
sce10x_fap_subset <-
  sce10x_fap[, colData(sce10x_fap)$clusters!=4]
dt <-
  bind_cols(
    colData(sce10x_fap_subset) %>%
      as_tibble(),
    reducedDim(sce10x_fap_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)

ggsave(file.path(figures_dir, "mnn_umap_fap.pdf"), p2 )
Saving 14 x 10 in image

Cluster macrophage cells

sce10x_macrophage <- sce10x[,colData(sce10x)$clusters_level1 =="macrophage"]
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_macrophage_tumap <-
  tumap(reducedDim(sce10x_macrophage, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
set.seed(42)
sce10x_macrophage_umap <-
  umap(reducedDim(sce10x_macrophage,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_macrophage_tumap$nn,
    a = 0.75,
    b = 0.65
  )
colnames(sce10x_macrophage_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_macrophage, "mnn_umap") <- sce10x_macrophage_umap
dt <-
  bind_cols(
    colData(sce10x_macrophage) %>%
      as_tibble(),
    sce10x_macrophage_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2

# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
set.seed(42)
g <- buildSNNGraph(sce10x_macrophage, k = 50, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
igraph_clusters
  1   2   3   4   5   6   7   8   9  10  11  12 
224 215 253 193 246 158 108  73  93  78  62  98 
igraph_clusters <- igraph::cut_at(clusters_out, n = 5)
table(igraph_clusters)
igraph_clusters
   1    2    3    4    5 
 134 1156  305  108   98 
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_macrophage)$sample)
cluster_by_marker
               
igraph_clusters aged1 aged2 aged3 aged4 yng1 yng2 yng3
              1     0    24     4     2   10   69   25
              2    37   138   137    89  265  171  319
              3    21    83     8    98    7   34   54
              4     7    56     0     3    7   15   20
              5     0    97     0     1    0    0    0
colData(sce10x_macrophage)$clusters <- factor(igraph_clusters)
sce10x_macrophage_subset <-
  sce10x_macrophage[, colData(sce10x_macrophage)$clusters %in% c(1,2, 3)]
dt <-
  bind_cols(
    colData(sce10x_macrophage_subset) %>%
      as_tibble(),
    reducedDim(sce10x_macrophage_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)

ggsave(file.path(figures_dir, "mnn_umap_macrophage.pdf"), p2 )
Saving 14 x 10 in image

Save data

meta_dt <-
  rbind(colData(sce10x_stem_subset), colData(sce10x_fap_subset), colData(sce10x_macrophage_subset)) %>%
  as_tibble(rownames = "cell") %>%
  mutate(cluster_level1_1= paste(clusters_level1, clusters, sep="_" ),
         cluster_level2_1= paste(clusters_level2, clusters, sep="_" )) %>%
  select(cell, cluster_level1_1, cluster_level2_1 )
meta_dt
colData(sce10x) <-
  left_join(colData(sce10x)%>%
  as_tibble(rownames = "cell"),
  meta_dt,
  by="cell") %>%
  replace_na(list(cluster_level2_1 = "discard", cluster_level1_1 = "discard")) %>%
  select(-clusters) %>%
  column_to_rownames("cell") %>%
  DataFrame(.)

discard_idx <- 
  colData(sce10x)$cluster_level2_1 == "discard"
table(discard_idx)
discard_idx
FALSE  TRUE 
16173   347 
sce10x <- sce10x[, !discard_idx]
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 0,
    byrow = TRUE
  )

table(n_exprs_genes)[1:20]
n_exprs_genes
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19 
 104 2706 1447 1072  738  601  471  433  372  344  344  329  283  236  233  239  218  205  198  171 
sce10x <- sce10x[n_exprs_genes>0, ]
saveRDS(
  sce10x,
  file.path(
    data_dir,
    "preprocessed",
    "sce10x_filtered_final.rds"
  )
)

make plots

dt <-
  bind_cols(
    colData(sce10x) %>%
      as_tibble(),
    reducedDim(sce10x, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = cluster_level1_1
  ),
  size = .6,
  alpha = 1
  ) + facet_wrap(~condition)
p2 

ggsave(file.path(figures_dir, "mnn_umap_split.pdf"), p2 )
Saving 14 x 8 in image
sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
 [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] BiocSingular_1.4.0          batchelor_1.4.0             dbscan_1.1-5               
 [4] uwot_0.1.8                  Matrix_1.2-18               forcats_0.5.0              
 [7] stringr_1.4.0               dplyr_0.8.5                 purrr_0.3.4                
[10] readr_1.3.1                 tidyr_1.1.0                 tibble_3.0.1               
[13] tidyverse_1.3.0             scran_1.16.0                scater_1.16.0              
[16] ggplot2_3.3.0               SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.1
[19] DelayedArray_0.14.0         matrixStats_0.56.0          Biobase_2.48.0             
[22] GenomicRanges_1.40.0        GenomeInfoDb_1.24.0         IRanges_2.22.2             
[25] S4Vectors_0.26.1            BiocGenerics_0.34.0        

loaded via a namespace (and not attached):
 [1] nlme_3.1-147              fs_1.4.1                  bitops_1.0-6              lubridate_1.7.8          
 [5] RcppAnnoy_0.0.16          httr_1.4.1                rprojroot_1.3-2           tools_4.0.0              
 [9] backports_1.1.7           utf8_1.1.4                R6_2.4.1                  irlba_2.3.3              
[13] vipor_0.4.5               DBI_1.1.0                 colorspace_1.4-1          withr_2.2.0              
[17] tidyselect_1.1.0          gridExtra_2.3             compiler_4.0.0            cli_2.0.2                
[21] rvest_0.3.5               BiocNeighbors_1.6.0       xml2_1.3.2                labeling_0.3             
[25] scales_1.1.1              digest_0.6.25             rmarkdown_2.1             XVector_0.28.0           
[29] base64enc_0.1-3           pkgconfig_2.0.3           htmltools_0.4.0           dbplyr_1.4.4             
[33] limma_3.44.1              rlang_0.4.6               readxl_1.3.1              rstudioapi_0.11          
[37] DelayedMatrixStats_1.10.0 farver_2.0.3              generics_0.0.2            jsonlite_1.6.1           
[41] BiocParallel_1.22.0       RCurl_1.98-1.2            magrittr_1.5              GenomeInfoDbData_1.2.3   
[45] fansi_0.4.1               Rcpp_1.0.4.6              ggbeeswarm_0.6.0          munsell_0.5.0            
[49] viridis_0.5.1             lifecycle_0.2.0           stringi_1.4.6             yaml_2.2.1               
[53] edgeR_3.30.0              zlibbioc_1.34.0           grid_4.0.0                blob_1.2.1               
[57] dqrng_0.2.1               crayon_1.3.4              lattice_0.20-41           haven_2.3.0              
[61] hms_0.5.3                 locfit_1.5-9.4            knitr_1.28                pillar_1.4.4             
[65] igraph_1.2.5              codetools_0.2-16          reprex_0.3.0              glue_1.4.1               
[69] evaluate_0.14             modelr_0.1.8              vctrs_0.3.0               cellranger_1.1.0         
[73] gtable_0.3.0              assertthat_0.2.1          xfun_0.14                 rsvd_1.0.3               
[77] broom_0.5.6               RSpectra_0.16-0           viridisLite_0.3.0         beeswarm_0.2.3           
[81] statmod_1.4.34            ellipsis_0.3.1           
---
title: "Mouse Muscle Stem Cell Project "
subtitle: "Part 3: cluster cells"
author: 
- name: Rick Farouni
  affiliation:
  - &cruk Génome Québec Innovation Centre, McGill University, Montreal, Canada
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---



# Prepare analysis workflow

## Set filepaths and parameters

```{r setup}
set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
  readr.show_progress = FALSE,
  digits = 2
)
```

## Load packages
```{r}
suppressPackageStartupMessages({
  library(scater)
  library(scran)
  library(tidyverse)
  library(uwot) # For umap
  library(dbscan)
  library(batchelor)
  library(BiocSingular)
})
theme_set(theme_bw())
```

## Define file paths

```{r}
data_dir <- "./data"
figures_dir <- file.path("./figures", "qc_cells")
```


## Load data

```{r}
sce10x <-
  readRDS(file.path(
    data_dir,
    "preprocessed",
    "sce10x_filtered_cells.rds"
  ))
```



```{r}
mitotic_dt<-
  perCellQCMetrics(
      sce10x,
      subsets = list(mitotic = which(rowData(sce10x)$gene_name %in% c("Ccna2", "Ccnb1", "Ccnb2"))),
      percent_top =NULL,
      exprs_values = c("counts"),
      flatten = TRUE
    ) %>%
    as_tibble() %>%
    dplyr::select(subsets_mitotic_sum, subsets_mitotic_detected, subsets_mitotic_percent)
mitotic_dt
```



```{r}
colData(sce10x) <-
  cbind(colData(sce10x), mitotic_dt)
```

# Cluster cell types

## Cluster young cells

```{r}
sce10x_yng <- sce10x[, colData(sce10x)$condition == "yng"]
```

```{r}
n_exprs_genes_yng <-
  nexprs(sce10x_yng,
    detection_limit = 5,
    byrow = TRUE
  )

table(n_exprs_genes_yng)[1:20]
```




```{r}
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_yng_tumap <-
  tumap(t(assay(sce10x_yng[n_exprs_genes_yng >= 3, ], "counts") %>%
    as.matrix()),
  metric = "cosine",
  ret_nn = TRUE
  )
```

```{r}
set.seed(42)
sce10x_umap_yng <-
  umap(t(assay(
    sce10x_yng[n_exprs_genes_yng >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  nn_method = sce10x_yng_tumap$nn,
  a = .6,
  b = .55
  )
colnames(sce10x_umap_yng) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_yng, "umap") <- sce10x_umap_yng
```

```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = markers
  ),
  size = 3,
  alpha = 1
  )
p2
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
```


```{r}
set.seed(42)
g <- buildSNNGraph(sce10x_yng, k = 20, use.dimred = "umap")
```

```{r}
clusters_out <- igraph::cluster_walktrap(g)
# colLabels(sce10x) <- factor(igraph::cluster_louvain(g)$membership)
table(clusters_out$membership)
```


```{r}
igraph_clusters <- igraph::cut_at(clusters_out, n = 16)
table(igraph_clusters)
```


```{r}
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_yng)$markers)
cluster_by_marker
```



```{r}
labels <- apply(cluster_by_marker, 1, which.max)
level_key_1 <- colnames(cluster_by_marker)[labels]
names(level_key_1) <- names(labels)
level_key_1
```
```{r}
table(level_key_1)
```

```{r}
level_key_2 <- paste(level_key_1, names(level_key_1), sep = "_")
names(level_key_2) <- names(labels)
colData(sce10x_yng)$clusters_level1 <- recode(igraph_clusters, !!!level_key_1)
colData(sce10x_yng)$clusters_level2 <- recode(igraph_clusters, !!!level_key_2)
table(colData(sce10x_yng)$clusters_level1, colData(sce10x_yng)$sample)
```

```{r}
table(colData(sce10x_yng)$clusters_level2, colData(sce10x_yng)$sample)
```


```{r fig.height=12, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = subsets_mitotic_detected >1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = 3,
    alpha = 1
  )
p3 + facet_wrap(~clusters_level2)
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )
```


```{r}
colData(sce10x_yng)$clusters_level1[colData(sce10x_yng)$clusters_level2 =="stem_12"] <- "stem_mitotic"
```


```{r fig.height=12, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_yng) %>%
      as_tibble(),
    sce10x_umap_yng %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = clusters_level1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = 3,
    alpha = 1
  )
p3 
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )
```

## Cluster aged cells

```{r}
sce10x_aged <- sce10x[, colData(sce10x)$condition == "aged"]
```

```{r}
n_exprs_genes_aged <-
  nexprs(sce10x_aged,
    detection_limit = 5,
    byrow = TRUE
  )

table(n_exprs_genes_aged)[1:20]
```

```{r}
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_aged_tumap <-
  tumap(t(assay(
    sce10x_aged[n_exprs_genes_aged >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  ret_nn = TRUE
  )
```


```{r}
set.seed(42)
sce10x_umap_aged <-
  umap(t(assay(
    sce10x_aged[n_exprs_genes_aged >= 3, ],
    "counts"
  ) %>%
    as.matrix()),
  metric = "cosine",
  nn_method = sce10x_aged_tumap$nn,
  a = .6,
  b = .6
  )
colnames(sce10x_umap_aged) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_aged, "umap") <- sce10x_umap_aged
```

```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_aged) %>%
      as_tibble(),
    sce10x_umap_aged %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = markers
  ),
  size = 1,
  alpha = 1
  )
p2
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
```





```{r}
set.seed(42)
g <- buildSNNGraph(sce10x_aged, k = 20, use.dimred = "umap")
```

```{r}
clusters_out <- igraph::cluster_walktrap(g)
# colLabels(sce10x) <- factor(igraph::cluster_louvain(g)$membership)
table(clusters_out$membership)
```

```{r}
igraph_clusters <- igraph::cut_at(clusters_out, n = 9)
table(igraph_clusters)
```


```{r}
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_aged)$markers)
cluster_by_marker
```



```{r}
labels <- apply(cluster_by_marker, 1, which.max)
level_key_1 <- colnames(cluster_by_marker)[labels]
names(level_key_1) <- names(labels)
level_key_1
```
```{r}
table(level_key_1)
```

```{r}
level_key_2 <- paste(level_key_1, names(level_key_1), sep = "_")
names(level_key_2) <- names(labels)
level_key_2
```

```{r}
colData(sce10x_aged)$clusters_level1 <- recode(igraph_clusters, !!!level_key_1)
colData(sce10x_aged)$clusters_level2 <- recode(igraph_clusters, !!!level_key_2)
```
```{r}
table(colData(sce10x_aged)$clusters_level1, colData(sce10x_aged)$sample)
```
```{r}
table(colData(sce10x_aged)$clusters_level2, colData(sce10x_aged)$sample)
```




```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_aged) %>%
      as_tibble(),
    sce10x_umap_aged %>%
      as_tibble()
  )
p3 <-
  ggplot(dt, aes(umap1,
    umap2,
    colour = clusters_level1
  )) +
  geom_jitter(
    width = .1,
    height = .1,
    size = .7,
    alpha = 1
  )
p3 #+facet_wrap(~clusters_level2)
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_louvain_clusters.pdf"), p3 )
```

## Save data


```{r}
meta_dt <-
  rbind(colData(sce10x_aged), colData(sce10x_yng))
colData(sce10x) <- meta_dt
```


```{r}
saveRDS(sce10x,
        file.path(
  data_dir,
  "preprocessed",
  "sce10x_filtered_cells.rds"))
```


# Normalize and correct for batch effects

## Discard non-marked and mitotic cells


```{r}
sce10x <- sce10x[, colData(sce10x)$clusters_level1 %in% c("macrophage", "fap", "stem") ]
```

```{r}
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 0,
    byrow = TRUE
  )
table(n_exprs_genes)[1:20]
```

```{r}
sce10x <- sce10x[n_exprs_genes >0,]
sce10x
```



```{r}
colData(sce10x)$clusters_level2 <- 
  paste(colData(sce10x)$condition, 
        colData(sce10x)$clusters_level1, sep="_")

```

## Normalize data across samples

```{r}
sce10x <-
    multiBatchNorm(sce10x,
      batch = colData(sce10x)$sample
    )
```

```{r}
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 5,
    byrow = TRUE
  )
table(n_exprs_genes)[1:20]
```

## Run fast mutual nearest neighbors correction

```{r}
sce10x_corrected <-
  fastMNN(sce10x,
    batch = colData(sce10x)$sample,
    k = 20,
    d = 200,
    subset.row = names(n_exprs_genes)[n_exprs_genes >= 3]
  )
```

```{r}
reducedDim(sce10x, "mnn_corrected") <- reducedDim(sce10x_corrected, "corrected")
```


## Run UMAP



```{r}
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_tumap <-
  tumap(reducedDim(sce10x, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
```

```{r}
set.seed(42)
sce10x_umap <-
  umap(reducedDim(sce10x,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_tumap$nn,
    a = .8,
    b = .55
  )
colnames(sce10x_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x, "mnn_umap") <- sce10x_umap
```

```{r fig.height=8, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x) %>%
      as_tibble(),
    sce10x_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level1
  ),
  size = .6,
  alpha = 1
  ) #+ facet_wrap(~condition)
p2 
ggsave(file.path(figures_dir, "mnn_umap.pdf"), p2 )
```





## Cluster stem cells

```{r}
sce10x_stem <- sce10x[,colData(sce10x)$clusters_level1 =="stem"]
```



```{r}
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_stem_tumap <-
  tumap(reducedDim(sce10x_stem, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
```

```{r}
set.seed(42)
sce10x_stem_umap <-
  umap(reducedDim(sce10x_stem,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_stem_tumap$nn,
    a = .85,
    b = .43
  )
colnames(sce10x_stem_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_stem, "mnn_umap") <- sce10x_stem_umap
```

```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_stem) %>%
      as_tibble(),
    sce10x_stem_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
```



```{r}
set.seed(42)
g <- buildSNNGraph(sce10x_stem, k = 100, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
```

```{r}
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
```

```{r}
igraph_clusters <- igraph::cut_at(clusters_out, n = 5)
table(igraph_clusters)
```


```{r}
discard_cell <- sce10x_stem_umap[,1] < -10 | sce10x_stem_umap[,1] > 22  | (igraph_clusters ==2 & colData(sce10x_stem)$condition=="aged")
table(discard_cell)
```



```{r}
igraph_clusters[discard_cell] <- 6
```


```{r}
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_stem)$sample)
cluster_by_marker
```



```{r}
colData(sce10x_stem)$clusters <- factor(igraph_clusters)
```

```{r}
sce10x_stem_subset <-
  sce10x_stem[, colData(sce10x_stem)$clusters!=6]
```


```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_stem_subset) %>%
      as_tibble(),
    reducedDim(sce10x_stem_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)
ggsave(file.path(figures_dir, "mnn_umap_stem.pdf"), p2 )
```







## Cluster fap cells

```{r}
sce10x_fap <- sce10x[,colData(sce10x)$clusters_level1 =="fap"]
```



```{r}
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_fap_tumap <-
  tumap(reducedDim(sce10x_fap, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
```

```{r}
set.seed(42)
sce10x_fap_umap <-
  umap(reducedDim(sce10x_fap,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_fap_tumap$nn,
    a = .9,
    b = .75
  )
colnames(sce10x_fap_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_fap, "mnn_umap") <- sce10x_fap_umap
```

```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_fap) %>%
      as_tibble(),
    sce10x_fap_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
```



```{r}
set.seed(42)
g <- buildSNNGraph(sce10x_fap, k = 100, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
```

```{r}
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
```

```{r}
igraph_clusters <- igraph::cut_at(clusters_out, n = 3)
table(igraph_clusters)
```


```{r}
discard_cell <-  sce10x_fap_umap[,1] > 5
table(discard_cell)
```



```{r}
igraph_clusters[discard_cell] <- 4
```


```{r}
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_fap)$sample)
cluster_by_marker
```



```{r}
colData(sce10x_fap)$clusters <- factor(igraph_clusters)
```

```{r}
sce10x_fap_subset <-
  sce10x_fap[, colData(sce10x_fap)$clusters!=4]
```


```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_fap_subset) %>%
      as_tibble(),
    reducedDim(sce10x_fap_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)
ggsave(file.path(figures_dir, "mnn_umap_fap.pdf"), p2 )
```



## Cluster macrophage cells

```{r}
sce10x_macrophage <- sce10x[,colData(sce10x)$clusters_level1 =="macrophage"]
```



```{r}
set.seed(42)
# t-UMAP is equivalent to a = 1, b = 1
# get the nearest neighbor data back
sce10x_macrophage_tumap <-
  tumap(reducedDim(sce10x_macrophage, 
                   "mnn_corrected"),
        metric = "cosine",
        ret_nn = TRUE)
```

```{r}
set.seed(42)
sce10x_macrophage_umap <-
  umap(reducedDim(sce10x_macrophage,
                  "mnn_corrected"),
    metric = "cosine",
    nn_method = sce10x_macrophage_tumap$nn,
    a = 0.75,
    b = 0.65
  )
colnames(sce10x_macrophage_umap) <- paste0("umap", seq(1, 2))
reducedDim(sce10x_macrophage, "mnn_umap") <- sce10x_macrophage_umap
```

```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_macrophage) %>%
      as_tibble(),
    sce10x_macrophage_umap %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters_level2
  ),
  size = 1,
  alpha =.6
  )
p2
# ggsave(file.path(figures_dir, "umap_cosine_a160_b33_17928_cells_markers.pdf"), p2 )
```



```{r}
set.seed(42)
g <- buildSNNGraph(sce10x_macrophage, k = 50, use.dimred = "mnn_umap")
clusters_out <- igraph::cluster_walktrap(g)
```

```{r}
igraph_clusters <- factor(clusters_out$membership)
table(igraph_clusters)
```

```{r}
igraph_clusters <- igraph::cut_at(clusters_out, n = 5)
table(igraph_clusters)
```


```{r}
cluster_by_marker <-
  table(igraph_clusters, colData(sce10x_macrophage)$sample)
cluster_by_marker
```



```{r}
colData(sce10x_macrophage)$clusters <- factor(igraph_clusters)
```

```{r}
sce10x_macrophage_subset <-
  sce10x_macrophage[, colData(sce10x_macrophage)$clusters %in% c(1,2, 3)]
```


```{r fig.height=10, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x_macrophage_subset) %>%
      as_tibble(),
    reducedDim(sce10x_macrophage_subset, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = clusters
  ),
  size = 1.7,
  alpha = 1
  )
p2  # + facet_wrap(~clusters)
ggsave(file.path(figures_dir, "mnn_umap_macrophage.pdf"), p2 )
```



## Save data


```{r}
meta_dt <-
  rbind(colData(sce10x_stem_subset), colData(sce10x_fap_subset), colData(sce10x_macrophage_subset)) %>%
  as_tibble(rownames = "cell") %>%
  mutate(cluster_level1_1= paste(clusters_level1, clusters, sep="_" ),
         cluster_level2_1= paste(clusters_level2, clusters, sep="_" )) %>%
  select(cell, cluster_level1_1, cluster_level2_1 )
meta_dt
```

```{r}
colData(sce10x) <-
  left_join(colData(sce10x)%>%
  as_tibble(rownames = "cell"),
  meta_dt,
  by="cell") %>%
  replace_na(list(cluster_level2_1 = "discard", cluster_level1_1 = "discard")) %>%
  select(-clusters) %>%
  column_to_rownames("cell") %>%
  DataFrame(.)

``` 

```{r}
discard_idx <- 
  colData(sce10x)$cluster_level2_1 == "discard"
table(discard_idx)
```


```{r}
sce10x <- sce10x[, !discard_idx]
```

```{r}
n_exprs_genes <-
  nexprs(sce10x,
    detection_limit = 0,
    byrow = TRUE
  )
table(n_exprs_genes)[1:20]
```

```{r}
sce10x <- sce10x[n_exprs_genes>0, ]
```



```{r}
saveRDS(
  sce10x,
  file.path(
    data_dir,
    "preprocessed",
    "sce10x_filtered_final.rds"
  )
)
```

## make plots

```{r fig.height=8, fig.width=14}
dt <-
  bind_cols(
    colData(sce10x) %>%
      as_tibble(),
    reducedDim(sce10x, "mnn_umap") %>%
      as_tibble()
  )
p2 <-
  ggplot(dt) +
  geom_point(aes(umap1,
    umap2,
    colour = cluster_level1_1
  ),
  size = .6,
  alpha = 1
  ) + facet_wrap(~condition)
p2 
ggsave(file.path(figures_dir, "mnn_umap_split.pdf"), p2 )
```

```{r}
sessionInfo()
```
