Prepare analysis workflow

Set filepaths and parameters

set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
  readr.show_progress = FALSE,
  digits = 2
)

Load packages

suppressPackageStartupMessages({
  library(scater)
  library(BiocSingular)
  library(scran)
  library(tidyverse)
})

Define file paths

data_dir <- "./data"
figures_dir <- file.path("./figures", "qc_cells")

Load data

sce10x <- readRDS(file.path(data_dir, "preprocessed", "sce10x.rds"))

Filter cells

Plot data

plotColData(sce10x,
  x = "sum",
  by_exprs_values = "counts",
  y = "detected",
  colour_by = "high_mito",
  point_size = .7
) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~ colData(sce10x)$sample, ncol = 2) +
  geom_vline(aes(xintercept = 100), color = "green", linetype = 2) +
  geom_hline(aes(yintercept = 100), color = "green", linetype = 2)

Discard cells with high mitochondrial percentage

sum_thresh <- colData(sce10x)$sum < 100
detected_thresh <- colData(sce10x)$detected < 100
mito_high <- colData(sce10x)$subsets_mito_percent > 15

discard <-
  (sum_thresh |
    detected_thresh |
    mito_high)
sum(discard)
[1] 5400
table(colData(sce10x)$sample, discard)
       discard
        FALSE TRUE
  aged1  3320  728
  aged2  3751  590
  aged3  2584  819
  aged4  2617  766
  yng1   2544  846
  yng2   3194  793
  yng3   2542  858
sce10x <- sce10x[, !discard]

Add doublet score label

add_doublet_score <- function(sample_name, sce10x) {
  sce10x_subset <- subset(sce10x, colData(sce10x)$sample == sample_name)

  set.seed(42)
  dbl_dens <- doubletCells(sce10x_subset)

  dt <- tibble(
    barcode = colnames(sce10x_subset),
    doublet_score = log10(dbl_dens + 1)
  )

  return(dt)
}
doublet_dt <- map_dfr(unique(sce10x$sample),
  add_doublet_score,
  sce10x = sce10x
)

sce10x$doublet_score <- deframe(doublet_dt)[colnames(sce10x)]
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_violin(aes(sample, doublet_score))

Discard cells with low counts, low detected number of genes, and outlier unspliced proportions

ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum_spliced,
    sum_unspliced,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_vline(xintercept = c(1000), color = "green", linetype = 2) +
  geom_hline(yintercept = c(350), color = "green", linetype = 2) +
  geom_vline(xintercept = c(2500), color = "red", linetype = 2) +
  geom_hline(yintercept = c(800), color = "red", linetype = 2) +
  facet_wrap(sample ~ ., ncol = 4)

ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_histogram(aes(unspliced_ov_total), bins = 150) +
  facet_wrap(sample ~ ., ncol = 2) +
  geom_vline(xintercept = c(.15, .5), color = "green", linetype = 2) +
  geom_vline(xintercept = c(.1, 0.4), color = "red", linetype = 2)

ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum,
    detected,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(sample ~ ., ncol = 4) +
  geom_hline(aes(yintercept = c(1000)), color = "green", linetype = 2) +
  geom_hline(aes(yintercept = c(1800)), color = "red", linetype = 2) +
  geom_hline(aes(yintercept = c(7000)), color = "grey", linetype = 2)

sum_unspliced_low_yng <- colData(sce10x)$sum_unspliced < 800 &
  colData(sce10x)$condition == "yng"
sum_spliced_low_yng <- colData(sce10x)$sum_spliced < 2500 &
  colData(sce10x)$condition == "yng"
sum_spliced_low_aged <- colData(sce10x)$sum_spliced < 1000 &
  colData(sce10x)$condition == "aged"
sum_unspliced_low_aged <- colData(sce10x)$sum_unspliced < 350 &
  colData(sce10x)$condition == "aged"

unspliced_ratio_aged <-
  (colData(sce10x)$unspliced_ov_total > .5 |
    colData(sce10x)$unspliced_ov_total < 0.15) &
    colData(sce10x)$condition == "aged"
unspliced_ratio_yng <-
  (colData(sce10x)$unspliced_ov_total > .4 |
    colData(sce10x)$unspliced_ov_total < 0.1) &
    colData(sce10x)$condition == "yng"
detected_low_yng <- colData(sce10x)$detected < 1800 &
  colData(sce10x)$condition == "yng"
detected_low_aged <- colData(sce10x)$detected < 1000 &
  colData(sce10x)$condition == "aged"
sum_unspliced_outlier <- colData(sce10x)$sum_unspliced > 7000

discard <-
  (sum_spliced_low_aged |
    sum_spliced_low_yng |
    sum_unspliced_low_aged |
    sum_unspliced_low_yng |
    unspliced_ratio_aged |
    unspliced_ratio_yng |
    detected_low_aged |
    detected_low_yng |
    sum_unspliced_outlier)
sum(discard)
[1] 2601
table(colData(sce10x)$sample, discard)
       discard
        FALSE TRUE
  aged1  2805  515
  aged2  3457  294
  aged3  2220  364
  aged4  2228  389
  yng1   2237  307
  yng2   2777  417
  yng3   2227  315
sce10x <- sce10x[, !discard]

Plot

plotScater(sce10x,
  nfeatures = 500,
  block1 = "sample",
  ncol = 3,
  exprs_values = "counts",
  colour_by = "subsets_mito_percent",
  line_width = 0.3
) +
  geom_hline(aes(yintercept = .75)) +
  geom_hline(aes(yintercept = .65))

outliers_1 <- colData(sce10x)$percent_top_500 > 75 & colData(sce10x)$sample %in% c("aged3", "aged4", "yng2")
outliers_2 <- colData(sce10x)$percent_top_500 > 70 & colData(sce10x)$sample %in% c("yng3")
outliers <- outliers_1 | outliers_2
sum(outliers)
[1] 23
sce10x <- sce10x[, !outliers]
table(colData(sce10x)$sample)

aged1 aged2 aged3 aged4  yng1  yng2  yng3 
 2805  3457  2212  2219  2237  2776  2222 
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum_spliced,
    sum_unspliced,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  # geom_vline(xintercept = c(20000, 30000), color = "green", linetype = 2) +
  geom_hline(yintercept = c(8000), color = "green", linetype = 2) +
  facet_wrap(sample ~ ., ncol = 4)

Label genes detected in few cells

n_exprs_genes_s <- nexprs(sce10x,
  detection_limit = 1,
  byrow = TRUE,
  exprs_values = "spliced"
)
head(table(n_exprs_genes_s), 100)
n_exprs_genes_s
    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19    20    21    22    23    24    25    26    27    28    29 
16543  1699   795   519   367   360   237   213   200   164   163   142   132   136   117    95   106    94    86    91    80    68    85    78    66    57    55    51    70    79 
   30    31    32    33    34    35    36    37    38    39    40    41    42    43    44    45    46    47    48    49    50    51    52    53    54    55    56    57    58    59 
   80    54    63    50    56    59    51    53    52    63    46    36    36    37    43    43    54    42    39    35    44    33    40    47    36    33    25    32    47    39 
   60    61    62    63    64    65    66    67    68    69    70    71    72    73    74    75    76    77    78    79    80    81    82    83    84    85    86    87    88    89 
   39    33    43    37    34    31    37    33    34    37    38    27    27    35    23    31    23    28    33    36    28    29    30    30    35    22    29    32    30    24 
   90    91    92    93    94    95    96    97    98    99 
   27    20    25    17    21    27    34    19    28    30 
n_exprs_genes_u <- nexprs(sce10x,
  detection_limit = 1,
  byrow = TRUE,
  exprs_values = "unspliced"
)
head(table(n_exprs_genes_u), 25)
n_exprs_genes_u
    0     1     2     3     4     5     6     7     8     9    10    11    12    13    14    15    16    17    18    19    20    21    22    23    24 
17197  1979  1038   663   558   443   385   285   309   232   260   214   173   161   175   140   148   115   127   129   123   117    92   117    93 
n_exprs_genes <-
  bind_cols(enframe(n_exprs_genes_s, "gene_name", "s"),
    u = n_exprs_genes_u
  ) %>%
  mutate(keep = !(s <= 25 & u <= 25))
n_exprs_genes
rowData(sce10x)$nexprs_l1_u <- n_exprs_genes_u
rowData(sce10x)$nexprs_l1_s <- n_exprs_genes_s
rowData(sce10x)$keep <- !(n_exprs_genes_s <= 25 & n_exprs_genes_u <= 25)
sum(rowData(sce10x)$keep)
[1] 13645

Save the single cell experiment objects

saveRDS(sce10x, file.path(data_dir, "preprocessed", "sce10x_filtered_cells.rds"))
sessionInfo()
R version 4.0.0 (2020-04-24)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_0.5.0               stringr_1.4.0               dplyr_0.8.5                 purrr_0.3.4                 readr_1.3.1                 tidyr_1.0.2                
 [7] tibble_3.0.1                tidyverse_1.3.0             scran_1.16.0                BiocSingular_1.4.0          scater_1.16.0               ggplot2_3.3.0              
[13] SingleCellExperiment_1.10.1 SummarizedExperiment_1.18.1 DelayedArray_0.14.0         matrixStats_0.56.0          Biobase_2.48.0              GenomicRanges_1.40.0       
[19] GenomeInfoDb_1.24.0         IRanges_2.22.1              S4Vectors_0.26.0            BiocGenerics_0.34.0        

loaded via a namespace (and not attached):
 [1] nlme_3.1-147              fs_1.4.1                  bitops_1.0-6              lubridate_1.7.8           httr_1.4.1                rprojroot_1.3-2          
 [7] R.cache_0.14.0            tools_4.0.0               backports_1.1.6           R6_2.4.1                  irlba_2.3.3               vipor_0.4.5              
[13] DBI_1.1.0                 colorspace_1.4-1          withr_2.2.0               tidyselect_1.0.0          gridExtra_2.3             compiler_4.0.0           
[19] cli_2.0.2                 rvest_0.3.5               BiocNeighbors_1.6.0       xml2_1.3.2                labeling_0.3              scales_1.1.0             
[25] digest_0.6.25             rmarkdown_2.1             R.utils_2.9.2             XVector_0.28.0            base64enc_0.1-3           pkgconfig_2.0.3          
[31] htmltools_0.4.0           styler_1.3.2              dbplyr_1.4.3              limma_3.44.1              readxl_1.3.1              rlang_0.4.6              
[37] rstudioapi_0.11           DelayedMatrixStats_1.10.0 farver_2.0.3              generics_0.0.2            jsonlite_1.6.1            BiocParallel_1.22.0      
[43] R.oo_1.23.0               RCurl_1.98-1.2            magrittr_1.5              GenomeInfoDbData_1.2.3    Matrix_1.2-18             fansi_0.4.1              
[49] Rcpp_1.0.4.6              ggbeeswarm_0.6.0          munsell_0.5.0             viridis_0.5.1             lifecycle_0.2.0           R.methodsS3_1.8.0        
[55] stringi_1.4.6             yaml_2.2.1                edgeR_3.30.0              zlibbioc_1.34.0           grid_4.0.0                dqrng_0.2.1              
[61] crayon_1.3.4              lattice_0.20-41           cowplot_1.0.0             haven_2.2.0               hms_0.5.3                 locfit_1.5-9.4           
[67] knitr_1.28                pillar_1.4.3              igraph_1.2.5              reprex_0.3.0              glue_1.4.0                evaluate_0.14            
[73] modelr_0.1.7              vctrs_0.2.4               cellranger_1.1.0          gtable_0.3.0              assertthat_0.2.1          xfun_0.13                
[79] rsvd_1.0.3                broom_0.5.6               viridisLite_0.3.0         beeswarm_0.2.3            statmod_1.4.34            ellipsis_0.3.0           
---
title: "Mouse Muscle Stem Cell Project "
subtitle: "Part 2: filter cells"
author: 
- name: Rick Farouni
  affiliation:
  - &cruk Génome Québec Innovation Centre, McGill University, Montreal, Canada
date: '`r format(Sys.Date(), "%Y-%B-%d")`'
output:
  html_notebook:
    df_print: paged
    code_folding: show
    toc: no
    toc_float: 
      collapsed: false
      smooth_scroll: false
---


# Prepare analysis workflow

## Set filepaths and parameters

```{r setup}
set.seed(42)
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
options(
  readr.show_progress = FALSE,
  digits = 2
)
```

## Load packages
```{r}
suppressPackageStartupMessages({
  library(scater)
  library(BiocSingular)
  library(scran)
  library(tidyverse)
})
```

## Define file paths

```{r}
data_dir <- "./data"
figures_dir <- file.path("./figures", "qc_cells")
```


## Load data

```{r}
sce10x <- readRDS(file.path(data_dir, "preprocessed", "sce10x.rds"))
```

# Filter cells

## Plot data

```{r fig.height=18, fig.width=12}
plotColData(sce10x,
  x = "sum",
  by_exprs_values = "counts",
  y = "detected",
  colour_by = "high_mito",
  point_size = .7
) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~ colData(sce10x)$sample, ncol = 2) +
  geom_vline(aes(xintercept = 100), color = "green", linetype = 2) +
  geom_hline(aes(yintercept = 100), color = "green", linetype = 2)
```

## Discard cells with high mitochondrial percentage

```{r}
sum_thresh <- colData(sce10x)$sum < 100
detected_thresh <- colData(sce10x)$detected < 100
mito_high <- colData(sce10x)$subsets_mito_percent > 15

discard <-
  (sum_thresh |
    detected_thresh |
    mito_high)
sum(discard)
```

```{r}
table(colData(sce10x)$sample, discard)
```

```{r}
sce10x <- sce10x[, !discard]
```



## Add doublet score label

```{r}
add_doublet_score <- function(sample_name, sce10x) {
  sce10x_subset <- subset(sce10x, colData(sce10x)$sample == sample_name)

  set.seed(42)
  dbl_dens <- doubletCells(sce10x_subset)

  dt <- tibble(
    barcode = colnames(sce10x_subset),
    doublet_score = log10(dbl_dens + 1)
  )

  return(dt)
}
```

```{r}
doublet_dt <- map_dfr(unique(sce10x$sample),
  add_doublet_score,
  sce10x = sce10x
)

sce10x$doublet_score <- deframe(doublet_dt)[colnames(sce10x)]
```

```{r}
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_violin(aes(sample, doublet_score))
```

## Discard cells with low counts, low detected number of genes, and outlier unspliced proportions 

```{r fig.height=10, fig.width=14, message=TRUE, warning=TRUE}
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum_spliced,
    sum_unspliced,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_vline(xintercept = c(1000), color = "green", linetype = 2) +
  geom_hline(yintercept = c(350), color = "green", linetype = 2) +
  geom_vline(xintercept = c(2500), color = "red", linetype = 2) +
  geom_hline(yintercept = c(800), color = "red", linetype = 2) +
  facet_wrap(sample ~ ., ncol = 4)
```


```{r fig.height=10, fig.width=12}
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_histogram(aes(unspliced_ov_total), bins = 150) +
  facet_wrap(sample ~ ., ncol = 2) +
  geom_vline(xintercept = c(.15, .5), color = "green", linetype = 2) +
  geom_vline(xintercept = c(.1, 0.4), color = "red", linetype = 2)
```


```{r fig.height=10, fig.width=14}
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum,
    detected,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(sample ~ ., ncol = 4) +
  geom_hline(aes(yintercept = c(1000)), color = "green", linetype = 2) +
  geom_hline(aes(yintercept = c(1800)), color = "red", linetype = 2) +
  geom_hline(aes(yintercept = c(7000)), color = "grey", linetype = 2)
```



```{r}
sum_unspliced_low_yng <- colData(sce10x)$sum_unspliced < 800 &
  colData(sce10x)$condition == "yng"
sum_spliced_low_yng <- colData(sce10x)$sum_spliced < 2500 &
  colData(sce10x)$condition == "yng"
sum_spliced_low_aged <- colData(sce10x)$sum_spliced < 1000 &
  colData(sce10x)$condition == "aged"
sum_unspliced_low_aged <- colData(sce10x)$sum_unspliced < 350 &
  colData(sce10x)$condition == "aged"

unspliced_ratio_aged <-
  (colData(sce10x)$unspliced_ov_total > .5 |
    colData(sce10x)$unspliced_ov_total < 0.15) &
    colData(sce10x)$condition == "aged"
unspliced_ratio_yng <-
  (colData(sce10x)$unspliced_ov_total > .4 |
    colData(sce10x)$unspliced_ov_total < 0.1) &
    colData(sce10x)$condition == "yng"
detected_low_yng <- colData(sce10x)$detected < 1800 &
  colData(sce10x)$condition == "yng"
detected_low_aged <- colData(sce10x)$detected < 1000 &
  colData(sce10x)$condition == "aged"
sum_unspliced_outlier <- colData(sce10x)$sum_unspliced > 7000

discard <-
  (sum_spliced_low_aged |
    sum_spliced_low_yng |
    sum_unspliced_low_aged |
    sum_unspliced_low_yng |
    unspliced_ratio_aged |
    unspliced_ratio_yng |
    detected_low_aged |
    detected_low_yng |
    sum_unspliced_outlier)
sum(discard)
```

```{r}
table(colData(sce10x)$sample, discard)
```


```{r}
sce10x <- sce10x[, !discard]
```


### Plot

```{r fig.height=14, fig.width=14}
plotScater(sce10x,
  nfeatures = 500,
  block1 = "sample",
  ncol = 3,
  exprs_values = "counts",
  colour_by = "subsets_mito_percent",
  line_width = 0.3
) +
  geom_hline(aes(yintercept = .75)) +
  geom_hline(aes(yintercept = .65))
```



```{r}
outliers_1 <- colData(sce10x)$percent_top_500 > 75 & colData(sce10x)$sample %in% c("aged3", "aged4", "yng2")
outliers_2 <- colData(sce10x)$percent_top_500 > 70 & colData(sce10x)$sample %in% c("yng3")
outliers <- outliers_1 | outliers_2
sum(outliers)
```


```{r}
sce10x <- sce10x[, !outliers]
```



```{r}
table(colData(sce10x)$sample)
```


```{r fig.height=10, fig.width=14}
ggplot(colData(sce10x) %>%
  as_tibble()) +
  geom_point(aes(sum_spliced,
    sum_unspliced,
    colour = doublet_score > 3
  ),
  size = 0.7,
  alpha = 0.8
  ) +
  scale_x_log10() +
  scale_y_log10() +
  # geom_vline(xintercept = c(20000, 30000), color = "green", linetype = 2) +
  geom_hline(yintercept = c(8000), color = "green", linetype = 2) +
  facet_wrap(sample ~ ., ncol = 4)
```

# Label genes detected in few cells

```{r}
n_exprs_genes_s <- nexprs(sce10x,
  detection_limit = 1,
  byrow = TRUE,
  exprs_values = "spliced"
)
head(table(n_exprs_genes_s), 100)
```

```{r}
n_exprs_genes_u <- nexprs(sce10x,
  detection_limit = 1,
  byrow = TRUE,
  exprs_values = "unspliced"
)
head(table(n_exprs_genes_u), 25)
```

```{r}
n_exprs_genes <-
  bind_cols(enframe(n_exprs_genes_s, "gene_name", "s"),
    u = n_exprs_genes_u
  ) %>%
  mutate(keep = !(s <= 25 & u <= 25))
n_exprs_genes
```


```{r}
rowData(sce10x)$nexprs_l1_u <- n_exprs_genes_u
rowData(sce10x)$nexprs_l1_s <- n_exprs_genes_s
rowData(sce10x)$keep <- !(n_exprs_genes_s <= 25 & n_exprs_genes_u <= 25)
```

```{r}
sum(rowData(sce10x)$keep)
```



## Save the single cell experiment objects

```{r}
saveRDS(sce10x, file.path(data_dir, "preprocessed", "sce10x_filtered_cells.rds"))
```

                
```{r}
sessionInfo()
```
