Preparing the environment
Load the required packages:
library(tidyverse)
library(glue)
library(Matrix)
library(TENxPBMCData)
library(scater)
library(scran)
library(SingleR)
library(celldex)
library(bluster)
library(dittoSeq)
library(Nebulosa)
Preparing the data
# Load the pbmc4k peripheral blood mononuclear cell dataset
sce <- TENxPBMCData(dataset = "pbmc4k")
sce
## class: SingleCellExperiment
## dim: 33694 4340
## metadata(0):
## assays(1): counts
## rownames(33694): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
## ENSG00000268674
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(11): Sample Barcode ... Individual Date_published
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
# Cell metadata
head(colData(sce))
## DataFrame with 6 rows and 11 columns
## Sample Barcode Sequence Library Cell_ranger_version
## <character> <character> <character> <integer> <character>
## 1 pbmc4k AAACCTGAGAAGGCCT-1 AAACCTGAGAAGGCCT 1 v2.1
## 2 pbmc4k AAACCTGAGACAGACC-1 AAACCTGAGACAGACC 1 v2.1
## 3 pbmc4k AAACCTGAGATAGTCA-1 AAACCTGAGATAGTCA 1 v2.1
## 4 pbmc4k AAACCTGAGCGCCTCA-1 AAACCTGAGCGCCTCA 1 v2.1
## 5 pbmc4k AAACCTGAGGCATGGT-1 AAACCTGAGGCATGGT 1 v2.1
## 6 pbmc4k AAACCTGCAAGGTTCT-1 AAACCTGCAAGGTTCT 1 v2.1
## Tissue_status Barcode_type Chemistry Sequence_platform Individual
## <character> <character> <character> <character> <character>
## 1 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## 2 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## 3 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## 4 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## 5 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## 6 NA Chromium Chromium_v2 Hiseq4000 HealthyDonor1
## Date_published
## <character>
## 1 2017-11-08
## 2 2017-11-08
## 3 2017-11-08
## 4 2017-11-08
## 5 2017-11-08
## 6 2017-11-08
# Gene metadata
rowData(sce)
## DataFrame with 33694 rows and 3 columns
## ENSEMBL_ID Symbol_TENx Symbol
## <character> <character> <character>
## ENSG00000243485 ENSG00000243485 RP11-34P13.3 NA
## ENSG00000237613 ENSG00000237613 FAM138A FAM138A
## ENSG00000186092 ENSG00000186092 OR4F5 OR4F5
## ENSG00000238009 ENSG00000238009 RP11-34P13.7 LOC100996442
## ENSG00000239945 ENSG00000239945 RP11-34P13.8 NA
## ... ... ... ...
## ENSG00000277856 ENSG00000277856 AC233755.2 NA
## ENSG00000275063 ENSG00000275063 AC233755.1 LOC102723407
## ENSG00000271254 ENSG00000271254 AC240274.1 LOC102724250
## ENSG00000277475 ENSG00000277475 AC213203.1 NA
## ENSG00000268674 ENSG00000268674 FAM231B FAM231C
# Add column names to the SingleCellExperiment object
colnames(sce) <- sce$Barcode
rownames(colData(sce)) <- colnames(sce)
# Change row names to unique gene symbols
rownames(sce) <- uniquifyFeatureNames(rowData(sce)$ENSEMBL_ID, rowData(sce)$Symbol_TENx)
sce
## class: SingleCellExperiment
## dim: 33694 4340
## metadata(0):
## assays(1): counts
## rownames(33694): RP11-34P13.3 FAM138A ... AC213203.1 FAM231B
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames(4340): AAACCTGAGAAGGCCT-1 AAACCTGAGACAGACC-1 ...
## TTTGTCAGTTAAGACA-1 TTTGTCATCCCAAGAT-1
## colData names(11): Sample Barcode ... Individual Date_published
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
Basic analysis
# Set the RNG seed for reproducible results
set.seed(42)
# Data normalization
quick_clusters <- quickCluster(sce)
table(quick_clusters)
## quick_clusters
## 1 2 3 4 5 6 7 8 9 10 11
## 588 361 264 161 866 365 531 536 360 207 101
sce <- computeSumFactors(sce, clusters = quick_clusters)
sce <- logNormCounts(sce)
# Selecting highly variable genes
dec <- modelGeneVar(sce)
top_hvgs <- getTopHVGs(dec, fdr.threshold = 0.05)
length(top_hvgs)
## [1] 310
# Principal component analysis
sce <- denoisePCA(sce, dec, subset.row = top_hvgs, min.rank = 5, max.rank = 50)
ncol(reducedDim(sce, "PCA"))
## [1] 5
# Dimensionality reduction by UMAP
sce <- runUMAP(sce, dimred = "PCA")
Cell type annotation
Note: Information about all available reference datasets can be found in the celldex package documentation
# Prepare the reference dataset
ref_se <- BlueprintEncodeData()
ref_se
## class: SummarizedExperiment
## dim: 19859 259
## metadata(0):
## assays(1): logcounts
## rownames(19859): TSPAN6 TNMD ... LINC00550 GIMAP1-GIMAP5
## rowData names(0):
## colnames(259): mature.neutrophil
## CD14.positive..CD16.negative.classical.monocyte ...
## epithelial.cell.of.umbilical.artery.1
## dermis.lymphatic.vessel.endothelial.cell.1
## colData names(3): label.main label.fine label.ont
# Annotate cell types using SingleR (main labels)
singler_results <- SingleR(sce, ref_se, labels = ref_se$label.main)
head(singler_results)
## DataFrame with 6 rows and 5 columns
## scores first.labels
## <matrix> <character>
## AAACCTGAGAAGGCCT-1 0.251873:0.118890:0.287805:... Monocytes
## AAACCTGAGACAGACC-1 0.280080:0.133100:0.334590:... Monocytes
## AAACCTGAGATAGTCA-1 0.267468:0.148610:0.298052:... Monocytes
## AAACCTGAGCGCCTCA-1 0.232537:0.167537:0.374265:... CD4+ T-cells
## AAACCTGAGGCATGGT-1 0.211503:0.153752:0.345878:... CD4+ T-cells
## AAACCTGCAAGGTTCT-1 0.218346:0.155343:0.367597:... CD8+ T-cells
## tuning.scores labels pruned.labels
## <DataFrame> <character> <character>
## AAACCTGAGAAGGCCT-1 0.413717:0.00491616 Monocytes Monocytes
## AAACCTGAGACAGACC-1 0.469530:0.40291032 Monocytes Monocytes
## AAACCTGAGATAGTCA-1 0.407217:0.11303653 Monocytes Monocytes
## AAACCTGAGCGCCTCA-1 0.174840:0.13078254 CD8+ T-cells CD8+ T-cells
## AAACCTGAGGCATGGT-1 0.180486:0.06419868 CD4+ T-cells CD4+ T-cells
## AAACCTGCAAGGTTCT-1 0.307063:0.19412309 CD8+ T-cells CD8+ T-cells
# SingleR predictions summary
table(singler_results$labels)
##
## B-cells CD4+ T-cells CD8+ T-cells DC HSC Monocytes
## 616 883 1388 1 14 1201
## NK cells
## 237
table(singler_results$pruned.labels)
##
## B-cells CD4+ T-cells CD8+ T-cells DC HSC Monocytes
## 615 871 1378 1 14 1197
## NK cells
## 237
sum(is.na(singler_results$pruned.labels))
## [1] 27
# SingleR score heatmap
plotScoreHeatmap(singler_results)

# Add cell type information to the SingleCellExperiment object
sce$cell_type_blueprint_main <- singler_results$pruned.labels %>%
fct_explicit_na(na_level = "Other") %>%
fct_lump_min(min = 100, other_level = "Other") %>%
fct_relevel("Other", after = Inf) %>%
fct_drop()
# UMAP plots
dittoDimPlot(sce, "cell_type_blueprint_main", reduction.use = "UMAP", size = 0.5)

dittoDimPlot(sce, "cell_type_blueprint_main", reduction.use = "UMAP", size = 0.1,
split.by = "cell_type_blueprint_main", split.ncol = 3)

Cell clustering
# Set the RNG seed for reproducible results
set.seed(42)
# Detect cell clusters using a graph-based approach
colLabels(sce) <- clusterRows(reducedDim(sce, "PCA"),
NNGraphParam(k = 100, cluster.fun = "louvain"))
table(sce$label)
##
## 1 2 3 4 5 6
## 425 653 815 1338 365 744
# UMAP plot
dittoDimPlot(sce, "label", reduction.use = "UMAP", size = 0.5)

# Cluster vs cell type barplots
dittoBarPlot(sce, "cell_type_blueprint_main", group.by = "label",
scale = "percent")

dittoBarPlot(sce, "cell_type_blueprint_main", group.by = "label",
scale = "count")

Fine-grained cell type annotation
# Annotate cell types using SingleR (fine labels)
sce$cell_type_blueprint_fine <- SingleR(sce, ref_se, labels = ref_se$label.fine)$pruned.labels
table(sce$cell_type_blueprint_fine)
##
## CD4+ T-cells CD4+ Tcm
## 539 102
## CD4+ Tem CD8+ T-cells
## 152 512
## CD8+ Tcm CD8+ Tem
## 540 337
## Class-switched memory B-cells CLP
## 56 6
## DC GMP
## 1 6
## Megakaryocytes Memory B-cells
## 3 151
## MEP Monocytes
## 4 1173
## MPP naive B-cells
## 1 395
## NK cells Plasma cells
## 199 3
## Tregs
## 96
# Add cell type information to the SingleCellExperiment object
sce$cell_type_blueprint_fine <- sce$cell_type_blueprint_fine %>%
fct_explicit_na(na_level = "Other") %>%
fct_lump_min(min = 100, other_level = "Other") %>%
fct_relevel("Other", after = Inf) %>%
fct_drop()
# SingleR predictions summary
table(sce$cell_type_blueprint_fine)
##
## CD4+ T-cells CD4+ Tcm CD4+ Tem CD8+ T-cells CD8+ Tcm
## 539 102 152 512 540
## CD8+ Tem Memory B-cells Monocytes naive B-cells NK cells
## 337 151 1173 395 199
## Other
## 240
# UMAP plot
dittoDimPlot(sce, "cell_type_blueprint_fine", reduction.use = "UMAP", size = 0.5)

# Cluster vs cell type barplots
dittoBarPlot(sce, "cell_type_blueprint_fine", group.by = "label",
scale = "percent")

dittoBarPlot(sce[, colLabels(sce) %in% c(1, 4)],
"cell_type_blueprint_fine", group.by = "label",
scale = "percent")

Using known marker genes
Comparing monocyte clusters
# Identify marker genes differentiating the monocyte clusters
markers_monocyte <- findMarkers(sce, groups = sce$label, test.type = "wilcox",
pval.type = "any", direction = "up",
restrict = c(3, 5))
Cluster 3 markers
head(as.data.frame(markers_monocyte[["3"]]), n = 10)
S100A8 |
1 |
0 |
0 |
0.9835247 |
0.9835247 |
S100A9 |
2 |
0 |
0 |
0.9802067 |
0.9802067 |
S100A12 |
3 |
0 |
0 |
0.9502059 |
0.9502059 |
VCAN |
4 |
0 |
0 |
0.8968216 |
0.8968216 |
LYZ |
5 |
0 |
0 |
0.8560652 |
0.8560652 |
RP11-1143G9.4 |
6 |
0 |
0 |
0.8388638 |
0.8388638 |
MNDA |
7 |
0 |
0 |
0.7860324 |
0.7860324 |
CD14 |
8 |
0 |
0 |
0.7725103 |
0.7725103 |
S100A6 |
9 |
0 |
0 |
0.7752920 |
0.7752920 |
FOS |
10 |
0 |
0 |
0.7712951 |
0.7712951 |
Cluster 5 markers
head(as.data.frame(markers_monocyte[["5"]]), n = 10)
HLA-DPA1 |
1 |
0 |
0 |
0.9390974 |
0.9390974 |
HLA-DPB1 |
2 |
0 |
0 |
0.8934263 |
0.8934263 |
HLA-DQA1 |
3 |
0 |
0 |
0.8725943 |
0.8725943 |
CD74 |
4 |
0 |
0 |
0.8795563 |
0.8795563 |
RHOC |
5 |
0 |
0 |
0.7723473 |
0.7723473 |
HLA-DRB1 |
6 |
0 |
0 |
0.8274510 |
0.8274510 |
FCGR3A |
7 |
0 |
0 |
0.6870426 |
0.6870426 |
PLD4 |
8 |
0 |
0 |
0.6834121 |
0.6834121 |
HLA-DRA |
9 |
0 |
0 |
0.7844844 |
0.7844844 |
HLA-DQB1 |
10 |
0 |
0 |
0.7831818 |
0.7831818 |
CD14+ vs CD16+ monocytes scatter plot
dittoScatterPlot(sce[, colLabels(sce) %in% c(3, 5)],
x.var = "CD14", y.var = "FCGR3A", color.var = "label")

CD14+ vs CD16+ monocytes hex scatter plot
dittoScatterHex(sce[, colLabels(sce) %in% c(3, 5)],
x.var = "CD14", y.var = "FCGR3A",
color.var = "label", max.density = 10)

Comparing CD4+ T cell subpopulations
# Identify marker genes differentiating the CD4+ T cell subpopulations
sce$cell_type_cluster <- paste(sce$cell_type_blueprint_main, sce$label, sep = ".")
markers_cd4 <- findMarkers(sce, groups = sce$cell_type_cluster, test.type = "wilcox",
pval.type = "any", direction = "up",
restrict = c("CD4+ T-cells.1", "CD4+ T-cells.4"))
Cluster 1 markers
head(as.data.frame(markers_cd4[["CD4+ T-cells.1"]]), n = 10)
RP5-1171I10.5 |
1 |
0e+00 |
0.0000000 |
0.6245142 |
0.6245142 |
S100A4 |
2 |
0e+00 |
0.0000303 |
0.6365172 |
0.6365172 |
ITGB1 |
3 |
0e+00 |
0.0000701 |
0.6059043 |
0.6059043 |
LUC7L3 |
4 |
0e+00 |
0.0000929 |
0.6042227 |
0.6042227 |
ANXA2 |
5 |
0e+00 |
0.0002163 |
0.5948430 |
0.5948430 |
SH3BGRL3 |
6 |
0e+00 |
0.0002224 |
0.6247758 |
0.6247758 |
TTC14 |
7 |
1e-07 |
0.0004551 |
0.5833931 |
0.5833931 |
SLFN5 |
8 |
1e-07 |
0.0006059 |
0.5978363 |
0.5978363 |
LINC00152 |
9 |
3e-07 |
0.0009813 |
0.5602765 |
0.5602765 |
NINJ2 |
10 |
3e-07 |
0.0009813 |
0.5286996 |
0.5286996 |
Cluster 4 markers
head(as.data.frame(markers_cd4[["CD4+ T-cells.4"]]), n = 10)
JUNB |
1 |
0 |
0 |
0.9534828 |
0.9534828 |
FOS |
2 |
0 |
0 |
0.9494245 |
0.9494245 |
JUN |
3 |
0 |
0 |
0.9198057 |
0.9198057 |
DUSP1 |
4 |
0 |
0 |
0.9051420 |
0.9051420 |
CD69 |
5 |
0 |
0 |
0.7633333 |
0.7633333 |
CXCR4 |
6 |
0 |
0 |
0.7498617 |
0.7498617 |
BTG2 |
7 |
0 |
0 |
0.7331614 |
0.7331614 |
SLC2A3 |
8 |
0 |
0 |
0.7281614 |
0.7281614 |
TSC22D3 |
9 |
0 |
0 |
0.7262593 |
0.7262593 |
PPP1R15A |
10 |
0 |
0 |
0.6969581 |
0.6969581 |
Memory CD4+ T cell markers density plot
density_plots <- plot_density(sce, c("CD4", "IL7R", "S100A4"), size = 0.5,
joint = TRUE, combine = FALSE)
density_plots[[length(density_plots)]]

Activated CD4+ T cell markers density plot
density_plots <- plot_density(sce, c("CD4", "IL7R", "CD69"), size = 0.5,
joint = TRUE, combine = FALSE)
density_plots[[length(density_plots)]]

Naive CD4+ T cell markers density plot
density_plots <- plot_density(sce, c("CD4", "IL7R", "CCR7"), size = 0.5,
joint = TRUE, combine = FALSE)
density_plots[[length(density_plots)]]

Session Info
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] Nebulosa_1.2.0 patchwork_1.1.1
## [3] dittoSeq_1.4.3 bluster_1.2.1
## [5] celldex_1.2.0 SingleR_1.6.1
## [7] scran_1.20.1 scater_1.20.1
## [9] scuttle_1.2.1 TENxPBMCData_1.10.0
## [11] HDF5Array_1.20.0 rhdf5_2.36.0
## [13] DelayedArray_0.18.0 SingleCellExperiment_1.14.1
## [15] SummarizedExperiment_1.22.0 Biobase_2.52.0
## [17] GenomicRanges_1.44.0 GenomeInfoDb_1.28.4
## [19] IRanges_2.26.0 S4Vectors_0.30.2
## [21] BiocGenerics_0.38.0 MatrixGenerics_1.4.3
## [23] matrixStats_0.61.0 Matrix_1.3-4
## [25] glue_1.4.2 forcats_0.5.1
## [27] stringr_1.4.0 dplyr_1.0.7
## [29] purrr_0.3.4 readr_2.0.2
## [31] tidyr_1.1.4 tibble_3.1.5
## [33] ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] rappdirs_0.3.3 scattermore_0.7
## [3] SeuratObject_4.0.2 bit64_4.0.5
## [5] knitr_1.36 irlba_2.3.3
## [7] data.table_1.14.2 rpart_4.1-15
## [9] KEGGREST_1.32.0 RCurl_1.98-1.5
## [11] generics_0.1.0 ScaledMatrix_1.0.0
## [13] cowplot_1.1.1 RSQLite_2.2.8
## [15] RANN_2.6.1 future_1.22.1
## [17] bit_4.0.4 tzdb_0.1.2
## [19] BiocStyle_2.20.2 spatstat.data_2.1-0
## [21] xml2_1.3.2 lubridate_1.8.0
## [23] httpuv_1.6.3 assertthat_0.2.1
## [25] viridis_0.6.1 xfun_0.26
## [27] hms_1.1.1 jquerylib_0.1.4
## [29] evaluate_0.14 promises_1.2.0.1
## [31] fansi_0.5.0 dbplyr_2.1.1
## [33] readxl_1.3.1 igraph_1.2.6
## [35] DBI_1.1.1 htmlwidgets_1.5.4
## [37] spatstat.geom_2.3-0 ellipsis_0.3.2
## [39] RSpectra_0.16-0 ks_1.13.2
## [41] backports_1.2.1 deldir_1.0-5
## [43] sparseMatrixStats_1.4.2 vctrs_0.3.8
## [45] ROCR_1.0-11 abind_1.4-5
## [47] cachem_1.0.6 withr_2.4.2
## [49] sctransform_0.3.2 mclust_5.4.7
## [51] goftest_1.2-3 cluster_2.1.2
## [53] ExperimentHub_2.0.0 lazyeval_0.2.2
## [55] crayon_1.4.1 labeling_0.4.2
## [57] edgeR_3.34.1 pkgconfig_2.0.3
## [59] nlme_3.1-152 vipor_0.4.5
## [61] rlang_0.4.11 globals_0.14.0
## [63] lifecycle_1.0.1 miniUI_0.1.1.1
## [65] filelock_1.0.2 BiocFileCache_2.0.0
## [67] modelr_0.1.8 rsvd_1.0.5
## [69] AnnotationHub_3.0.1 cellranger_1.1.0
## [71] polyclip_1.10-0 lmtest_0.9-38
## [73] Rhdf5lib_1.14.2 zoo_1.8-9
## [75] reprex_2.0.1 beeswarm_0.4.0
## [77] ggridges_0.5.3 pheatmap_1.0.12
## [79] png_0.1-7 viridisLite_0.4.0
## [81] bitops_1.0-7 KernSmooth_2.23-20
## [83] rhdf5filters_1.4.0 Biostrings_2.60.2
## [85] blob_1.2.2 DelayedMatrixStats_1.14.3
## [87] parallelly_1.28.1 beachmat_2.8.1
## [89] scales_1.1.1 hexbin_1.28.2
## [91] memoise_2.0.0 magrittr_2.0.1
## [93] plyr_1.8.6 ica_1.0-2
## [95] zlibbioc_1.38.0 compiler_4.1.0
## [97] dqrng_0.3.0 RColorBrewer_1.1-2
## [99] fitdistrplus_1.1-6 cli_3.0.1
## [101] XVector_0.32.0 listenv_0.8.0
## [103] pbapply_1.5-0 ggplot.multistats_1.0.0
## [105] MASS_7.3-54 mgcv_1.8-36
## [107] tidyselect_1.1.1 stringi_1.7.5
## [109] highr_0.9 yaml_2.2.1
## [111] BiocSingular_1.8.1 locfit_1.5-9.4
## [113] ggrepel_0.9.1 grid_4.1.0
## [115] sass_0.4.0 tools_4.1.0
## [117] future.apply_1.8.1 rstudioapi_0.13
## [119] metapod_1.0.0 gridExtra_2.3
## [121] farver_2.1.0 Rtsne_0.15
## [123] digest_0.6.28 BiocManager_1.30.16
## [125] shiny_1.7.1 pracma_2.3.3
## [127] Rcpp_1.0.7 broom_0.7.9
## [129] BiocVersion_3.13.1 later_1.3.0
## [131] RcppAnnoy_0.0.19 httr_1.4.2
## [133] AnnotationDbi_1.54.1 colorspace_2.0-2
## [135] rvest_1.0.1 fs_1.5.0
## [137] tensor_1.5 reticulate_1.22
## [139] splines_4.1.0 uwot_0.1.10
## [141] statmod_1.4.36 spatstat.utils_2.2-0
## [143] plotly_4.10.0 xtable_1.8-4
## [145] jsonlite_1.7.2 R6_2.5.1
## [147] pillar_1.6.3 htmltools_0.5.2
## [149] mime_0.12 fastmap_1.1.0
## [151] BiocParallel_1.26.2 BiocNeighbors_1.10.0
## [153] interactiveDisplayBase_1.30.0 codetools_0.2-18
## [155] mvtnorm_1.1-3 utf8_1.2.2
## [157] lattice_0.20-44 bslib_0.3.1
## [159] spatstat.sparse_2.0-0 curl_4.3.2
## [161] ggbeeswarm_0.6.0 leiden_0.3.9
## [163] survival_3.2-11 limma_3.48.3
## [165] rmarkdown_2.11 munsell_0.5.0
## [167] GenomeInfoDbData_1.2.6 haven_2.4.3
## [169] reshape2_1.4.4 gtable_0.3.0
## [171] spatstat.core_2.3-0 Seurat_4.0.4