# install.packages("Seurat")
# install.packages("ggplot2")
library(Seurat)
library(SeuratData)
library(cowplot)
library(dplyr)
library(ggplot2)
set.seed(2023)
We use the CITE-seq dataset from (Stuart, Butler et al., Cell 2019),
which consists of 30,672 cells. The object contains two assays, RNA and
antibody-derived tags (ADT).
- RNA: 17,009 features (genes)
- ADT: 25 features (antibodies)
# Load a Seurat object which contains the RNA and ADT assays.
InstallData("bmcite")
cite <- LoadData(ds = "bmcite")
We would only use 5,000 cells with specific cell types such as “CD4”,
“CD8”, “DC”, and “CD14”. Those are what we show in the lecture.
Note: In order to demonstrate the performance of
multi-omics integration, we have the cell type information in this
dataset. However, we usually don’t have that information in the real
data.
# Only demo few cell types
target.celltype <- c("CD8 Naive", "CD4 Naive", "Naive B",
"cDC2", "pDC", "CD14 Mono", "CD56 bright NK")
target.idx <- which(cite$celltype.l2 %in% target.celltype)
cite <- cite[, target.idx]
# down-sample the larger dataset to 5000 samples
cite <- cite[, sample(1:ncol(cite), 5000)]
cite@assays
#View(cite@assays[["RNA"]]@counts[1:50,1:50] %>% as.matrix())
#View(cite@assays[["ADT"]]@counts[1:25,1:50] %>% as.matrix())
## $RNA
## Assay data with 17009 features for 5000 cells
## Top 10 variable features:
## IGKC, HBA2, HBB, HBA1, IGHA1, IGLC2, JCHAIN, HBM, IGHG1, IGHM
##
## $ADT
## Assay data with 25 features for 5000 cells
## Top 10 variable features:
## CD11a, CD11c, CD123, CD127-IL7Ra, CD14, CD16, CD161, CD19, CD197-CCR7,
## CD25
Due to the inherent levels of noise in single-cell RNA-seq, techniques such as PCA are often used to reduce the dimensionality of the dataset. We then perform pre-processing on both assays by using standard pipelines in Seurat which include normalization, feature selection, and dimensional reduction with PCA.
# (1) scRNA
DefaultAssay(cite) <- 'RNA'
cite <- NormalizeData(cite) %>%
FindVariableFeatures() %>%
ScaleData() %>%
RunPCA(reduction.name = 'pca')
# (2) ADT
# set dimensional reduction name as "apca" to avoid overwriting the PCA result of scRNASeq
DefaultAssay(cite) <- 'ADT'
VariableFeatures(cite) <- rownames(cite[["ADT"]])
cite <- NormalizeData(cite, normalization.method = 'CLR', margin = 2) %>%
ScaleData() %>%
RunPCA(reduction.name = 'apca')
First, we analyze the scRNA in the CITE-Seq data and visualize clustering result on the UMAP plot.
Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering
After preprocessing and dimension reduction, for each cell, we calculate its closest neighbors based on the first 30 PCs. Then, we create a UMAP visualization of the data, and perform clustering and visualize these results on the UMAP plot.
# UMAP visualization and clustering based only on RNA data
DefaultAssay(cite) <- "RNA"
# Step 1
cite <- FindNeighbors(cite, dims = 1:30, reduction = 'pca')
# Step 2
cite <- RunUMAP(cite, reduction = 'pca', dims = 1:30, assay = 'RNA',
reduction.name = 'rna.umap', reduction.key = 'rnaUMAP_')
# Step 3
cite <- FindClusters(cite, resolution = 2, graph.name = 'RNA_snn')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5000
## Number of edges: 243139
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6480
## Number of communities: 13
## Elapsed time: 1 seconds
cite$rna_clusters <- Idents(cite)
# Dimensional reduction plot
p1 <- DimPlot(cite, reduction = 'rna.umap',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("RNA Predicted")
p2 <- DimPlot(cite, reduction = 'rna.umap', group.by = 'celltype.l2',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("RNA Annotated")
p1+p2 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))
In CITE-seq, we not only have RNA data, but also have ADT data for the same cell. Then, we can analyze ADT data in a similar way as RNA data.
Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering
# Step 1
cite <- FindNeighbors(cite, dims = 1:18, reduction = 'apca')
# Step 2
cite <- RunUMAP(cite, reduction = 'apca', dims = 1:18, assay = 'ADT',
reduction.name = 'adt.umap', reduction.key = 'adtUMAP_')
# Step 3
cite <- FindClusters(cite, resolution = 2, graph.name = 'ADT_snn')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 5000
## Number of edges: 191091
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6584
## Number of communities: 18
## Elapsed time: 0 seconds
cite$adt_clusters <- Idents(cite)
# Dimensional reduction plot
p3 <- DimPlot(cite, reduction = 'adt.umap',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("ADT Predicted")
p4 <- DimPlot(cite, reduction = 'adt.umap', group.by = 'celltype.l2',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("ADT Annotated")
p3+p4 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))
In CITE-Seq, since we have RNA and ADT data for the same cell, we can apply weighted nearest neighbor (WNN) clustering method to integrate different omics.
For each cell, we calculate its closest neighbors in the dataset based on a weighted combination of RNA and protein similarities.
Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering
# Step 1
cite <- FindMultiModalNeighbors(
cite, reduction.list = list("pca", "apca"),
dims.list = list(1:30, 1:18),
modality.weight.name = c("RNA.weight", "ADT.weight"))
# Step 2
cite <- RunUMAP(cite, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")
# Step 3
cite <- FindClusters(cite, graph.name = "wsnn", algorithm = 3, resolution = 2, verbose = FALSE)
cite$wnn_clusters <- Idents(cite)
# Dimensional reduction plot
p5 <- DimPlot(cite, reduction = 'wnn.umap',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("WNN Predicted")
p6 <- DimPlot(cite, reduction = 'wnn.umap', group.by = 'celltype.l2',
label = TRUE, repel = TRUE, label.size = 2.5) +
ggtitle("WNN Annotated")
p5+p6 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))
p2+p4+p6 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))
VlnPlot(cite, features = c("RNA.weight", "ADT.weight"),
group.by = 'celltype.l2', sort = TRUE,
pt.size = 0, ncol = 1) + NoLegend()
Reference:
https://satijalab.org/seurat/articles/weighted_nearest_neighbor_analysis.html