# install.packages("Seurat")
# install.packages("ggplot2")
library(Seurat)
library(SeuratData)
library(cowplot)
library(dplyr)
library(ggplot2)
set.seed(2023)

Loading data

We use the CITE-seq dataset from (Stuart, Butler et al., Cell 2019), which consists of 30,672 cells. The object contains two assays, RNA and antibody-derived tags (ADT).
- RNA: 17,009 features (genes)
- ADT: 25 features (antibodies)

# Load a Seurat object which contains the RNA and ADT assays.
InstallData("bmcite")
cite <- LoadData(ds = "bmcite")

We would only use 5,000 cells with specific cell types such as “CD4”, “CD8”, “DC”, and “CD14”. Those are what we show in the lecture.
Note: In order to demonstrate the performance of multi-omics integration, we have the cell type information in this dataset. However, we usually don’t have that information in the real data.

# Only demo few cell types
target.celltype <- c("CD8 Naive", "CD4 Naive", "Naive B",
                     "cDC2", "pDC", "CD14 Mono", "CD56 bright NK")
target.idx <- which(cite$celltype.l2 %in% target.celltype)
cite <- cite[, target.idx]
# down-sample the larger dataset to 5000 samples
cite <- cite[, sample(1:ncol(cite), 5000)]
cite@assays
#View(cite@assays[["RNA"]]@counts[1:50,1:50]  %>% as.matrix())
#View(cite@assays[["ADT"]]@counts[1:25,1:50]  %>% as.matrix())
## $RNA
## Assay data with 17009 features for 5000 cells
## Top 10 variable features:
##  IGKC, HBA2, HBB, HBA1, IGHA1, IGLC2, JCHAIN, HBM, IGHG1, IGHM 
## 
## $ADT
## Assay data with 25 features for 5000 cells
## Top 10 variable features:
##  CD11a, CD11c, CD123, CD127-IL7Ra, CD14, CD16, CD161, CD19, CD197-CCR7,
## CD25

Data pre-processing

Due to the inherent levels of noise in single-cell RNA-seq, techniques such as PCA are often used to reduce the dimensionality of the dataset. We then perform pre-processing on both assays by using standard pipelines in Seurat which include normalization, feature selection, and dimensional reduction with PCA.

# (1) scRNA
DefaultAssay(cite) <- 'RNA'
cite <- NormalizeData(cite) %>% 
  FindVariableFeatures() %>% 
  ScaleData() %>% 
  RunPCA(reduction.name = 'pca')

# (2) ADT
# set dimensional reduction name as "apca" to avoid overwriting the PCA result of scRNASeq
DefaultAssay(cite) <- 'ADT'
VariableFeatures(cite) <- rownames(cite[["ADT"]])
cite <- NormalizeData(cite, normalization.method = 'CLR', margin = 2) %>% 
  ScaleData() %>% 
  RunPCA(reduction.name = 'apca')

scRNA data

First, we analyze the scRNA in the CITE-Seq data and visualize clustering result on the UMAP plot.

Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering

Cluster scRNA data

After preprocessing and dimension reduction, for each cell, we calculate its closest neighbors based on the first 30 PCs. Then, we create a UMAP visualization of the data, and perform clustering and visualize these results on the UMAP plot.

# UMAP visualization and clustering based only on RNA data
DefaultAssay(cite) <- "RNA"
# Step 1
cite <- FindNeighbors(cite, dims = 1:30, reduction = 'pca')
# Step 2
cite <- RunUMAP(cite, reduction = 'pca', dims = 1:30, assay = 'RNA', 
                reduction.name = 'rna.umap', reduction.key = 'rnaUMAP_')
# Step 3
cite <- FindClusters(cite, resolution = 2, graph.name = 'RNA_snn')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5000
## Number of edges: 243139
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6480
## Number of communities: 13
## Elapsed time: 1 seconds
cite$rna_clusters <- Idents(cite)  

# Dimensional reduction plot
p1 <- DimPlot(cite, reduction = 'rna.umap', 
              label = TRUE, repel = TRUE, label.size = 2.5) +
      ggtitle("RNA Predicted")
p2 <- DimPlot(cite, reduction = 'rna.umap', group.by = 'celltype.l2', 
              label = TRUE, repel = TRUE, label.size = 2.5) + 
      ggtitle("RNA Annotated")
p1+p2 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))

Surface protein data (ADT)

In CITE-seq, we not only have RNA data, but also have ADT data for the same cell. Then, we can analyze ADT data in a similar way as RNA data.

Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering

Cluster surface protein data (ADT)

# Step 1
cite <- FindNeighbors(cite, dims = 1:18, reduction = 'apca')
# Step 2
cite <- RunUMAP(cite, reduction = 'apca', dims = 1:18, assay = 'ADT', 
              reduction.name = 'adt.umap', reduction.key = 'adtUMAP_')
# Step 3
cite <- FindClusters(cite, resolution = 2, graph.name = 'ADT_snn')
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5000
## Number of edges: 191091
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6584
## Number of communities: 18
## Elapsed time: 0 seconds
cite$adt_clusters <- Idents(cite)

# Dimensional reduction plot
p3 <- DimPlot(cite, reduction = 'adt.umap', 
              label = TRUE, repel = TRUE, label.size = 2.5) + 
      ggtitle("ADT Predicted")
p4 <- DimPlot(cite, reduction = 'adt.umap', group.by = 'celltype.l2', 
              label = TRUE, repel = TRUE, label.size = 2.5) + 
      ggtitle("ADT Annotated")
p3+p4 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))

scRNA data + ADT data

In CITE-Seq, since we have RNA and ADT data for the same cell, we can apply weighted nearest neighbor (WNN) clustering method to integrate different omics.

WNN Clustering

For each cell, we calculate its closest neighbors in the dataset based on a weighted combination of RNA and protein similarities.

Three-step analysis:
(1) Calculate closest neighbors
(2) Create a UMAP visualization
(3) Perform graph-based clustering

# Step 1
cite <- FindMultiModalNeighbors(
  cite, reduction.list = list("pca", "apca"), 
  dims.list = list(1:30, 1:18),
  modality.weight.name = c("RNA.weight", "ADT.weight"))
# Step 2
cite <- RunUMAP(cite, nn.name = "weighted.nn", reduction.name = "wnn.umap", reduction.key = "wnnUMAP_")
# Step 3
cite <- FindClusters(cite, graph.name = "wsnn", algorithm = 3, resolution = 2, verbose = FALSE)
cite$wnn_clusters <- Idents(cite)

# Dimensional reduction plot
p5 <- DimPlot(cite, reduction = 'wnn.umap', 
              label = TRUE, repel = TRUE, label.size = 2.5) + 
      ggtitle("WNN Predicted")
p6 <- DimPlot(cite, reduction = 'wnn.umap', group.by = 'celltype.l2', 
              label = TRUE, repel = TRUE, label.size = 2.5) + 
      ggtitle("WNN Annotated")
p5+p6 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))

Data comparison

p2+p4+p6 & NoLegend() & theme(plot.title = element_text(hjust = 0.5))

Cell-specific modality weights

VlnPlot(cite, features = c("RNA.weight", "ADT.weight"), 
        group.by = 'celltype.l2', sort = TRUE, 
        pt.size = 0, ncol = 1) + NoLegend()

Reference:
https://satijalab.org/seurat/articles/weighted_nearest_neighbor_analysis.html