#'
#'
plot.scRNA.features.complexHeatmap=function(s.anno, features, name,width=300, cores){
log=mclapply(seq(dim(s.anno)[1]), plot.scRNA.features, s.anno,features, name, mc.cores = cores, width=width)
}
plot.scRNA.features=function(i, s.anno, features, name, add.annotation=NULL, width=300){
# eccite PBMC dataset:
load(s.anno[i,2])
#HSC, myeloid, B-solu, T-solu, erythroi
# STEP1 find the cluster order, and set them based on lineage:
clusters=Idents(scmat)
if(s.anno[i,3]!=""){
order.clu=as.numeric(unlist(strsplit(s.anno[i,3], ",|, | ,"))) # MODIFY
ord.names=names(clusters[order(match(clusters,order.clu))])
}else{
order.clu=levels(clusters)
# order samples within cluster by umap component with more spread
# this way there is still information within cluster, for heatmaps
coords=Embeddings(scmat[["umap"]])
fit.cluster=levels(Idents(scmat))
ord.names=unlist(lapply(fit.cluster, function(clu){
d=coords[Idents(scmat)%in%clu,]
ind=which.max(c(max(d[,1])-min(d[,1]), max(d[,2])-min(d[,2]))) # take the axis with most spread
names(sort(d[,ind]))
}))
}
features=data.frame(features, stringsAsFactors = F)
if(!grepl("^B:|^N:", features[1,1])){
features[,1]=paste0("N:GEXP:", features[,1])
allfeats=rownames(fm)[!rowSums(fm>0)==0]
}else{
allfeats=gsub("N:GEXP:|N:RPPA:", "", rownames(fm)[!rowSums(fm>0)==0])
}
# text annot created if more columns than 1.
if(dim(features)[2]>1){
features=features[features[,1]%in%allfeats,]
feats=features[,1]
t.annot=apply(features[,2:dim(features)[2], drop=F], 1, paste, collapse=" ")
names(t.annot)=feats
}else{
t.annot=NULL
feats=features[features[,1]%in%allfeats,1]
}
# make a data frame for annotations on top of the heatmap
annotdf=data.frame("cluster"=clusters,"HLAIScore"=as.numeric(fm["N:SAMP:HLAIScore",]), "HLAIIScore"=as.numeric(fm["N:SAMP:HLAIIScore",]), "CytolyticScore"=as.numeric(fm["N:SAMP:CytolyticScore",]))
if(!is.null(add.annotation))annotdf$annotation=add.annotation
name=gsub("__", "_", gsub("[[:punct:]]", "_", paste(name, s.anno[i,1], sep="_")))
# plot using a complexheatmap package using a featurematrix format and custom plotting function
r1=plot.complexHM.fm(feats = feats, text_annot = t.annot, fm.f = fm, annotdf = annotdf, feats.barplot = c("HLAIScore", "HLAIIScore", "CytolyticScore"), order_columns=ord.names, order_rows=F, split.columns = T, plotting.param="/research/groups/sysgen/PROJECTS/students/HEMAP_IMMUNOLOGY/minna_work/scRNA_genelist_plotting/plotting_param_fm.txt", NAME=name, WIDTH = width)
}
plot.gene.seurat=function(s.anno, feature, name,cols=c("grey75", "red"), max=1, min=0, cores=1){
plot.feature=function(i, feature){
# plot proportions as a barplot next to 2D points
load(s.anno[i,2])
if(!feature%in%rownames(scmat))return(NULL)
p <- FeaturePlot(scmat, features = feature, cols = cols, max.cutoff = max, min.cutoff = min, order = T, combine = FALSE)
p=lapply(p, `+`, labs(title = s.anno[i,1]))
if(i>1){
for(i in 1:length(p)) {
p[[i]] <- p[[i]] + NoLegend() + NoAxes()
}
}else{
p[[1]] <- p[[1]] + NoAxes()
}
cowplot::plot_grid(plotlist = p)
}
# plot here lineage proportion per patient
plot.list=mclapply(seq(dim(s.anno)[1]), plot.feature, feature, mc.cores=cores)
plot.list=plot.list[!sapply(plot.list, is.null)]
if(!length(plot.list))return(NULL)
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 170, height = ceiling(length(plot.list)/2)*75+75, rows = ceiling(length(plot.list)/2)+1, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
plot.fm.feature=function(s.anno, feature, name, max=3, min=0, cores=1){
plot.feature=function(i, feature){
# plot proportions as a barplot next to 2D points
load(s.anno[i,2])
scmat[[feature]]=fm[rownames(fm)%in%feature,]
p <- FeaturePlot(scmat, features = feature, cols = c("grey75", "red"), max.cutoff = max, min.cutoff = min, order = T, combine = FALSE)
p=lapply(p, `+`, labs(title = s.anno[i,1]))
if(i>1){
for(i in 1:length(p)) {
p[[i]] <- p[[i]] + NoLegend() + NoAxes()
}
}else{
p[[1]] <- p[[1]] + NoAxes()
}
return(p)
}
# plot here lineage proportion per patient
plot.list=unlist(mclapply(seq(dim(s.anno)[1]), plot.feature, feature, mc.cores=cores), recursive = F)
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 170, height = ceiling(length(plot.list)/2)*75+75, rows = ceiling(length(plot.list)/2)+1, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 170, height = ceiling(length(plot.list)/2)*75+75, rows = ceiling(length(plot.list)/2)+1, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
plot.several.fm.feature=function(s.anno.object, features, name, min=0, max=5, cores=5){
load(s.anno.object)
features=features[features%in%rownames(fm)]
plot.feature=function(i, features){
# plot proportions as a barplot next to 2D points
scmat[[features[i]]]=fm[rownames(fm)%in%features[i],]
p <- FeaturePlot(scmat, features = features[i], cols = c("grey75", "red"), max.cutoff = max, min.cutoff = min, order = T, combine = FALSE)
p=lapply(p, `+`, labs(title = features[i]))
if(i>1){
for(i in 1:length(p)) {
p[[i]] <- p[[i]] + NoLegend() + NoAxes()
}
}else{
p[[1]] <- p[[1]] + NoAxes()
}
return(p)
}
# plot here lineage proportion per patient
plot.list=unlist(mclapply(seq(features), plot.feature, features, mc.cores=cores), recursive = F)
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 170, height = ceiling(length(plot.list)/2)*75+75, rows = ceiling(length(plot.list)/2)+1, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
plot.seurat.ident=function(s.anno, colorv, name, cores=1){
plot.feature=function(i){
# plot proportions as a barplot next to 2D points
load(s.anno[i,2])
cells=gsub(" \\d+$", "", levels(Idents(scmat)))
p=DimPlot(scmat, group.by = c("ident"), cols = colorv[match(cells, colorv[,1]),2], ncol = 1, label = T, repel = T) + NoLegend() + NoAxes()
}
# plot here lineage proportion per patient
plot.list=mclapply(seq(dim(s.anno)[1]), plot.feature, mc.cores=cores)
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 170, height = ceiling(length(plot.list)/2)*75+75, rows = ceiling(length(plot.list)/2)+1, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
#'
plot.multi.scatter.scRNA=function(s.anno, scmat=NULL, colors.group, identityVector.samples.list=NULL, seurat.feature=NULL, name="scRNA", cores=7, SIZE=0.1, rasterize=T, width=77, height = 74, text.size=10){
# tools:
library(Matrix)
library(Seurat)
source("/research/users/ppolonen/git_home/common_scripts/visualisation/plotting_functions.R")
library(data.table)
library(ComplexHeatmap)
library(circlize)
library(parallel)
library(ggplot2)
# plot here lineage proportion per patient
plot.list=mclapply(seq(dim(s.anno)[1]), function(i){
# plot proportions as a barplot next to 2D points
load(s.anno[i,2])
coords=Embeddings(scmat[["umap"]])
identityVector=unlist(strsplit(s.anno$Type[i], ", |,"))
clusters=unlist(strsplit(s.anno$Clusters[i], ", |,"))
clusters.samples=as.character(Idents(scmat))
if(is.null(identityVector.samples.list)){
identityVector.samples=clusters.samples
for(j in seq(clusters)){
identityVector.samples[clusters.samples%in%clusters[j]]=identityVector[j]
}
}else{
identityVector.samples=as.character(identityVector.samples.list[[i]]) # must be in a list
}
if(!is.null(seurat.feature)){
identityVector.samples=as.character(scmat[[seurat.feature]][,1]) # must be in a list
}
name.sample=s.anno$Sample[i]
# take the colors:
colorv=colors.group[match(unique(identityVector.samples), colors.group[,1]),2]
a=plot.scatter(x = coords[,1], y = coords[,2], group = identityVector.samples, colorv = colorv, namev=name, main = name, add.proportions = T, SIZE = SIZE, add.legend=F, rasterize=rasterize, width=width, height = height, text.size=text.size)
return(a)
}, mc.cores=cores)
# add legend last
add.legend=get.only.legend(group = colors.group[,1], colorv = colors.group[,2])
nrows=ceiling(length(plot.list)/2)+1
# 74mm * 99mm per panelwidth = 77, height = 74
figure <-multipanelfigure:: multi_panel_figure(width = 2*width+width*0.2, height = ceiling(length(plot.list)/2)*height+120, rows = nrows, columns = 2, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
rowi=1
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
figure <- multipanelfigure::fill_panel(figure,row = nrows,column = 1:2, add.legend)
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
plot.scatter.seurat=function(scmat, colors.group,clusters="", identityVector="", identityVector.samples.list=NULL, seurat.feature=NULL, lv=NULL, name="scRNA", cores=7, SIZE=0.1, rasterize=T, width=77, height=74, text.size=10, add.proportions=T, add.density=F, add.labels=F){
# tools:
library(Matrix)
library(Seurat)
source("/research/users/ppolonen/git_home/common_scripts/visualisation/plotting_functions.R")
library(data.table)
library(ComplexHeatmap)
library(circlize)
library(parallel)
library(ggplot2)
coords=Embeddings(scmat[["umap"]])
clusters.samples=as.character(Idents(scmat))
if(is.null(identityVector.samples.list)){
identityVector.samples=clusters.samples
for(j in seq(clusters)){
identityVector.samples[clusters.samples%in%clusters[j]]=identityVector[j]
}
}else{
identityVector.samples=as.character(identityVector.samples.list[[i]]) # must be in a list
}
if(!is.null(seurat.feature)){
identityVector.samples=as.character(scmat[[seurat.feature]][,1]) # must be in a list
ind=order(match(identityVector.samples, colors.group[,1]))
# order everything:
identityVector.samples=identityVector.samples[ind]
coords=coords[ind,]
lv=lv[ind]
}
# take the colors:
colorv=colors.group[match(unique(identityVector.samples), colors.group[,1]),2]
plot.list=list(plot.scatter(x = coords[,1], y = coords[,2], group = identityVector.samples, colorv = colorv, lv = lv, namev=name, main = name, add.proportions = add.proportions, add.density = add.density, add.labels = add.labels, SIZE = SIZE, add.legend=T, rasterize=rasterize, width=width, height = height, text.size=text.size))
figure <-multipanelfigure:: multi_panel_figure(width = width+50, height = height, rows = 1, columns = 1, panel_label_type = "none", unit = "mm", row_spacing = unit(1, "mm"), column_spacing = unit(4, "mm"))
for(i in seq(plot.list)){
figure <- multipanelfigure::fill_panel(figure, plot.list[[i]])
}
multipanelfigure::save_multi_panel_figure(figure, filename=paste0(name, "_fm_scatterplot.pdf"), limitsize = FALSE)
}
#'
#'
compute.FDR.scRNA=function(i, s.anno, nperm=1000, geneset, nCores=6){
library(AUCell)
library(Matrix)
library(Seurat)
load(s.anno[i,2])
# take only genes that are in the matrix
geneset=geneset[geneset%in%rownames(scmat)]
# Random
geneset.random=lapply(seq(nperm), function(i, genes){
sample(rownames(scmat), length(geneset))
})
names(geneset.random)=paste0("Random",seq(nperm))
geneset.observed=list("MDS_signature"=geneset)
data_n <- data.matrix(GetAssayData(scmat, assay = "RNA"))
cells_rankings <- AUCell_buildRankings(data_n, nCores=nCores, plotStats=TRUE)
AUC.random <- AUCell_calcAUC(geneset.random, cells_rankings, nCores = nCores, nrow(cells_rankings)*0.05)
AUC.observed <- AUCell_calcAUC(geneset.observed, cells_rankings, nCores = nCores, nrow(cells_rankings)*0.05)
# Alternative: assign cells according to the 'automatic' threshold
cells_assignment <- AUCell_exploreThresholds(AUC.observed, plotHist=T, assignCells=TRUE)
# how many times random higher than observed, divide by the number of permutations == eFDR
y=as.numeric(getAUC(AUC.observed))
x=getAUC(AUC.random)
FDR=sapply(seq(y), function(i){
sum(x[,i]>y[i])/nperm
})
empirical.FDR=data.frame(t(FDR), check.names = F)
rownames(empirical.FDR)=paste0(names(geneset.observed), "_eFDR_", nperm, "_", length(geneset))
colnames(empirical.FDR)=colnames(scmat)
a=scmat[["SingleR.label"]]
table(a[rownames(a)%in%cells_assignment$MDS_signature$assignment,])
table(a[empirical.FDR<0.05,])
scmat[["empirical.FDR"]]=as.numeric(empirical.FDR)
DimPlot(scmat, group.by = "empirical.FDR")
scmat[["y"]]=as.numeric(y)>0.04
DimPlot(scmat, group.by = "y")
return(list(empirical.FDR, y))
}
get.only.legend=function(colorv, group, position="right", direction="vertical"){
library(ggplot2)
dat=data.frame(group, colorv)
levels(dat$group)=group
myColors <- colorv
names(myColors) <- levels(group)
colScale <- scale_colour_manual(name = "group",values = myColors)
dat$x=dim(dat)[1]
dat$y=dim(dat)[1]
legend <- cowplot::get_legend(ggplot(dat, aes(x, y, colour = group)) + theme_bw() +
geom_point(size=0.6) + colScale +theme(legend.position = position, legend.direction = direction, legend.title = element_blank(),
legend.key=element_blank(), legend.box.background=element_blank(),
legend.text=element_text(size = 10, face = "bold", family="Helvetica"))+
theme(plot.margin=unit(c(1,1,1,1), "mm"))+
guides(colour = guide_legend(override.aes = list(size=3))))
return(legend)
}
statistical.comparisons.scRNA=function(i, s.anno, genelist.statistical.analysis=NULL, test.use="wilcox"){
# plot proportions as a barplot next to 2D points
load(s.anno[i,2])
coords=Embeddings(scmat[["umap"]])
identityVector=gsub(" ", "", unlist(strsplit(s.anno$Type[i], ", |,")))
clusters=gsub(" ", "", unlist(strsplit(s.anno$Clusters[i], ", |,")))
clusters.samples=as.character(Idents(scmat))
identityVector.samples=clusters.samples
for(j in seq(clusters)){
identityVector.samples[clusters.samples%in%clusters[j]]=identityVector[j]
}
name.sample=s.anno$Sample[i]
if(is.null(genelist.statistical.analysis))genelist.statistical.analysis=rownames(scmat)
# Find the DE genes per factor and within factor:
statistical.analysis=do.call(rbind, mclapply(unique(identityVector.samples), function(id){
clusters2test=unique(clusters.samples[identityVector.samples%in%id])
# clusters2test vs Rest:
markers=FindMarkers(scmat, ident.1 = clusters2test, group.by = "seurat_clusters", test.use = test.use, logfc.threshold =0.01, only.pos = T, features = genelist.statistical.analysis)
#DE in clusters within category
cluster.markers=FindAllMarkers(scmat[,clusters.samples%in%clusters2test], test.use = test.use, logfc.threshold = 0.01, only.pos = T, features = genelist.statistical.analysis)
# make genelists with annotation:
markers$test=paste0(id, " vs. Rest")
markers$gene=rownames(markers)
cluster.markers$test=paste0(id, " ", cluster.markers$cluster, " vs. Rest")
cluster.markers=cluster.markers[,!colnames(cluster.markers)%in%"cluster"]
df.statistics=plyr::rbind.fill(list(markers, cluster.markers))
df.statistics$test.used=test.use
return(df.statistics)
}))
return(statistical.analysis)
}
#' @param seurat.object Seurat class object or data matrix that can be converted to Seurat object. Can also be a list of data matrices (RNA+citeseq etc.), Must be named by assay type with four uppercase letters (PROT, etc) (also found from seurat object with this name).
#' @param regress.cell.label Character vector with as many columns as seurat.object Contains batch information for each cell. Uses fastMNN/SCTtransform to batch correct the data (usually different samples or experiments).
#' @param check.pcs Plot jackstraw and elbowplot to optimize the number of PCA componens
#' @param plot.umap Umap plot for clusters and for sample ID, if batch corrected also batch clustering
#' @param resolution Resolution parameter for clustering
#' @param nFeature.min Filter cells with < nFeature.min
#' @param nFeature.max Filter cells with > nFeature.max
#' @param percent.mitoDNA Filter cells with mitochondrial genes > percent.mitoDNA
#' @param singleR Annotate each cell using singleR built in annotations. Mainly uses Encode+blueprint annotations.
#' @param cores Number of cores to use for Seurat/singleR/MNNcorrect
#' @param add.gm.score Adds geneset score (using geometric mean) to fm, must be a named list of genes. Geneset name is added to fm and seurat object.
#' @param ... Pass parameters to Seurat tools
sc.data.analysis=function(scmat, name="scRNA_object", nr.pcs=30, regress.cell.label=NULL, batch.correction.method=c("SCTtransform","MNNcorrect"), auto.order=F, normalize.input=T, check.pcs=T, plot.umap=T, resolution=0.5, nFeature.min=0, nFeature.max=10000, percent.mitoDNA=25, singleR=T, singleR.reference=c("blueprint_encode", "HPCA"), cores=6, add.gm.score=NULL, ...){
# determine computational resources:
future::plan("multiprocess", workers = cores)
options(future.globals.maxSize= 5e+09)
# set method for batch correction
batch.correction.method=match.arg(batch.correction.method)
# list of data matrix, can be citeseq data:
if(class(scmat)=="list"){
cat("Input is list, splitting to data matrix by type and adding to Seurat", sep="\n\n")
list.additional=lapply(2:length(scmat), function(i)CreateAssayObject(scmat[[i]]))
names(list.additional)=names(scmat)[2:length(scmat)]
scmat <- CreateSeuratObject(scmat[[1]])
}else{
list.additional=NULL
# if class is not seurat object
if(!class(scmat)=="Seurat"){
cat("Input is data matrix, converting to Seurat object", sep="\n\n")
scmat <- CreateSeuratObject(scmat)
}else{
cat("Input is Seurat object", sep="\n\n")
}
}
DefaultAssay(object = scmat) <- "RNA"
# is it a vector of pcs or single value?
if(length(nr.pcs)==1)nr.pcs=seq(nr.pcs)
# QC
scmat=PercentageFeatureSet(scmat, pattern = "^MT-", col.name = "percent.mt")
r1=VlnPlot(scmat, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
ggplot2::ggsave(r1, filename = paste0(name, "_QC.pdf"), width = 12)
# Filtering
nFeature_RNA <- FetchData(object = scmat, vars = "nFeature_RNA")
pct.mt <- FetchData(object = scmat, vars = "percent.mt")
filt=which(x = nFeature_RNA > nFeature.min & nFeature_RNA < nFeature.max & pct.mt < percent.mitoDNA)
cat(paste("Dimension before/after filtering:", paste(paste(dim(scmat), collapse="x"), paste(dim(scmat[, filt]), collapse="x"), collapse = "/")), sep="\n\n")
scmat=scmat[, filt]
if(normalize.input){
assay="SCT"
scmat=SCTransform(scmat, variable.features.n = 3000) #,...)
# seurat standard processing
scmat=RunPCA(scmat, npcs = 100)
reduction="pca"
}else{
assay="RNA"
# assume that the data is normalized
scmat <- FindVariableFeatures(object = scmat)
scmat <- ScaleData(object = scmat)
scmat=RunPCA(scmat, npcs = 100)
reduction="pca"
}
# set default assay to use:
DefaultAssay(object = scmat) <- assay
# use SCTransform to remove batch effect, used in umap and clustering
# else use SCT to normalize data
if(!is.null(regress.cell.label)&batch.correction.method=="SCTtransform"){
batch.df=data.frame("batch"=regress.cell.label[filt])
rownames(batch.df)=colnames(scmat)
scmat[["batch"]] = batch.df
scmat=SCTransform(scmat,vars.to.regress="batch", variable.features.n = 3000)
# seurat standard processing
scmat=RunPCA(scmat, npcs = 100)
name=paste0(name, "_", batch.correction.method)
reduction="pca"
}
# optimize pcs number:
if(check.pcs){
# setting here original RNA to define PCAs, SCT not working yet, wait for a fix
# also the issue here how to deal with the mnnCorrect?
# DefaultAssay(object = scmat) <- "RNA"
# scmat <- NormalizeData(scmat,display.progress = FALSE)
# scmat <- FindVariableFeatures(scmat,do.plot = F,display.progress = FALSE)
# scmat <- ScaleData(scmat)
# scmat=RunPCA(scmat, npcs = 100)
# determine how many pcs should be used:
r3=ElbowPlot(scmat, ndims = 100, reduction = "pca")
ggplot2::ggsave(r3, filename = paste0(name, "_elbow.pdf"))
# find optimal number of PCs to use:
scmat <- JackStraw(scmat, num.replicate = 100, dims = 100, reduction = "pca")
scmat <- ScoreJackStraw(scmat, dims = 1:100, reduction = "pca")
r4=JackStrawPlot(scmat, dims = 1:100)
ggplot2::ggsave(r4, filename = paste0(name, "_Jackstraw.pdf"), width = 12)
cat("PCs checking done", sep="\n\n")
}
# use MNN to remove batch effect, used in umap and clustering:
if(!is.null(regress.cell.label)&batch.correction.method=="MNNcorrect"){
library(BiocParallel)
library(batchelor)
cat("FastMNN batch correction...", sep="\n\n")
if(is.factor(regress.cell.label)&!auto.order){
order=levels(regress.cell.label)
cat("FastMNN batch correction order:", sep="\n\n")
cat(order, sep="-->")
cat("", sep="\n\n")
}
if(!is.factor(regress.cell.label)&!auto.order){
order=unique(regress.cell.label)
cat("FastMNN batch correction order:", sep="\n\n")
cat(order, sep="-->")
cat("", sep="\n\n")
}
batch.df=data.frame("batch"=regress.cell.label[filt])
rownames(batch.df)=colnames(scmat)
scmat[["batch"]] = batch.df
# http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/fast_mnn.html
scmat <- NormalizeData(scmat)
scmat <- FindVariableFeatures(scmat)
object.list = SplitObject(scmat, split.by = "batch")[order(order)]
scmat <- SeuratWrappers::RunFastMNN(object.list, reduction.name = "mnnCorrect", auto.order=auto.order, BPPARAM=MulticoreParam(cores))
# set defaults
reduction="mnnCorrect"
cat("FastMNN batch correction... done", sep="\n\n")
}
# Basic seurat processing with umap:
scmat=RunUMAP(scmat, dims = nr.pcs, reduction = reduction)
scmat=FindNeighbors(scmat, dims = nr.pcs, reduction = reduction)
scmat=FindClusters(scmat, resolution = resolution)
# add celltype automatically
if(singleR){
cat("Annotating with singleR", sep="\n\n")
if(!file.exists(paste0(name, "_singler_object.Rdata"))){
library("SingleR")
counts <- GetAssayData(scmat, assay = assay, slot = "counts")
if(singleR.reference=="blueprint_encode"){
if(dim(counts)[2]<100000){
singler = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(blueprint_encode), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
}else{
cat("SingleR in batches", sep="\n\n")
singler = CreateBigSingleRObject.custom(counts, annot = NULL, clusters=Idents(scmat), N=30000, ref.list = list(blueprint_encode), xy = Embeddings(scmat[["umap"]]), project.name=name, fine.tune = T, do.signatures = T, numCores=cores)
}
}
if(singleR.reference=="HPCA"){
if(dim(counts)[2]<100000){
singler = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(hpca), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
}else{
cat("SingleR in batches", sep="\n\n")
singler = CreateBigSingleRObject.custom(counts, annot = NULL, clusters=Idents(scmat), N=30000, ref.list = list(hpca), project.name=name, xy = Embeddings(scmat[["umap"]]), fine.tune = T,do.signatures = T, numCores=cores)
}
}
# add original identifiers
singler$meta.data$orig.ident = scmat@meta.data$orig.ident # the original identities, if not supplied in 'annot'
## if using Seurat v3.0 and over use:
singler$meta.data$xy = scmat@reductions$umap@cell.embeddings # the tSNE coordinates
singler$meta.data$clusters = scmat@active.ident # the Seurat clusters (if 'clusters' not provided)
# add annot from hpca
# singler2 = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(hpca), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
# singler$singler[[1]]$SingleR.single$labels[singler2$singler[[1]]$SingleR.single$labels%in%"Pre-B_cell_CD34-"]="pre-B-cells"
# singler$singler[[1]]$SingleR.single$labels[singler2$singler[[1]]$SingleR.single$labels%in%"Pro-B_cell_CD34+"]="pro-B-cells"
# singler$singler[[1]]$SingleR.clusters$labels[singler2$singler[[1]]$SingleR.clusters$labels%in%"Pre-B_cell_CD34-"]="pre-B-cells"
# singler$singler[[1]]$SingleR.clusters$labels[singler2$singler[[1]]$SingleR.clusters$labels%in%"Pro-B_cell_CD34+"]="pro-B-cells"
# these names are not intuitive, replace them, mentioned that these are actually naive http://comphealth.ucsf.edu/SingleR/SupplementaryInformation2.html:
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD4+ T-cells"]="CD4+ Tn"
singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD8+ T-cells"]="CD8+ Tn"
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD4+ T-cells"]="CD4+ Tn"
singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD8+ T-cells"]="CD8+ Tn"
save(singler, file=paste0(name, "_singler_object.Rdata"))
}else{
library("SingleR")
load(paste0(name, "_singler_object.Rdata"))
cat(paste0("Loaded existing SingleR object (", paste0(name, "_singler_object.Rdata"), "), remove/rename it if you want to re-compute."), sep="\n\n")
}
# can be added as cluster id
cluster=singler$singler[[1]]$SingleR.clusters$labels
cluster.main=singler$singler[[1]]$SingleR.clusters.main$labels
# order based on cell type and add celltype to cluster:
lineage=c("HSC","MPP","CLP","CMP","GMP","MEP","Monocytes","DC","Macrophages","Macrophages M1","Macrophages M2","CD4+ Tn","CD4+ T-cells", "CD4+ Tcm", "CD4+ Tem","CD8+ Tn","CD8+ T-cells","CD8+ Tcm","CD8+ Tem","NK cells","Tregs","naive B-cells","Memory B-cells","Class-switched memory B-cells","Plasma cells","Endothelial cells","Neutrophils","Eosinophils","Fibroblasts","Smooth muscle","Erythrocytes","Megakaryocytes")
lineage=unique(c(lineage,blueprint_encode$types, hpca$types))
lineage=lineage[lineage%in%singler$singler[[1]]$SingleR.single$labels]
scmat[["SingleR.label.main"]]=singler$singler[[1]]$SingleR.single.main$labels
scmat[["SingleR.label.cluster"]]=paste(singler$singler[[1]]$SingleR.single$labels, Idents(scmat))
scmat[["SingleR.label"]]=factor(singler$singler[[1]]$SingleR.single$labels, levels=lineage[lineage%in%unique(singler$singler[[1]]$SingleR.single$labels)])
identityVector.samples=as.character(Idents(scmat))
clusters.samples=Idents(scmat)
for(j in seq(cluster)){
identityVector.samples[clusters.samples%in%rownames(cluster)[j]]=paste0(cluster[j], ":", rownames(cluster)[j])
}
# order and cluster identity;
cluster=cluster[order(match(cluster[,1],lineage)),,drop=F]
cat(paste(levels(Idents(scmat)), collapse=","), paste(cluster, collapse=","), sep="\t\t")
scmat[["SingleR.cluster"]]=gsub(":.*.", "", Idents(scmat)) # just the "cluster label" per cell
# order seurat clusters too:
Idents(scmat)=factor(identityVector.samples, levels=paste0(cluster, ":", rownames(cluster)))
r4=DimPlot(object = scmat, group.by = "SingleR.label", reduction = 'umap' , label = TRUE, pt.size = 0.5, repel = T) + NoLegend() + NoAxes()
ggplot2::ggsave(r4, filename = paste0(name, "_UMAP_singleR.pdf"), width = 6, height = 6)
cat("\nSingleR done", sep="\n\n")
# new singleR:
# counts <- GetAssayData(scmat, assay = "RNA", slot = "counts")
#
# library(SingleR)
# library(scater)
#
# ref=NovershternHematopoieticData()
#
# common <- intersect(rownames(counts), rownames(ref))
# ref <- ref[common,]
# counts <- counts[common,]
# counts.log <- logNormCounts(counts)
#
# singler <- SingleR(test = counts.log, ref = ref, labels = ref$label.main, numCores=cores)
#
# # order based on cell type and add celltype to cluster:
# lineage=c("HSC","MPP","CLP","CMP","GMP","MEP","Monocytes","DC","Macrophages","Macrophages M1","Macrophages M2","CD4+ Tn","CD4+ T-cells", "CD4+ Tcm", "CD4+ Tem","CD8+ Tn","CD8+ T-cells","CD8+ Tcm","CD8+ Tem","NK cells","Tregs","naive B-cells","Memory B-cells","Class-switched memory B-cells","Plasma cells","Endothelial cells","Neutrophils","Eosinophils","Fibroblasts","Smooth muscle","Erythrocytes","Megakaryocytes")
# lineage=unique(c(lineage,ref$label.main))
#
# scmat[["SingleR.label.main"]]=singler$first.labels
# scmat[["SingleR.label"]]=factor(singler$labels, levels=lineage[lineage%in%unique(singler$labels)])
#
# r4=DimPlot(object = scmat, group.by = "SingleR.label", reduction = 'umap' , label = TRUE, pt.size = 0.5, repel = T) + NoLegend() + NoAxes()
# ggplot2::ggsave(r4, filename = paste0(name, "_UMAP_singleR.pdf"), width = 6, height = 6)
#
# cat("\nSingleR done", sep="\n\n")
}
# plot clusters and samples
if(plot.umap){
if(!is.null(regress.cell.label)){
r6=DimPlot(scmat, group.by = c("batch"), ncol = 2, label = T, repel = T) + NoLegend() + NoAxes()
ggplot2::ggsave(r6, filename = paste0(name, "_UMAP_batch.pdf"), width = 6, height = 6)
r6=DimPlot(scmat, group.by = c("ident"), ncol = 2, label = T, repel = T) + NoLegend() + NoAxes()
ggplot2::ggsave(r6, filename = paste0(name, "_UMAP_clusters.pdf"), width = 6, height = 6)
}else{
r6=DimPlot(scmat, group.by = c("ident"), ncol = 1, label = T, repel = T) + NoLegend() + NoAxes()
ggplot2::ggsave(r6, filename = paste0(name, "_UMAP_clusters.pdf"), width = 6, height = 6)
}
}
# add immunoscores to FM format data matrix
fm.f <- GetAssayData(scmat)
# add gm based score:
add.scores=list(HLAIScore=c("B2M", "HLA-A", "HLA-B","HLA-C"), HLAIIScore=c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DRA","HLA-DRB1"), CytolyticScore=c("GZMA", "PRF1", "GNLY", "GZMH", "GZMM"))
if(!is.null(add.gm.score))add.scores=append(add.scores, add.gm.score)
gm.objects=do.call(rbind, lapply(seq(add.scores), function(i){
dat3=fm.f[rownames(fm.f)%in%add.scores[[i]],]
gm=t(apply(dat3, 2, gm_mean)) # done to normalized values
rownames(gm)=names(add.scores)[i]
return(gm)
}))
# also add to seurat object:
for(i in seq(add.scores)){
scmat[[names(add.scores)[i]]] <- gm.objects[i,]
}
# gene expression and scores to fm:
rownames(fm.f)=paste0("N:GEXP:", rownames(fm.f))
rownames(gm.objects)=paste0("N:SAMP:", rownames(gm.objects))
fm=rbind(gm.objects, fm.f)
# DE analysis:
markers.all=FindAllMarkers(scmat, only.pos = T)#, ...)
# go through each data matrix, add to seurat and fm if more matrices
if(!is.null(list.additional)){
for(i in seq(list.additional)){
mat.add=list.additional[[i]]
mat.add=subset(mat.add,cells = colnames(scmat))
scmat[[names(list.additional)[i]]] <- mat.add
scmat <- NormalizeData(scmat, assay = names(list.additional)[i], normalization.method = "CLR") # test scttransform too
scmat <- ScaleData(scmat, assay = names(list.additional)[i])
add.data.normalized <- data.matrix(GetAssayData(scmat, assay = names(list.additional)[i], slot = "data"))
rownames(add.data.normalized)=paste0("N:",names(list.additional)[i], ":", rownames(add.data.normalized))
fm=rbind(fm, add.data.normalized)
}
}
save(list=c("fm", "scmat", "markers.all"), file=paste0(name, "_scRNA.Rdata"))
return(paste0(name, "_scRNA.Rdata"))
}
# make sure data is on a linear non-log scale
# check 0 values, if a lot between 0-1 better to use add.one. If counts, remove works too quite well
# zero fix methods: https://arxiv.org/pdf/1806.06403.pdf
gm_mean=function(x, na.rm=TRUE, zero.handle=c("remove", "add.one", "zero.propagate")){
zero.handle=match.arg(zero.handle)
if(any(x < 0, na.rm = TRUE)){
warning("Negative values produced NaN - is the data on linear - non-log scale?")
return(NaN)
}
if(zero.handle=="remove"){
return(exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)))
}
if(zero.handle=="add.one"){
return(exp(mean(log(x+1), na.rm = na.rm))-1)
}
if(zero.handle=="zero.propagate"){
if(any(x == 0, na.rm = TRUE)){
return(0)
}
return(exp(mean(log(x), na.rm = na.rm)))
}
}
get.cluster.label.singleR=function(singler){
# how to add this?
cluster=singler$singler[[1]]$SingleR.clusters$labels
cluster.main=singler$singler[[1]]$SingleR.clusters.main$labels
# order based on cell type:
# cat(paste0("'", colors.group[,1], "'"), sep=",")
lineage=c('HSC','MPP','CLP','CMP','GMP','MEP','Monocytes','DC','Macrophages','Macrophages M1','Macrophages M2','CD4+ T-cells','CD8+ T-cells','Tregs','T_cell:gamma-delta','T_cell:CD4+_Naive','T_cell:CD8+_naive','T_cell:Treg:Naive','CD4+ Tcm','CD4+ Tem','CD8+ Tcm','CD8+ Tem','NK cells','naive B-cells','Memory B-cells','B-cells','Class-switched memory B-cells','Plasma cells','Endothelial cells','Neutrophils','Eosinophils','Fibroblasts','Smooth muscle','Erythroblast','Erythrocytes','Megakaryocytes')
cluster=cluster[order(match(cluster[,1],lineage)),,drop=F]
# order and cluster identity;
cat(paste(rownames(cluster), collapse=","), paste(cluster, collapse=","), sep="\t")
cat("", sep="\n")
return(cluster)
}
# run cellphoneDB
run.cellphonedb=function(scmat, celltype, out=getwd(), cores=10, threshold=0.05){
if(!dir.exists(out))dir.create(out, recursive = T)
if(is.null(celltype)){
celltype=gsub(" ", "_", as.character(Idents(scmat)))
}
setwd(out)
# input1:
df=data.frame("Cell"=gsub("\\.", "_", colnames(scmat)), "cell_type"=celltype, stringsAsFactors = F)
df$cell_type[scmat[["SingleR.label"]]=="Tregs"]="Tregs"
meta=file.path(out, "meta.tsv")
write.table(df, meta, row.names = F, col.names = T, quote = F, sep="\t")
# input2:
counts <- data.matrix(GetAssayData(scmat, assay = "SCT", slot = "data"))
countsfile=tempfile()
# gene symbol was not working...
library("org.Hs.eg.db") # remember to install it if you don't have it already
ensambleid <- mapIds(org.Hs.eg.db, keys = rownames(counts), keytype = "SYMBOL", column="ENSEMBL")
rownames(counts)=ensambleid
counts=counts[!is.na(rownames(counts)),]
countsfile=file.path(out, "counts.tsv")
write.table(t(c("Gene", gsub("\\.", "_", colnames(counts)))), countsfile, row.names = F, col.names = F, quote = F, sep="\t")
write.table(counts, countsfile, row.names = T, col.names = F, quote = F, sep="\t", append=T)
system(paste0("cellphonedb method statistical_analysis ", meta, " ", countsfile, " --iterations=1000 --threads=", cores, " --threshold=", threshold))
# make heatmap of interactions:
system(paste0("cellphonedb plot heatmap_plot ", meta))
deconvoluted=data.table::fread(file.path(out, "out/deconvoluted.txt"), data.table = F)
pvalues=data.table::fread(file.path(out, "out/pvalues.txt"), data.table = F)
significant_means.txt=data.table::fread(file.path(out, "out/significant_means.txt"), data.table = F)
means=data.table::fread(file.path(out, "out/means.txt"), data.table = F)
unlink(countsfile)
unlink(meta)
return(list("deconvoluted"=deconvoluted, "pvalues"=pvalues, "significant_means"=significant_means.txt, "means"=means))
}
# found bug in reference list, was missing, also min genes was 200 not 0 causing errors...
CreateBigSingleRObject.custom=function (counts, annot = NULL, project.name, xy, clusters, N = 10000,
min.genes = 0, technology = "10X", species = "Human", citation = "",
ref.list = list(), normalize.gene.length = F, variable.genes = "de",
fine.tune = T, reduce.file.size = T, do.signatures = F, do.main.types = T,
temp.dir = getwd(), numCores = SingleR.numCores){
n = ncol(counts)
s = seq(1, n, by = N)
dir.create(paste0(temp.dir, "/singler.temp/"), showWarnings = FALSE)
for (i in s) {
print(i)
A = seq(i, min(i + N - 1, n))
singler = CreateSinglerObject(counts[, A], annot = annot[A], ref.list = ref.list,
project.name = project.name, min.genes = min.genes,
technology = technology, species = species, citation = citation,
do.signatures = do.signatures, clusters = NULL, numCores = numCores)
save(singler, file = paste0(temp.dir, "/singler.temp/",
project.name, ".", i, ".RData"))
}
singler.objects.file <- list.files(paste0(temp.dir, "/singler.temp/"),
pattern = "RData", full.names = T)
singler.objects = list()
for (i in 1:length(singler.objects.file)) {
load(singler.objects.file[[i]])
singler.objects[[i]] = singler
}
singler = SingleR.Combine.custom(singler.objects, order = colnames(counts),expr = counts, clusters = clusters, xy = xy)
singler
}
# bug also in this...
SingleR.Combine.custom=function (singler.list, order = NULL, clusters = NULL, expr = NULL, xy = NULL)
{
singler = c()
singler$singler = singler.list[[1]]$singler
for (j in 1:length(singler.list[[1]]$singler)) {
singler$singler[[j]]$SingleR.cluster = c()
singler$singler[[j]]$SingleR.cluster.main = c()
singler$singler[[j]]$SingleR.single$clusters = c()
}
singler$meta.data = singler.list[[1]]$meta.data
singler$meta.data$clusters = c()
singler$meta.data$xy = c()
singler$meta.data$data.sets = rep(singler$meta.data$project.name,
length(singler$meta.data$orig.ident))
for (i in 2:length(singler.list)) {
for (j in 1:length(singler$singler)) {
if (singler.list[[i]]$singler[[j]]$about$RefData !=
singler.list[[1]]$singler[[j]]$about$RefData) {
stop("The objects are not ordered by the same reference data.")
}
singler$singler[[j]]$about$Organism = c(singler$singler[[j]]$about$Organism,
singler.list[[i]]$singler[[j]]$about$Organism)
singler$singler[[j]]$about$Citation = c(singler$singler[[j]]$about$Citation,
singler.list[[i]]$singler[[j]]$about$Citation)
singler$singler[[j]]$about$Technology = c(singler$singler[[j]]$about$Technology,
singler.list[[i]]$singler[[j]]$about$Technology)
singler$singler[[j]]$SingleR.single$labels = rbind(singler$singler[[j]]$SingleR.single$labels,
singler.list[[i]]$singler[[j]]$SingleR.single$labels)
if (!is.null(singler$singler[[j]]$SingleR.single$labels1)) {
singler$singler[[j]]$SingleR.single$labels1 = rbind(singler$singler[[j]]$SingleR.single$labels1,
singler.list[[i]]$singler[[j]]$SingleR.single$labels1)
}
singler$singler[[j]]$SingleR.single$scores = rbind(singler$singler[[j]]$SingleR.single$scores,
singler.list[[i]]$singler[[j]]$SingleR.single$scores)
singler$singler[[j]]$SingleR.single.main$labels = rbind(singler$singler[[j]]$SingleR.single.main$labels,
singler.list[[i]]$singler[[j]]$SingleR.single.main$labels)
if (!is.null(singler$singler[[j]]$SingleR.single.main$labels1)) {
singler$singler[[j]]$SingleR.single.main$labels1 = rbind(singler$singler[[j]]$SingleR.single.main$labels1,
singler.list[[i]]$singler[[j]]$SingleR.single.main$labels1)
}
singler$singler[[j]]$SingleR.single.main$scores = rbind(singler$singler[[j]]$SingleR.single.main$scores,
singler.list[[i]]$singler[[j]]$SingleR.single.main$scores)
singler$singler[[j]]$SingleR.single$cell.names = c(singler$singler[[j]]$SingleR.single$cell.names,
singler.list[[i]]$singler[[j]]$SingleR.single$cell.names)
singler$singler[[j]]$SingleR.single.main$cell.names = c(singler$singler[[j]]$SingleR.single.main$cell.names,
singler.list[[i]]$singler[[j]]$SingleR.single.main$cell.names)
if (!is.null(singler$singler[[j]]$SingleR.single.main$pval)) {
singler$singler[[j]]$SingleR.single.main$pval = c(singler$singler[[j]]$SingleR.single.main$pval,
singler.list[[i]]$singler[[j]]$SingleR.single.main$pval)
}
if (!is.null(singler$singler[[j]]$SingleR.single$pval)) {
singler$singler[[j]]$SingleR.single$pval = c(singler$singler[[j]]$SingleR.single$pval,
singler.list[[i]]$singler[[j]]$SingleR.single$pval)
}
}
singler$meta.data$project.name = paste(singler$meta.data$project.name,
singler.list[[i]]$meta.data$project.name, sep = "+")
singler$meta.data$orig.ident = c(singler$meta.data$orig.ident,
singler.list[[i]]$meta.data$orig.ident)
singler$meta.data$data.sets = c(singler$meta.data$data.sets,
rep(singler.list[[i]]$meta.data$project.name, length(singler.list[[i]]$meta.data$orig.ident)))
}
for (j in 1:length(singler$singler)) {
if (!is.null(order)) {
singler$singler[[j]]$SingleR.single$labels = singler$singler[[j]]$SingleR.single$labels[order,
]
if (!is.null(singler$singler[[j]]$SingleR.single$labels1)) {
singler$singler[[j]]$SingleR.single$labels1 = singler$singler[[j]]$SingleR.single$labels1[order,
]
}
singler$singler[[j]]$SingleR.single$scores = singler$singler[[j]]$SingleR.single$scores[order,
]
singler$singler[[j]]$SingleR.single$cell.names = singler$singler[[j]]$SingleR.single$cell.names[order]
singler$singler[[j]]$SingleR.single.main$labels = singler$singler[[j]]$SingleR.single.main$labels[order,
]
if (!is.null(singler$singler[[j]]$SingleR.single.main$labels1)) {
singler$singler[[j]]$SingleR.single.main$labels1 = singler$singler[[j]]$SingleR.single.main$labels1[order,
]
}
singler$singler[[j]]$SingleR.single.main$scores = singler$singler[[j]]$SingleR.single.main$scores[order,
]
singler$singler[[j]]$SingleR.single.main$cell.names = singler$singler[[j]]$SingleR.single.main$cell.names[order]
if (!is.null(singler$singler[[j]]$SingleR.single$pval)) {
singler$singler[[j]]$SingleR.single$pval = singler$singler[[j]]$SingleR.single$pval[order]
singler$singler[[j]]$SingleR.single.main$pval = singler$singler[[j]]$SingleR.single.main$pval[order]
}
}
}
if (!is.null(clusters) && !is.null(expr)) {
for (j in 1:length(singler$singler)) {
if (is.character(singler$singler[[j]]$about$RefData)) {
ref = get(tolower(singler$singler[[j]]$about$RefData))
}
else {
ref = singler$singler[[j]]$about$RefData
}
singler$singler[[j]]$SingleR.clusters = SingleR("cluster",
expr, ref$data, types = ref$types, clusters = factor(clusters),
sd.thres = ref$sd.thres, genes = "de", fine.tune = T)
singler$singler[[j]]$SingleR.clusters.main = SingleR("cluster",
expr, ref$data, types = ref$main_types, clusters = factor(clusters),
sd.thres = ref$sd.thres, genes = "de", fine.tune = T)
}
singler$meta.data$clusters = clusters
if (!is.null(xy)) {
singler$meta.data$xy = xy
}
}
singler
}