--- a
+++ b/preprocessing/Preprocessing_scRNA_HCA.R
@@ -0,0 +1,269 @@
+# # HumanCellAtlas data
+# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA.h5ad"
+# library(Seurat)
+# hca <- ReadH5AD(file)
+# Idents(hca) <- "louvain"
+# save(hca, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata")
+
+# file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HCA_raw.h5ad"
+# 
+# counts=t(data.table::fread("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/X_allgenes.csv.gz", data.table = F))
+# rownames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/var_allgenes.txt","genes")
+# colnames(counts)=scan("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/data/raw/obs_allgenes.txt","sample")
+# save(counts, file="/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata")
+
+#******************************************* all cells *********************************************
+
+library(Matrix)
+library(Seurat)
+library(dplyr)
+source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
+
+load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA_rawCounts.Rdata")
+load("/research/groups/biowhat_share/public_data/scRNAseq/Human_Cell_Atlas_Preview_Datasets/scanpy/hg19/new_2019/cellranger3/HumanCellAtlas_scRNA.Rdata")
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
+
+name="HumanCellAtlas"
+
+clusters=Idents(hca)
+hca[["rnaClusterID"]] <- clusters
+
+#
+library("HGNChelper")
+update=checkGeneSymbols(rownames(data))
+genelist1=update[,1]
+genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
+rownames(data)=genelist1
+
+# try own processing:
+batch=gsub("_HiSeq_.*.", "", colnames(scmat))
+name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts)))
+
+test1=sc.data.analysis(scmat = counts[rowSums(counts)>1000,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000)
+
+
+# sbatch
+library(Matrix)
+library(Seurat)
+library(dplyr)
+source("functions.scRNA.analysis.R")
+
+load("HumanCellAtlas_scRNA_rawCounts.Rdata")
+load("HumanCellAtlas_scRNA.Rdata")
+
+name="HumanCellAtlas"
+
+clusters=Idents(hca)
+hca[["rnaClusterID"]] <- clusters
+
+#
+library("HGNChelper")
+update=checkGeneSymbols(rownames(data))
+genelist1=update[,1]
+genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
+rownames(data)=genelist1
+
+# try own processing:
+batch=gsub("_HiSeq_.*.", "", colnames(scmat))
+name=paste0(colnames(counts), "-", gsub("_HiSeq_.*.", "", colnames(counts)))
+
+test1=sc.data.analysis(scmat = counts[,name%in%colnames(scmat)], regress.cell.label = batch, batch.correction.method = "MNNcorrect", name="HCA", nr.pcs = 50, check.pcs=F, plot.umap = T, cores = 10, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000)
+
+
+#******************************************* only TNK cells *********************************************
+library(Matrix)
+library(Seurat)
+library(dplyr)
+source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
+
+name="HumanCellAtlas_TNK"
+
+load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
+scmat=scmat[,grepl("Treg|NK|CD", scmat[["SingleR.label"]][,1])]
+
+data=GetAssayData(scmat, assay = "RNA", slot = "counts")
+
+library("HGNChelper")
+update=checkGeneSymbols(rownames(data))
+genelist1=update[,1]
+genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
+rownames(data)=genelist1
+
+# try own processing:
+batch=gsub("_HiSeq_.*.", "", colnames(scmat))
+
+test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
+#************************************************************************************************************
+
+
+#******************************************* only NK cells *********************************************
+library(Matrix)
+library(Seurat)
+library(dplyr)
+source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
+
+name="HumanCellAtlas_NK"
+
+load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
+scmat=scmat[,grepl("NK", scmat[["SingleR.label"]][,1])]
+
+# try own processing:
+batch=gsub("_HiSeq_.*.", "", colnames(scmat))
+
+data=GetAssayData(scmat, assay = "RNA", slot = "counts")
+
+library("HGNChelper")
+update=checkGeneSymbols(rownames(data))
+genelist1=update[,1]
+genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
+rownames(data)=genelist1
+
+test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T,  percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
+#************************************************************************************************************
+
+
+#******************************************* only T cells *********************************************
+library(Matrix)
+library(Seurat)
+library(dplyr)
+source("/research/users/ppolonen/git_home/common_scripts/scRNA/functions.scRNA.analysis.R")
+
+setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/")
+
+name="HumanCellAtlas_Tcells"
+
+load("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/data/HCA_scRNA/myrun/HCA_scRNA.Rdata")
+scmat=scmat[,grepl("Treg|CD", scmat[["SingleR.label"]][,1])]
+
+data=GetAssayData(scmat, assay = "RNA", slot = "counts")
+
+library("HGNChelper")
+update=checkGeneSymbols(rownames(data))
+genelist1=update[,1]
+genelist1[!is.na(update[,3])]=update[!is.na(update[,3]),3]
+rownames(data)=genelist1
+
+# try own processing:
+batch=gsub("_HiSeq_.*.", "", colnames(scmat))
+
+test1=sc.data.analysis(scmat = data, regress.cell.label = batch, batch.correction.method = "MNNcorrect", name=name, nr.pcs = 50, check.pcs=F, plot.umap = T, percent.mitoDNA = 10, nFeature.min = 0, nFeature.max = 6000, cores = 4, resolution = 1)
+#************************************************************************************************************
+
+
+
+# add names and RNA cluster
+clusters=Idents(hca)
+hca[["rnaClusterID"]] <- clusters
+
+# take 50% of cells from each cluster, and preserve proportions
+filt=unlist(lapply(unique(clusters), function(clust){
+  set.seed(1)
+  ind=which(clusters%in%clust)
+  nr.samples=table(clusters)/2
+  sample(colnames(hca)[ind], nr.samples[names(nr.samples)%in%clust])
+}))
+
+scmat=subset(hca, cells=filt)
+
+# old:
+library("SingleR")
+cores=8
+counts <- data.matrix(GetAssayData(scmat, assay = "RNA"))
+
+# singler = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(blueprint_encode), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
+# singler2 = CreateSinglerObject(counts, annot = NULL, clusters=Idents(scmat), ref.list = list(hpca), project.name=name, fine.tune = T,do.signatures = T, numCores=cores)
+
+# save(singler, file=paste0(name, "_singler_object_blueprint_encode.Rdata"))
+# save(singler2, file=paste0(name, "_singler_object_hpca.Rdata"))
+
+load(paste0(name, "_singler_object_blueprint_encode.Rdata"))
+load(paste0(name, "_singler_object_hpca.Rdata"))
+
+# add original identifiers
+singler$meta.data$orig.ident = scmat@meta.data$orig.ident # the original identities, if not supplied in 'annot'
+
+## if using Seurat v3.0 and over use:
+singler$meta.data$xy = scmat@reductions$umap@cell.embeddings # the tSNE coordinates
+singler$meta.data$clusters = scmat@active.ident # the Seurat clusters (if 'clusters' not provided)
+
+# these names are not intuitive, replace them, mentioned that these are actually naive http://comphealth.ucsf.edu/SingleR/SupplementaryInformation2.html:
+singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD4+ T-cells"]="CD4+ Tn"
+singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%"CD8+ T-cells"]="CD8+ Tn"
+singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD4+ T-cells"]="CD4+ Tn"
+singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%"CD8+ T-cells"]="CD8+ Tn"
+
+# change B-cells
+singler$singler[[1]]$SingleR.single$labels[singler$singler[[1]]$SingleR.single$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells"
+singler$singler[[1]]$SingleR.clusters$labels[singler$singler[[1]]$SingleR.clusters$labels%in%c("B-cells","Class-switched memory B-cells", "naive B-cells", "Memory B-cells")]="B-cells"
+
+scmat[["SingleR.label.main"]]=singler$singler[[1]]$SingleR.single.main$labels
+scmat[["SingleR.label"]]=singler$singler[[1]]$SingleR.single$labels
+scmat[["SingleR.label.cluster"]]=paste(singler$singler[[1]]$SingleR.single$labels, Idents(scmat))
+
+# can be added as cluster id
+cluster=singler$singler[[1]]$SingleR.clusters$labels
+cluster.main=singler$singler[[1]]$SingleR.clusters.main$labels
+
+# order based on cell type and add celltype to cluster:
+lineage=c("HSC", "MPP","GMP","CLP","CMP","MEP","Monocytes","DC","Macrophages","Macrophages M1","Macrophages M2","CD4+ Tn","CD4+ T-cells", "CD4+ Tcm", "CD4+ Tem","CD8+ Tn","CD8+ T-cells","CD8+ Tcm","CD8+ Tem","NK cells","Tregs", "Pre-B_cell", "Pro-B_cell", "naive B-cells", "Memory B-cells","Class-switched memory B-cells","Plasma cells","Endothelial cells","Neutrophils","Eosinophils","Fibroblasts","Smooth muscle","Erythrocytes","Megakaryocytes")
+lineage=unique(c(lineage,blueprint_encode$types))
+lineage=lineage[lineage%in%cluster]
+
+identityVector.samples=as.character(Idents(scmat))
+clusters.samples=Idents(scmat)
+
+for(j in seq(cluster)){
+  identityVector.samples[clusters.samples%in%rownames(cluster)[j]]=paste(cluster[j], rownames(cluster)[j])
+}
+
+# order and cluster identity;
+cluster=cluster[order(match(cluster[,1],lineage)),,drop=F]
+cat(paste(levels(Idents(scmat)), collapse=","), paste(cluster, collapse=","), sep="\t\t")   
+
+# order seurat clusters too:
+Idents(scmat)=factor(identityVector.samples, levels=paste(cluster, rownames(cluster)))
+
+r4=DimPlot(object = scmat, group.by = "SingleR.label", reduction = 'umap' , label = TRUE,  pt.size = 0.5, repel = T)  + NoLegend() + NoAxes()
+ggplot2::ggsave(r4, filename = paste0(name, "_UMAP_singleR.pdf"), width = 6, height = 6)
+
+
+save(singler, file=paste0(name, "_singler_object.Rdata"))
+cat("\nSingleR done", sep="\n\n")
+
+# add immunoscores to FM format data matrix
+fm.f <- data.matrix(GetAssayData(scmat, assay = "RNA"))
+
+# add gm based score:
+add.scores=list(HLAIScore=c("B2M", "HLA-A", "HLA-B","HLA-C"), HLAIIScore=c("HLA-DMA","HLA-DMB","HLA-DPA1","HLA-DPB1","HLA-DRA","HLA-DRB1"), CytolyticScore=c("GZMA", "PRF1", "GNLY", "GZMH", "GZMM"))
+
+gm.objects=do.call(rbind, lapply(seq(add.scores), function(i){
+  dat3=fm.f[rownames(fm.f)%in%add.scores[[i]],]
+  gm=t(apply(dat3, 2, gm_mean)) # done to normalized values
+  rownames(gm)=names(add.scores)[i]
+  return(gm)
+}))
+
+# also add to seurat object:
+for(i in seq(add.scores)){
+  scmat[[names(add.scores)[i]]] <- gm.objects[i,]
+}
+
+# gene expression and scores to fm:
+rownames(fm.f)=paste0("N:GEXP:", rownames(fm.f))
+rownames(gm.objects)=paste0("N:SAMP:", rownames(gm.objects))
+fm=rbind(gm.objects, fm.f)
+
+# DE analysis:
+# markers.all=FindAllMarkers(scmat, only.pos = T)#, ...)
+
+save(list=c("fm", "scmat"), file=paste0(name, "_scRNA.Rdata"))
+# save(list=c("fm", "scmat", "markers.all"), file=paste0(name, "_scRNA.Rdata"))
+
+
+r5=DimPlot(object = scmat, reduction = 'umap', label=T)
+ggplot2::ggsave(r5, filename = paste0(name, "_UMAP_clusters.pdf"))
\ No newline at end of file