--- a +++ b/Statistical_analysis_CGA_discovery_Hemap.R @@ -0,0 +1,194 @@ +GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/" +source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R")) + +library(parallel) +library(org.Hs.eg.db) +library(seqinr) +library(reshape2) + +setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures") + +# load annotations +annot = get(load("Hemap_immunology_Annotations.Rdata")) + +# Genes that are expressed in normal cells: +normal_res=get(load("dufva_mw_hgt_pval_normal_expressed.Rdata")) +OE_cancer=get(load("dufva_mw_hgt_pval_cancer_overexpressed.Rdata")) + +# T/NK failed genes +tnk_hgt=get(load("dufva_mw_hgt_pval_all_TNKspecific_genes.Rdata")) +tnk_genes=tnk_hgt[tnk_hgt$name%in%"TNK"&tnk_hgt$adj.pvalue>3&tnk_hgt$FoldChange>1.5,1] + +normal_res=normal_res[!normal_res[,1]%in%tnk_genes,] +OE_cancer=OE_cancer[!OE_cancer[,1]%in%tnk_genes,] + +# profiles +profile=get(load("mixtureM_profile.Rdata")) +profile[profile==-1] = 0 +profile=profile[,colnames(profile)%in%annot$GSM.identifier..sample.] + +# filter using hemap genes: +# annotations +load("data9544_with_gene_symbols.RData") +data = t(data[rownames(data)%in%annot$GSM.identifier..sample.,]) + +# take only high expressed into account +profile[data<5]=0 + +#******************************************************************************* +#******************************************************************************* +#************************** genes expressed/overexpressed in normal cells************************** +genes_normals=unique(normal_res[normal_res$adj.pvalue>2&normal_res$FoldChange>0&!grepl("_CD34", normal_res$name),1]) + +#************************** Genes overexpressed in CD34+ cancer ********************** +# Genes expressed in CD34 cells: +# filter out StemCell expressed genes: +genes_CD34_2=unique(OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$FoldChange>1.5&grepl("_CD34", OE_cancer$name),]) +genes_CD34=genes_CD34_2[!grepl("StemCell|CD8+Tcell|NKCell", genes_CD34_2$fails),] + +#************************** Genes overexpressed in cancer ********************** +overexpressed_cancer=OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$fails=="",] + +# 1st condition, high in cancer (at least 5% patients), but low in normals +rm=unique(normal_res[normal_res$pvalue>=2,1]) # low expression in normals +overexpressed_cancer=overexpressed_cancer[!overexpressed_cancer[,1]%in%rm,] + +# 2nd condition, expressed in normals, but highly expressed in cancer +overexpressed_cancer2=OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$FoldChange_max>=1.25&OE_cancer$fails=="",] +overexpressed_cancer2=overexpressed_cancer2[!overexpressed_cancer2[,1]%in%overexpressed_cancer[,1],] + +# 3rd condition, low in both, but expressed in portion of cancer samples, has to be over 5% of the patients +overexpressed_cancer3=OE_cancer[OE_cancer$adj.pvalue>50&OE_cancer$FoldChange_max<1.25&!OE_cancer$fails==""&!OE_cancer[,1]%in%normal_res[normal_res$adj.pvalue>2,1],] +overexpressed_cancer3=overexpressed_cancer3[!(overexpressed_cancer3[,1]%in%overexpressed_cancer[,1]|overexpressed_cancer3[,1]%in%overexpressed_cancer2[,1]),] + +#******************************************************************************* +#******************************************************************************* +#******************************************************************************* +rm=unique(normal_res[normal_res$pvalue>=3,1]) # not expressed in normal sample category +allnormals=rownames(profile)[rowSums(profile[,annot$MAINCLASS%in%"NonCancerHealthy"]==1)>200] # might be missed from hgt test if expressed in lot of the samples + +expressed_cancer=unique(OE_cancer[OE_cancer$adj.pvalue>2,1]) +expressed_cancer=expressed_cancer[!(expressed_cancer%in%rm|expressed_cancer%in%allnormals)] + +#****************** Genes that are expressed in over 5% of the pure cancer samples ****************** +# CD34 positive, also add for the test +Sys.setlocale('LC_ALL','C') +cd34=get.logical(annovector = list(annot$colorClass), filterv = grepl("CD34", annot$Sample.isolation, ignore.case = T), PREFIX = "CD34") +cd34=cd34[!names(cd34)%in%c("Erythroid_CD34", "StemCell_CD34")] + +# can use here all samples +subclass=get.logical(annovector = list(annot$subclasses), filterv = annot$Sample.type%in%c("Prolif", "Cancer")) +tbly=get.logical(annovector = list(annot$tbLY), filterv = annot$Sample.type%in%c("Prolif", "Cancer")) +tbly=tbly[names(tbly)%in%c("Lymphoma_BCL_CHL ", "Lymphoma_BCL_MCL", "Lymphoma_TCL_PTCLNOS", "Lymphoma_BCL_FL", "Lymphoma_BCL_MALT", "Lymphoma_TCL_AILT", "Lymphoma_TCL_ALCL", "Lymphoma_BCL_DLBCL_PMBL", "Lymphoma_TCL_CTCL_MF")] + +list_cancers=lapply(c(subclass, tbly, cd34), list) +list_cancers=unlist(list_cancers, recursive=F) +list_cancers=list_cancers[sapply(list_cancers, sum)>20] + +# also must be expressed in at least 5% of the patients, high enough +FUN_TEST <- function(gene, logicalVectors) { + + D = as.numeric(profile[rownames(profile)%in%gene,]) + + test=sapply(logicalVectors, function(v){ + dd=D[v] + res=(sum(dd[dd==1])/length(dd)) + }) + answ=data.frame(gene, "percentage"=as.numeric(test), "name"=names(test), stringsAsFactors = F) + + return(answ) +} +# filter out genes, expressed in any cancer over 5% +number_expressed=do.call(rbind, mclapply(rownames(profile), FUN_TEST, list_cancers, mc.cores=4)) + +expressed_cancer2=unique(number_expressed[as.numeric(number_expressed$percentage)>=0.05,1]) +expressed_cancer3=unique(number_expressed[as.numeric(number_expressed$percentage)>=0.10,1]) + + +#************************ read GTEX database data ************************ +gtex=read.delim("GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_median_rpkm.gct", header=T, stringsAsFactors = F, skip=2) + +# process matrix +n=gtex[!duplicated(gtex[,2]),2] +gtex=gtex[!duplicated(gtex[,2]),3:dim(gtex)[2]] +rownames(gtex)=n + +gtex=gtex[rownames(gtex)%in%rownames(profile),] + +# test where the genes in hemap are expressed in GTEX +d=gtex[,colnames(gtex)%in%"Testis", drop=F] +d2=gtex[,!colnames(gtex)%in%c("Testis", "Ovary"), drop=F] + +d=d[as.numeric(d[,1])>2,,drop=F] +filt=rownames(d2)[rowSums(d2>0.25)==0] +expressed_testis=rownames(d)[rownames(d)%in%filt] + +# test where the genes in hemap are expressed in GTEX +d3=gtex +filt=rownames(d3)[rowSums(d3>0.25)==0] +gtex_notexpressed=rownames(d3)[rownames(d3)%in%filt] + + +#**************************************************************** + +# genelists to compare +genes_normals # genes expressed in normal cells, filter out, unless overexpressed in cancer +expressed_testis # germline genes +genes_CD34 # these are good candidates +expressed_cancer # these have to be expressed in cancer, not expressed in normals +expressed_cancer2 # expressed quite highly in at least 5% +expressed_cancer3 # expressed quite highly in at least 10% +overexpressed_cancer # low normal expressed genes, must be over 10% +overexpressed_cancer2 # overexpressed genes +overexpressed_cancer3 # expressed in a portion of cancer cells +gtex_notexpressed # not expressed in GTEX + +# remove genes that are overexpressed from normals +genes_normals=genes_normals[!genes_normals%in%overexpressed_cancer|genes_normals%in%overexpressed_cancer2] + +# remove normals from testis list +expressed_testis=expressed_testis[!expressed_testis%in%genes_normals] + +# expressed_cancer is only interesting, if they are testis antigens +expressed_cancer_testis=expressed_cancer[expressed_cancer%in%expressed_testis&expressed_cancer%in%expressed_cancer2] + +testis_df=number_expressed[number_expressed[,1]%in%expressed_cancer_testis&number_expressed[,2]>0.05,] + +# overexpressed highly, or not expressed in normals and in 10% patients +overexpressed_cancer=overexpressed_cancer[overexpressed_cancer%in%expressed_cancer3,] + +# These genes are expresed anywhere else +cancer_overexpressed_notGTEX=rbind(overexpressed_cancer[overexpressed_cancer[,1]%in%gtex_notexpressed,], overexpressed_cancer2[overexpressed_cancer2[,1]%in%gtex_notexpressed,]) + +geneLS=list(expressed_testis, genes_CD34[,1], expressed_cancer_testis, overexpressed_cancer[,1], overexpressed_cancer2[,1], cancer_overexpressed_notGTEX[,1]) +geneLS2=list(expressed_testis, genes_CD34, expressed_cancer_testis, overexpressed_cancer) + +# We can rename our list vectors +names(geneLS) <- c("expressed_testis", "genes_CD34", "expressed_cancer_testis", "overexpressed_cancer","overexpressed_cancer2", "cancer_overexpressed_notGTEX") +names(geneLS2) <- c("expressed_testis", "genes_CD34", "expressed_cancer_testis", "overexpressed_cancer") + +a.df=rbind(overexpressed_cancer, overexpressed_cancer2, cancer_overexpressed_notGTEX, genes_CD34) + +# remove linc and AS RNA +a.df=a.df[!grepl("LINC|-AS", a.df[,1]),] +testis_df=testis_df[!grepl("LINC|-AS", testis_df[,1]),] + +# CCLE filter if gene is not expressed: +ccle=get(load("CCLE_RNASEQ_SYMBOL_RPKM.Rdata")) + +# CCLE annot: +annot=read.delim("CCLE_sample_info_file_2012-10-18.txt", header=T, stringsAsFactors = F) +find=intersect(annot$CCLE.name, colnames(ccle)) +ccle=ccle[,match(find, colnames(ccle))] +annot=annot[match(find, annot$CCLE.name),] +ccle=ccle[,grepl("HAEMATOPOIETIC_AND_LYMPHOID_TISSUE", colnames(ccle))] + +ccle=ccle[rownames(ccle)%in%testis_df[,1],] + +# at least 6 cell lines express over 1 rpkm +keep <- rowSums(ccle>0.5)>=5 +unique(testis_df[,1])[!unique(testis_df[,1])%in%unique(testis_df[testis_df[,1]%in%rownames(ccle)[keep],1])] +testis_df=testis_df[testis_df[,1]%in%rownames(ccle)[keep],] + +write.table(a.df, "antigen_df.txt", quote = F, row.names = F, col.names = T, sep="\t") +write.table(testis_df, "t.antigen_df.txt", quote = F, row.names = F, col.names = T, sep="\t") \ No newline at end of file