[8e0848]: / Statistical_analysis_CGA_discovery_Hemap.R

Download this file

194 lines (144 with data), 9.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
GIT_HOME="/research/users/ppolonen/git_home/ImmunogenomicLandscape-BloodCancers/"
source(file.path(GIT_HOME, "common_scripts/visualisation/plotting_functions.R"))
library(parallel)
library(org.Hs.eg.db)
library(seqinr)
library(reshape2)
setwd("/research/groups/sysgen/PROJECTS/HEMAP_IMMUNOLOGY/petri_work/HEMAP_IMMUNOLOGY/Published_data_figures")
# load annotations
annot = get(load("Hemap_immunology_Annotations.Rdata"))
# Genes that are expressed in normal cells:
normal_res=get(load("dufva_mw_hgt_pval_normal_expressed.Rdata"))
OE_cancer=get(load("dufva_mw_hgt_pval_cancer_overexpressed.Rdata"))
# T/NK failed genes
tnk_hgt=get(load("dufva_mw_hgt_pval_all_TNKspecific_genes.Rdata"))
tnk_genes=tnk_hgt[tnk_hgt$name%in%"TNK"&tnk_hgt$adj.pvalue>3&tnk_hgt$FoldChange>1.5,1]
normal_res=normal_res[!normal_res[,1]%in%tnk_genes,]
OE_cancer=OE_cancer[!OE_cancer[,1]%in%tnk_genes,]
# profiles
profile=get(load("mixtureM_profile.Rdata"))
profile[profile==-1] = 0
profile=profile[,colnames(profile)%in%annot$GSM.identifier..sample.]
# filter using hemap genes:
# annotations
load("data9544_with_gene_symbols.RData")
data = t(data[rownames(data)%in%annot$GSM.identifier..sample.,])
# take only high expressed into account
profile[data<5]=0
#*******************************************************************************
#*******************************************************************************
#************************** genes expressed/overexpressed in normal cells**************************
genes_normals=unique(normal_res[normal_res$adj.pvalue>2&normal_res$FoldChange>0&!grepl("_CD34", normal_res$name),1])
#************************** Genes overexpressed in CD34+ cancer **********************
# Genes expressed in CD34 cells:
# filter out StemCell expressed genes:
genes_CD34_2=unique(OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$FoldChange>1.5&grepl("_CD34", OE_cancer$name),])
genes_CD34=genes_CD34_2[!grepl("StemCell|CD8+Tcell|NKCell", genes_CD34_2$fails),]
#************************** Genes overexpressed in cancer **********************
overexpressed_cancer=OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$fails=="",]
# 1st condition, high in cancer (at least 5% patients), but low in normals
rm=unique(normal_res[normal_res$pvalue>=2,1]) # low expression in normals
overexpressed_cancer=overexpressed_cancer[!overexpressed_cancer[,1]%in%rm,]
# 2nd condition, expressed in normals, but highly expressed in cancer
overexpressed_cancer2=OE_cancer[OE_cancer$adj.pvalue>2&OE_cancer$FoldChange_max>=1.25&OE_cancer$fails=="",]
overexpressed_cancer2=overexpressed_cancer2[!overexpressed_cancer2[,1]%in%overexpressed_cancer[,1],]
# 3rd condition, low in both, but expressed in portion of cancer samples, has to be over 5% of the patients
overexpressed_cancer3=OE_cancer[OE_cancer$adj.pvalue>50&OE_cancer$FoldChange_max<1.25&!OE_cancer$fails==""&!OE_cancer[,1]%in%normal_res[normal_res$adj.pvalue>2,1],]
overexpressed_cancer3=overexpressed_cancer3[!(overexpressed_cancer3[,1]%in%overexpressed_cancer[,1]|overexpressed_cancer3[,1]%in%overexpressed_cancer2[,1]),]
#*******************************************************************************
#*******************************************************************************
#*******************************************************************************
rm=unique(normal_res[normal_res$pvalue>=3,1]) # not expressed in normal sample category
allnormals=rownames(profile)[rowSums(profile[,annot$MAINCLASS%in%"NonCancerHealthy"]==1)>200] # might be missed from hgt test if expressed in lot of the samples
expressed_cancer=unique(OE_cancer[OE_cancer$adj.pvalue>2,1])
expressed_cancer=expressed_cancer[!(expressed_cancer%in%rm|expressed_cancer%in%allnormals)]
#****************** Genes that are expressed in over 5% of the pure cancer samples ******************
# CD34 positive, also add for the test
Sys.setlocale('LC_ALL','C')
cd34=get.logical(annovector = list(annot$colorClass), filterv = grepl("CD34", annot$Sample.isolation, ignore.case = T), PREFIX = "CD34")
cd34=cd34[!names(cd34)%in%c("Erythroid_CD34", "StemCell_CD34")]
# can use here all samples
subclass=get.logical(annovector = list(annot$subclasses), filterv = annot$Sample.type%in%c("Prolif", "Cancer"))
tbly=get.logical(annovector = list(annot$tbLY), filterv = annot$Sample.type%in%c("Prolif", "Cancer"))
tbly=tbly[names(tbly)%in%c("Lymphoma_BCL_CHL ", "Lymphoma_BCL_MCL", "Lymphoma_TCL_PTCLNOS", "Lymphoma_BCL_FL", "Lymphoma_BCL_MALT", "Lymphoma_TCL_AILT", "Lymphoma_TCL_ALCL", "Lymphoma_BCL_DLBCL_PMBL", "Lymphoma_TCL_CTCL_MF")]
list_cancers=lapply(c(subclass, tbly, cd34), list)
list_cancers=unlist(list_cancers, recursive=F)
list_cancers=list_cancers[sapply(list_cancers, sum)>20]
# also must be expressed in at least 5% of the patients, high enough
FUN_TEST <- function(gene, logicalVectors) {
D = as.numeric(profile[rownames(profile)%in%gene,])
test=sapply(logicalVectors, function(v){
dd=D[v]
res=(sum(dd[dd==1])/length(dd))
})
answ=data.frame(gene, "percentage"=as.numeric(test), "name"=names(test), stringsAsFactors = F)
return(answ)
}
# filter out genes, expressed in any cancer over 5%
number_expressed=do.call(rbind, mclapply(rownames(profile), FUN_TEST, list_cancers, mc.cores=4))
expressed_cancer2=unique(number_expressed[as.numeric(number_expressed$percentage)>=0.05,1])
expressed_cancer3=unique(number_expressed[as.numeric(number_expressed$percentage)>=0.10,1])
#************************ read GTEX database data ************************
gtex=read.delim("GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_median_rpkm.gct", header=T, stringsAsFactors = F, skip=2)
# process matrix
n=gtex[!duplicated(gtex[,2]),2]
gtex=gtex[!duplicated(gtex[,2]),3:dim(gtex)[2]]
rownames(gtex)=n
gtex=gtex[rownames(gtex)%in%rownames(profile),]
# test where the genes in hemap are expressed in GTEX
d=gtex[,colnames(gtex)%in%"Testis", drop=F]
d2=gtex[,!colnames(gtex)%in%c("Testis", "Ovary"), drop=F]
d=d[as.numeric(d[,1])>2,,drop=F]
filt=rownames(d2)[rowSums(d2>0.25)==0]
expressed_testis=rownames(d)[rownames(d)%in%filt]
# test where the genes in hemap are expressed in GTEX
d3=gtex
filt=rownames(d3)[rowSums(d3>0.25)==0]
gtex_notexpressed=rownames(d3)[rownames(d3)%in%filt]
#****************************************************************
# genelists to compare
genes_normals # genes expressed in normal cells, filter out, unless overexpressed in cancer
expressed_testis # germline genes
genes_CD34 # these are good candidates
expressed_cancer # these have to be expressed in cancer, not expressed in normals
expressed_cancer2 # expressed quite highly in at least 5%
expressed_cancer3 # expressed quite highly in at least 10%
overexpressed_cancer # low normal expressed genes, must be over 10%
overexpressed_cancer2 # overexpressed genes
overexpressed_cancer3 # expressed in a portion of cancer cells
gtex_notexpressed # not expressed in GTEX
# remove genes that are overexpressed from normals
genes_normals=genes_normals[!genes_normals%in%overexpressed_cancer|genes_normals%in%overexpressed_cancer2]
# remove normals from testis list
expressed_testis=expressed_testis[!expressed_testis%in%genes_normals]
# expressed_cancer is only interesting, if they are testis antigens
expressed_cancer_testis=expressed_cancer[expressed_cancer%in%expressed_testis&expressed_cancer%in%expressed_cancer2]
testis_df=number_expressed[number_expressed[,1]%in%expressed_cancer_testis&number_expressed[,2]>0.05,]
# overexpressed highly, or not expressed in normals and in 10% patients
overexpressed_cancer=overexpressed_cancer[overexpressed_cancer%in%expressed_cancer3,]
# These genes are expresed anywhere else
cancer_overexpressed_notGTEX=rbind(overexpressed_cancer[overexpressed_cancer[,1]%in%gtex_notexpressed,], overexpressed_cancer2[overexpressed_cancer2[,1]%in%gtex_notexpressed,])
geneLS=list(expressed_testis, genes_CD34[,1], expressed_cancer_testis, overexpressed_cancer[,1], overexpressed_cancer2[,1], cancer_overexpressed_notGTEX[,1])
geneLS2=list(expressed_testis, genes_CD34, expressed_cancer_testis, overexpressed_cancer)
# We can rename our list vectors
names(geneLS) <- c("expressed_testis", "genes_CD34", "expressed_cancer_testis", "overexpressed_cancer","overexpressed_cancer2", "cancer_overexpressed_notGTEX")
names(geneLS2) <- c("expressed_testis", "genes_CD34", "expressed_cancer_testis", "overexpressed_cancer")
a.df=rbind(overexpressed_cancer, overexpressed_cancer2, cancer_overexpressed_notGTEX, genes_CD34)
# remove linc and AS RNA
a.df=a.df[!grepl("LINC|-AS", a.df[,1]),]
testis_df=testis_df[!grepl("LINC|-AS", testis_df[,1]),]
# CCLE filter if gene is not expressed:
ccle=get(load("CCLE_RNASEQ_SYMBOL_RPKM.Rdata"))
# CCLE annot:
annot=read.delim("CCLE_sample_info_file_2012-10-18.txt", header=T, stringsAsFactors = F)
find=intersect(annot$CCLE.name, colnames(ccle))
ccle=ccle[,match(find, colnames(ccle))]
annot=annot[match(find, annot$CCLE.name),]
ccle=ccle[,grepl("HAEMATOPOIETIC_AND_LYMPHOID_TISSUE", colnames(ccle))]
ccle=ccle[rownames(ccle)%in%testis_df[,1],]
# at least 6 cell lines express over 1 rpkm
keep <- rowSums(ccle>0.5)>=5
unique(testis_df[,1])[!unique(testis_df[,1])%in%unique(testis_df[testis_df[,1]%in%rownames(ccle)[keep],1])]
testis_df=testis_df[testis_df[,1]%in%rownames(ccle)[keep],]
write.table(a.df, "antigen_df.txt", quote = F, row.names = F, col.names = T, sep="\t")
write.table(testis_df, "t.antigen_df.txt", quote = F, row.names = F, col.names = T, sep="\t")