|
a |
|
b/Analysis_4.r |
|
|
1 |
# tissue recognition analysis |
|
|
2 |
# protein data from 12-09-17 |
|
|
3 |
library(xlsx) |
|
|
4 |
library(randomForest) |
|
|
5 |
library(bmrm) |
|
|
6 |
library(caret) |
|
|
7 |
library(dplyr) |
|
|
8 |
library(data.table) |
|
|
9 |
library(Boruta) |
|
|
10 |
|
|
|
11 |
#============= |
|
|
12 |
# runs RF for tissue recognition with 10-fold CV. |
|
|
13 |
# use predefined iterations and training-test assignments |
|
|
14 |
# Tissue is in sampleType column of the dat object |
|
|
15 |
# keep cancers with more than 150 as is, for those that are less, repeat them several times and sample the rest |
|
|
16 |
runRF_balancedSample = function(dat, fold, BalanceSampleCount = 150) |
|
|
17 |
{ |
|
|
18 |
require(bmrm) |
|
|
19 |
require(randomForest) |
|
|
20 |
require(caret) |
|
|
21 |
require(dplyr) |
|
|
22 |
# NIter = unique(iterFold$iteration) |
|
|
23 |
#print(head(dat)) |
|
|
24 |
AllpredictionsNIter_tissue = c() |
|
|
25 |
MyIndices <- fold |
|
|
26 |
nTissue = nlevels(as.factor(dat$sampleType)) |
|
|
27 |
Allpredictions = vector(length = length(MyIndices)) |
|
|
28 |
Allpredictions_votes = matrix(nrow = length(MyIndices),ncol = nTissue) |
|
|
29 |
|
|
|
30 |
for( i in unique(MyIndices)){ |
|
|
31 |
trainSamples <- which(MyIndices!=i) |
|
|
32 |
testSamples <- which(MyIndices==i) |
|
|
33 |
#print(table(dat$sampleType[trainSamples])) |
|
|
34 |
trainSamplesResampled <- tapply(trainSamples, |
|
|
35 |
dat$sampleType[trainSamples], |
|
|
36 |
FUN = function(x) |
|
|
37 |
if (length(x)>BalanceSampleCount)x else { |
|
|
38 |
s = rep(x, floor(BalanceSampleCount/length(x))); |
|
|
39 |
return(c(s,sample(x,size = BalanceSampleCount-length(s))))}) %>% |
|
|
40 |
unlist() |
|
|
41 |
|
|
|
42 |
ThisRF <- randomForest(as.factor(sampleType)~., |
|
|
43 |
data =dat[trainSamplesResampled,], |
|
|
44 |
cutoff = rep(1/nTissue,nTissue)) |
|
|
45 |
|
|
|
46 |
Allpredictions[testSamples] <- predict(ThisRF,dat[testSamples,]) %>% as.character() |
|
|
47 |
|
|
|
48 |
Allpredictions_votes[testSamples,] <- predict(ThisRF,dat[testSamples,], type = 'vote') |
|
|
49 |
|
|
|
50 |
} |
|
|
51 |
names(Allpredictions) = rownames(Allpredictions_votes)=rownames(dat) |
|
|
52 |
colnames(Allpredictions_votes) = levels(as.factor(dat$sampleType)) |
|
|
53 |
conf = confusionMatrix(Allpredictions,reference = dat$sampleType) |
|
|
54 |
AllpredictionsNIter_tissue <- list(prediction = Allpredictions, |
|
|
55 |
confusion = conf, votes = Allpredictions_votes) |
|
|
56 |
print(conf$overall[1]) |
|
|
57 |
return(AllpredictionsNIter_tissue) |
|
|
58 |
} |
|
|
59 |
|
|
|
60 |
# extracts positive samples and folds for an iteration and runs RF for this iteration |
|
|
61 |
runOneIter = function(obj, colNames = colnames(obj), addInfo = NULL, tumSamp, sampType) |
|
|
62 |
{ |
|
|
63 |
# positive samples only |
|
|
64 |
bahmanSamp = rownames(obj)[which(obj$LogitCall)] # |
|
|
65 |
s = intersect(tumSamp,rownames(obj)[which(obj$LogitCall)]) # |
|
|
66 |
print(length(s)) |
|
|
67 |
if (!is.null(addInfo)) prot = data.frame(sampleType = sampType[s], obj[s,colNames],addInfo[s,]) else |
|
|
68 |
prot = cbind(sampleType = obj[s,'Tissue'],obj[s,colNames]) |
|
|
69 |
return(runRF_balancedSample(prot, fold = obj[s,'fold'])) |
|
|
70 |
} |
|
|
71 |
|
|
|
72 |
#=================================================== |
|
|
73 |
# loading results from cancer detection |
|
|
74 |
load('Results/ForLuda8proteins.rda') |
|
|
75 |
|
|
|
76 |
#============================== |
|
|
77 |
# read in protein data |
|
|
78 |
protData = read.xlsx2(file = 'MEGA Protein Dataset for Random Forest, December 9, 2017.xlsx', sheetIndex = 1, row.names = 1, check.names = F, stringsAsFactors = F) |
|
|
79 |
|
|
|
80 |
# remove Vitronectin, LRG-1, Stage, Sample Type |
|
|
81 |
protOnly = protData %>% select(-`Vitronectin`, -`LRG-1`, -`Stage`, -`Sample Type`) |
|
|
82 |
|
|
|
83 |
protOnly = sapply(protOnly, as.numeric) |
|
|
84 |
#protOnly = t(do.call('rbind',protOnly)) |
|
|
85 |
rownames(protOnly) = rownames(protData) |
|
|
86 |
colnames(protOnly) = gsub('-','',colnames(protOnly)) |
|
|
87 |
colnames(protOnly) = gsub(' ','',colnames(protOnly)) |
|
|
88 |
protOnly[is.na(protOnly)] = 0 |
|
|
89 |
|
|
|
90 |
#================================== |
|
|
91 |
# create tumor type object |
|
|
92 |
sampleType = as.character(protData$`Sample Type`) |
|
|
93 |
# combine esophagus and stomach cancers to one group |
|
|
94 |
sampleType[which(sampleType%in%c('Esophagus','Stomach'))] = 'st_eso' |
|
|
95 |
names(sampleType) = rownames(protData) |
|
|
96 |
|
|
|
97 |
# IDs of tumor and normal samples |
|
|
98 |
tumSamp = names(sampleType)[which(sampleType != 'Normal')] # 1005 |
|
|
99 |
normSamp = names(sampleType)[which(sampleType == 'Normal')] # 812 |
|
|
100 |
|
|
|
101 |
#==================================== |
|
|
102 |
# patient info |
|
|
103 |
sampInfo <- read.xlsx2("aar3247_Suppl. Excel_seq3_v1.xlsx",sheetIndex = 4, |
|
|
104 |
stringsAsFactors = F, startRow = 3, endRow = 1820) |
|
|
105 |
# gender |
|
|
106 |
gender = sampInfo[c("Plasma.sample.ID..","Sex")] |
|
|
107 |
gender = gender[!duplicated(gender[,1]),] |
|
|
108 |
n = gender[,1] |
|
|
109 |
gender = gender[,2] |
|
|
110 |
names(gender) = n |
|
|
111 |
gender[which(gender %in% c('Female','female'))] = 'F' |
|
|
112 |
gender[which(gender %in% c('Male','male'))] = 'M' |
|
|
113 |
gender[which(gender %in% c('unknown',''))] = NA |
|
|
114 |
|
|
|
115 |
#============================= |
|
|
116 |
# add matrix with omegas per gene per patient. |
|
|
117 |
#============================= |
|
|
118 |
# make 10 matrices from Analysis_1 |
|
|
119 |
omegaPerPatient = omegaPerPatientK13 = vector(mode = 'list',length = 10) |
|
|
120 |
s = c(tumSamp, normSamp) |
|
|
121 |
for(i in 1:10) |
|
|
122 |
{ |
|
|
123 |
load(paste0('allResults_fromLuMethod_',i,'_20171209.rda')) |
|
|
124 |
perPat = split(allResults,allResults$Template) |
|
|
125 |
perPat = lapply(perPat, function(x) x[which(!x[,'blacklist']),c('Gene','omega')]) |
|
|
126 |
# get maximum omega for each gene |
|
|
127 |
perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T)) |
|
|
128 |
genes = unique(allResults$Gene) |
|
|
129 |
s1 = union(s,names(perPatMaxOmega)) |
|
|
130 |
omegaPerPatient[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes)) |
|
|
131 |
for(j in names(perPatMaxOmega)) omegaPerPatient[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]] |
|
|
132 |
|
|
|
133 |
# separate KRAS codon 13 |
|
|
134 |
resultKRAS13 = allResults %>% filter(Gene == 'KRAS', codonStart == 13) |
|
|
135 |
resultNoK13 = allResults %>% filter(Gene != 'KRAS' | codonStart != 13) |
|
|
136 |
resultKRAS13[,'Gene'] = 'KRAS13' |
|
|
137 |
allResults = rbind(resultNoK13,resultKRAS13) |
|
|
138 |
# get maximum omega for each gene |
|
|
139 |
perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T)) |
|
|
140 |
genes = unique(allResults$Gene) |
|
|
141 |
s1 = union(s,names(perPatMaxOmega)) |
|
|
142 |
omegaPerPatientK13[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes)) |
|
|
143 |
for(j in names(perPatMaxOmega)) omegaPerPatientK13[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]] |
|
|
144 |
} |
|
|
145 |
|
|
|
146 |
#=========================== |
|
|
147 |
# run all proteins, gender and mutations |
|
|
148 |
set.seed(12485) |
|
|
149 |
res8 = vector(length = 10) |
|
|
150 |
for(i in 1:10) |
|
|
151 |
{ |
|
|
152 |
res8[i] = list(runOneIter(cbind(protOnly,ForLuda[[i]][rownames(protOnly),c('LogitCall','fold')]), |
|
|
153 |
colNames = 1:39, |
|
|
154 |
addInfo = data.frame(gender = gender[rownames(protOnly)],omegaPerPatient[[i]][rownames(protOnly),]), |
|
|
155 |
tumSamp = tumSamp, sampType = sampleType)) |
|
|
156 |
} |
|
|
157 |
|
|
|
158 |
# save results for the paper |
|
|
159 |
tab = sapply(res8, FUN = function(x) c(x$confusion$overall[1], x$confusion$byClass[,1])) |
|
|
160 |
tab = cbind(tab,mean = apply(tab,1,mean)) |
|
|
161 |
# mean accuracy 0.6968313 |
|
|
162 |
xlsFile = paste('Results/tissueRecog_results_',Sys.Date(),'.xlsx') |
|
|
163 |
write.xlsx(tab, file = xlsFile, sheetName = 'prot + mut') |
|
|
164 |
# confusion matrix with fractions |
|
|
165 |
conf = res8[[4]]$confusion$table |
|
|
166 |
write.csv(sweep(conf, 2, apply(conf,2,sum),'/'), file = paste('Results/tissueRecog_results_',Sys.Date(),'.csv'), quote = F) |
|
|
167 |
# per samples votes |
|
|
168 |
tab = cbind(res8[[4]]$votes, prediction = res8[[4]]$prediction, tissue = sampleType[names(res8[[4]]$prediction)]) |
|
|
169 |
write.xlsx(tab, file = xlsFile, sheetName = 'Iteration 4. Votes', append = T) |
|
|
170 |
|