[b8d756]: / Analysis_4.r

Download this file

171 lines (151 with data), 7.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# tissue recognition analysis
# protein data from 12-09-17
library(xlsx)
library(randomForest)
library(bmrm)
library(caret)
library(dplyr)
library(data.table)
library(Boruta)
#=============
# runs RF for tissue recognition with 10-fold CV.
# use predefined iterations and training-test assignments
# Tissue is in sampleType column of the dat object
# keep cancers with more than 150 as is, for those that are less, repeat them several times and sample the rest
runRF_balancedSample = function(dat, fold, BalanceSampleCount = 150)
{
require(bmrm)
require(randomForest)
require(caret)
require(dplyr)
# NIter = unique(iterFold$iteration)
#print(head(dat))
AllpredictionsNIter_tissue = c()
MyIndices <- fold
nTissue = nlevels(as.factor(dat$sampleType))
Allpredictions = vector(length = length(MyIndices))
Allpredictions_votes = matrix(nrow = length(MyIndices),ncol = nTissue)
for( i in unique(MyIndices)){
trainSamples <- which(MyIndices!=i)
testSamples <- which(MyIndices==i)
#print(table(dat$sampleType[trainSamples]))
trainSamplesResampled <- tapply(trainSamples,
dat$sampleType[trainSamples],
FUN = function(x)
if (length(x)>BalanceSampleCount)x else {
s = rep(x, floor(BalanceSampleCount/length(x)));
return(c(s,sample(x,size = BalanceSampleCount-length(s))))}) %>%
unlist()
ThisRF <- randomForest(as.factor(sampleType)~.,
data =dat[trainSamplesResampled,],
cutoff = rep(1/nTissue,nTissue))
Allpredictions[testSamples] <- predict(ThisRF,dat[testSamples,]) %>% as.character()
Allpredictions_votes[testSamples,] <- predict(ThisRF,dat[testSamples,], type = 'vote')
}
names(Allpredictions) = rownames(Allpredictions_votes)=rownames(dat)
colnames(Allpredictions_votes) = levels(as.factor(dat$sampleType))
conf = confusionMatrix(Allpredictions,reference = dat$sampleType)
AllpredictionsNIter_tissue <- list(prediction = Allpredictions,
confusion = conf, votes = Allpredictions_votes)
print(conf$overall[1])
return(AllpredictionsNIter_tissue)
}
# extracts positive samples and folds for an iteration and runs RF for this iteration
runOneIter = function(obj, colNames = colnames(obj), addInfo = NULL, tumSamp, sampType)
{
# positive samples only
bahmanSamp = rownames(obj)[which(obj$LogitCall)] #
s = intersect(tumSamp,rownames(obj)[which(obj$LogitCall)]) #
print(length(s))
if (!is.null(addInfo)) prot = data.frame(sampleType = sampType[s], obj[s,colNames],addInfo[s,]) else
prot = cbind(sampleType = obj[s,'Tissue'],obj[s,colNames])
return(runRF_balancedSample(prot, fold = obj[s,'fold']))
}
#===================================================
# loading results from cancer detection
load('Results/ForLuda8proteins.rda')
#==============================
# read in protein data
protData = read.xlsx2(file = 'MEGA Protein Dataset for Random Forest, December 9, 2017.xlsx', sheetIndex = 1, row.names = 1, check.names = F, stringsAsFactors = F)
# remove Vitronectin, LRG-1, Stage, Sample Type
protOnly = protData %>% select(-`Vitronectin`, -`LRG-1`, -`Stage`, -`Sample Type`)
protOnly = sapply(protOnly, as.numeric)
#protOnly = t(do.call('rbind',protOnly))
rownames(protOnly) = rownames(protData)
colnames(protOnly) = gsub('-','',colnames(protOnly))
colnames(protOnly) = gsub(' ','',colnames(protOnly))
protOnly[is.na(protOnly)] = 0
#==================================
# create tumor type object
sampleType = as.character(protData$`Sample Type`)
# combine esophagus and stomach cancers to one group
sampleType[which(sampleType%in%c('Esophagus','Stomach'))] = 'st_eso'
names(sampleType) = rownames(protData)
# IDs of tumor and normal samples
tumSamp = names(sampleType)[which(sampleType != 'Normal')] # 1005
normSamp = names(sampleType)[which(sampleType == 'Normal')] # 812
#====================================
# patient info
sampInfo <- read.xlsx2("aar3247_Suppl. Excel_seq3_v1.xlsx",sheetIndex = 4,
stringsAsFactors = F, startRow = 3, endRow = 1820)
# gender
gender = sampInfo[c("Plasma.sample.ID..","Sex")]
gender = gender[!duplicated(gender[,1]),]
n = gender[,1]
gender = gender[,2]
names(gender) = n
gender[which(gender %in% c('Female','female'))] = 'F'
gender[which(gender %in% c('Male','male'))] = 'M'
gender[which(gender %in% c('unknown',''))] = NA
#=============================
# add matrix with omegas per gene per patient.
#=============================
# make 10 matrices from Analysis_1
omegaPerPatient = omegaPerPatientK13 = vector(mode = 'list',length = 10)
s = c(tumSamp, normSamp)
for(i in 1:10)
{
load(paste0('allResults_fromLuMethod_',i,'_20171209.rda'))
perPat = split(allResults,allResults$Template)
perPat = lapply(perPat, function(x) x[which(!x[,'blacklist']),c('Gene','omega')])
# get maximum omega for each gene
perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T))
genes = unique(allResults$Gene)
s1 = union(s,names(perPatMaxOmega))
omegaPerPatient[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes))
for(j in names(perPatMaxOmega)) omegaPerPatient[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]]
# separate KRAS codon 13
resultKRAS13 = allResults %>% filter(Gene == 'KRAS', codonStart == 13)
resultNoK13 = allResults %>% filter(Gene != 'KRAS' | codonStart != 13)
resultKRAS13[,'Gene'] = 'KRAS13'
allResults = rbind(resultNoK13,resultKRAS13)
# get maximum omega for each gene
perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T))
genes = unique(allResults$Gene)
s1 = union(s,names(perPatMaxOmega))
omegaPerPatientK13[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes))
for(j in names(perPatMaxOmega)) omegaPerPatientK13[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]]
}
#===========================
# run all proteins, gender and mutations
set.seed(12485)
res8 = vector(length = 10)
for(i in 1:10)
{
res8[i] = list(runOneIter(cbind(protOnly,ForLuda[[i]][rownames(protOnly),c('LogitCall','fold')]),
colNames = 1:39,
addInfo = data.frame(gender = gender[rownames(protOnly)],omegaPerPatient[[i]][rownames(protOnly),]),
tumSamp = tumSamp, sampType = sampleType))
}
# save results for the paper
tab = sapply(res8, FUN = function(x) c(x$confusion$overall[1], x$confusion$byClass[,1]))
tab = cbind(tab,mean = apply(tab,1,mean))
# mean accuracy 0.6968313
xlsFile = paste('Results/tissueRecog_results_',Sys.Date(),'.xlsx')
write.xlsx(tab, file = xlsFile, sheetName = 'prot + mut')
# confusion matrix with fractions
conf = res8[[4]]$confusion$table
write.csv(sweep(conf, 2, apply(conf,2,sum),'/'), file = paste('Results/tissueRecog_results_',Sys.Date(),'.csv'), quote = F)
# per samples votes
tab = cbind(res8[[4]]$votes, prediction = res8[[4]]$prediction, tissue = sampleType[names(res8[[4]]$prediction)])
write.xlsx(tab, file = xlsFile, sheetName = 'Iteration 4. Votes', append = T)