Diff of /Analysis_4.r [000000] .. [b8d756]

Switch to unified view

a b/Analysis_4.r
1
# tissue recognition analysis
2
# protein data from 12-09-17
3
library(xlsx)
4
library(randomForest)
5
library(bmrm)
6
library(caret)
7
library(dplyr)
8
library(data.table)
9
library(Boruta)
10
11
#=============  
12
# runs RF for tissue recognition with 10-fold CV.
13
# use predefined iterations and training-test assignments
14
# Tissue is in sampleType column of the dat object
15
# keep cancers with more than 150 as is, for those that are less, repeat them several times and sample the rest
16
runRF_balancedSample = function(dat, fold, BalanceSampleCount = 150)
17
{
18
    require(bmrm)
19
    require(randomForest)
20
    require(caret)
21
    require(dplyr)
22
#   NIter = unique(iterFold$iteration)
23
#print(head(dat))
24
    AllpredictionsNIter_tissue = c()
25
      MyIndices <- fold
26
      nTissue = nlevels(as.factor(dat$sampleType))
27
      Allpredictions = vector(length = length(MyIndices)) 
28
      Allpredictions_votes = matrix(nrow = length(MyIndices),ncol = nTissue)
29
      
30
      for( i in unique(MyIndices)){
31
        trainSamples <- which(MyIndices!=i)
32
        testSamples <- which(MyIndices==i)
33
#print(table(dat$sampleType[trainSamples]))
34
    trainSamplesResampled <- tapply(trainSamples,
35
                    dat$sampleType[trainSamples],
36
                    FUN = function(x)
37
                      if (length(x)>BalanceSampleCount)x else {
38
                        s = rep(x, floor(BalanceSampleCount/length(x)));
39
                        return(c(s,sample(x,size = BalanceSampleCount-length(s))))})  %>%
40
                  unlist()
41
    
42
    ThisRF <- randomForest(as.factor(sampleType)~.,
43
                           data =dat[trainSamplesResampled,],
44
                           cutoff = rep(1/nTissue,nTissue))
45
46
    Allpredictions[testSamples] <- predict(ThisRF,dat[testSamples,]) %>% as.character()
47
48
    Allpredictions_votes[testSamples,] <- predict(ThisRF,dat[testSamples,], type = 'vote')
49
        
50
      }
51
      names(Allpredictions) = rownames(Allpredictions_votes)=rownames(dat)
52
      colnames(Allpredictions_votes) = levels(as.factor(dat$sampleType))
53
      conf = confusionMatrix(Allpredictions,reference = dat$sampleType)
54
      AllpredictionsNIter_tissue <- list(prediction = Allpredictions, 
55
        confusion = conf, votes = Allpredictions_votes)
56
    print(conf$overall[1])
57
    return(AllpredictionsNIter_tissue)
58
}
59
60
# extracts positive samples and folds for an iteration and runs RF for this iteration
61
runOneIter = function(obj, colNames = colnames(obj), addInfo = NULL, tumSamp, sampType)
62
{
63
# positive samples only
64
    bahmanSamp = rownames(obj)[which(obj$LogitCall)] # 
65
    s = intersect(tumSamp,rownames(obj)[which(obj$LogitCall)]) # 
66
    print(length(s))
67
    if (!is.null(addInfo)) prot = data.frame(sampleType = sampType[s], obj[s,colNames],addInfo[s,]) else 
68
        prot = cbind(sampleType = obj[s,'Tissue'],obj[s,colNames])
69
    return(runRF_balancedSample(prot, fold = obj[s,'fold']))
70
}
71
72
#===================================================
73
# loading results from cancer detection
74
load('Results/ForLuda8proteins.rda')
75
76
#==============================
77
# read in protein data
78
protData = read.xlsx2(file = 'MEGA Protein Dataset for Random Forest, December 9, 2017.xlsx', sheetIndex = 1, row.names = 1, check.names = F, stringsAsFactors = F)
79
80
# remove Vitronectin, LRG-1, Stage, Sample Type
81
protOnly = protData %>% select(-`Vitronectin`, -`LRG-1`, -`Stage`, -`Sample Type`)
82
83
protOnly = sapply(protOnly, as.numeric)
84
#protOnly = t(do.call('rbind',protOnly))
85
rownames(protOnly) = rownames(protData)
86
colnames(protOnly) = gsub('-','',colnames(protOnly))
87
colnames(protOnly) = gsub(' ','',colnames(protOnly))
88
protOnly[is.na(protOnly)] = 0
89
90
#==================================
91
# create tumor type object
92
sampleType = as.character(protData$`Sample Type`)
93
# combine esophagus and stomach cancers to one group
94
sampleType[which(sampleType%in%c('Esophagus','Stomach'))] = 'st_eso'
95
names(sampleType) = rownames(protData)
96
97
# IDs of tumor and normal samples
98
tumSamp = names(sampleType)[which(sampleType != 'Normal')] # 1005
99
normSamp = names(sampleType)[which(sampleType == 'Normal')] # 812
100
101
#====================================
102
# patient info
103
sampInfo <- read.xlsx2("aar3247_Suppl. Excel_seq3_v1.xlsx",sheetIndex = 4,
104
                                           stringsAsFactors = F, startRow = 3, endRow = 1820)
105
# gender
106
gender = sampInfo[c("Plasma.sample.ID..","Sex")]
107
gender = gender[!duplicated(gender[,1]),]
108
n = gender[,1]
109
gender = gender[,2]
110
names(gender) = n
111
gender[which(gender %in% c('Female','female'))] = 'F'
112
gender[which(gender %in% c('Male','male'))] = 'M'
113
gender[which(gender %in% c('unknown',''))] = NA
114
115
#=============================
116
# add matrix with omegas per gene per patient.
117
#=============================
118
# make 10 matrices from Analysis_1
119
omegaPerPatient = omegaPerPatientK13 = vector(mode = 'list',length = 10)
120
s = c(tumSamp, normSamp)
121
for(i in 1:10)
122
{
123
    load(paste0('allResults_fromLuMethod_',i,'_20171209.rda'))
124
    perPat = split(allResults,allResults$Template)
125
    perPat = lapply(perPat, function(x) x[which(!x[,'blacklist']),c('Gene','omega')])
126
    # get maximum omega for each gene
127
    perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T))
128
    genes = unique(allResults$Gene)
129
    s1 = union(s,names(perPatMaxOmega))
130
    omegaPerPatient[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes))
131
    for(j in names(perPatMaxOmega)) omegaPerPatient[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]]
132
    
133
    # separate KRAS codon 13
134
    resultKRAS13 = allResults %>% filter(Gene == 'KRAS', codonStart == 13)
135
    resultNoK13 = allResults %>% filter(Gene != 'KRAS' | codonStart != 13)
136
    resultKRAS13[,'Gene'] = 'KRAS13'
137
    allResults = rbind(resultNoK13,resultKRAS13)
138
    # get maximum omega for each gene
139
    perPatMaxOmega = sapply(perPat, function(x) tapply(x$omega, INDEX = x$Gene, FUN = max, na.rm = T))
140
    genes = unique(allResults$Gene)
141
    s1 = union(s,names(perPatMaxOmega))
142
    omegaPerPatientK13[[i]] = matrix(0,ncol = length(genes), nrow = length(s1), dimnames = list(s1,genes))
143
    for(j in names(perPatMaxOmega)) omegaPerPatientK13[[i]][j,names(perPatMaxOmega[[j]])] = perPatMaxOmega[[j]]
144
}
145
146
#===========================
147
# run all proteins, gender and mutations
148
set.seed(12485)
149
res8 = vector(length = 10)
150
for(i in 1:10)
151
{
152
    res8[i] = list(runOneIter(cbind(protOnly,ForLuda[[i]][rownames(protOnly),c('LogitCall','fold')]),
153
            colNames = 1:39, 
154
            addInfo = data.frame(gender = gender[rownames(protOnly)],omegaPerPatient[[i]][rownames(protOnly),]), 
155
            tumSamp = tumSamp, sampType = sampleType))
156
}
157
158
# save results for the paper
159
tab = sapply(res8, FUN = function(x) c(x$confusion$overall[1], x$confusion$byClass[,1])) 
160
tab = cbind(tab,mean = apply(tab,1,mean))
161
# mean accuracy 0.6968313
162
xlsFile = paste('Results/tissueRecog_results_',Sys.Date(),'.xlsx')
163
write.xlsx(tab, file = xlsFile, sheetName = 'prot +  mut')
164
# confusion matrix with fractions
165
conf = res8[[4]]$confusion$table
166
write.csv(sweep(conf, 2, apply(conf,2,sum),'/'), file = paste('Results/tissueRecog_results_',Sys.Date(),'.csv'), quote = F)
167
# per samples votes
168
tab = cbind(res8[[4]]$votes, prediction = res8[[4]]$prediction, tissue = sampleType[names(res8[[4]]$prediction)])
169
write.xlsx(tab, file = xlsFile, sheetName = 'Iteration 4. Votes', append = T)
170