|
# common_scripts/statistics/statistics_wrappers.R

wrapper.wilcoxtest=function(genelist, data, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", adj.method="BH", CORES=1, prettynum=T){
  library(parallel)

  # Wilcoxon rank-sum test for each gene, run in parallel
  data=do.call(rbind, mclapply(genelist, TestGeneWilcox, data, logicalVectors, logicalVector_normals=logicalVector_normals, ALTERNATIVE, prettynum, mc.cores = CORES))

  # multiple-testing correction
  data$adj.p=p.adjust(data$p, method = adj.method)
  data$adj.method=adj.method

  # significance codes
  data$signifCode=""
  data$signifCode[as.numeric(data$adj.p)<0.1]="."
  data$signifCode[as.numeric(data$adj.p)<0.05]="*"
  data$signifCode[as.numeric(data$adj.p)<0.01]="**"
  data$signifCode[as.numeric(data$adj.p)<0.001]="***"
  data$signifCode[as.numeric(data$adj.p)<0.0001]="****"

  if(prettynum){
    data$adj.p=prettyNum(signif(data$adj.p,2))
  }

  if(adj.method=="BH"){
    colnames(data)[colnames(data)%in%"adj.p"]="FDR"
  }

  # reorder columns (depends on the column layout returned by TestGeneWilcox)
  data=data[,c(1:5, 7,9, 6,8)]

  return(data)
}
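
# Example usage of wrapper.wilcoxtest() -- a minimal sketch, not run when this file is
# sourced. It assumes TestGeneWilcox() from this repo is already loaded and that `expr`
# is a genes x samples expression matrix; the gene names and group definitions below are
# invented for illustration.
if(FALSE){
  groups=list("groupA"=grepl("groupA", colnames(expr)),
              "groupB"=grepl("groupB", colnames(expr)))
  res=wrapper.wilcoxtest(genelist=c("GENE1", "GENE2"), data=expr, logicalVectors=groups,
                         ALTERNATIVE="greater", adj.method="BH", CORES=2)
  head(res)
}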
|
|
FUN_GOTHROUGH=function(i, genelist, list_cancers, logicalVector_normals, TEST_FAILS=F, HG_PVAL=0.001, MW_PVAL=0.001, ALTERNATIVE="greater"){
  name=names(list_cancers)[i]
  logicalVector=as.logical(unlist(list_cancers[[i]]))

  # drop the normal set with the same name as the current entity
  logicalVector_normals=logicalVector_normals[!names(logicalVector_normals)%in%name]

  # NOTE: `data` and `profile` are taken from the calling environment
  result=FUN_HGT_MW_TEST(genelist, logicalVector, logicalVector_normals, name, TEST_FAILS=TEST_FAILS, data = data, profile = profile, HG_PVAL = HG_PVAL, MW_PVAL=MW_PVAL, ALTERNATIVE = ALTERNATIVE, CORES = 10)
  return(result)
}
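
# Example usage of FUN_GOTHROUGH() -- a hedged sketch only: FUN_HGT_MW_TEST(), `data` and
# `profile` must already exist in the calling environment (they are defined elsewhere in
# this repo), and `list_cancers` / `logicalVector_normals` are placeholder objects here.
if(FALSE){
  results=lapply(seq(list_cancers), FUN_GOTHROUGH, genelist=genelist,
                 list_cancers=list_cancers, logicalVector_normals=logicalVector_normals)
}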
|
|
fisher.wrapper=function(lv.list1, lv.list2, alternative="two.sided", method="bonferroni", log10=F, prettyNumbers=F, orderdata=F, cores=1){
  library(parallel)

  n1=names(lv.list1)
  n2=names(lv.list2)

  # all pairwise 2x2 Fisher's exact tests between the two lists of logical vectors
  # NOTE: `alternative` is currently not forwarded to fisher.2x2 (defined elsewhere in this repo)
  res=do.call(rbind, mclapply(seq(lv.list1), function(i){
    do.call(rbind, lapply(seq(lv.list2), function(j){
      data.frame("featureA"=n1[i], "featureB"=n2[j], fisher.2x2(lv.list1[[i]], lv.list2[[j]]))
    }))
  }, mc.cores=cores))

  res$adj.p=p.adjust(res$p, method=method)

  # significance codes
  res$signifCode=""
  res$signifCode[as.numeric(res$adj.p)<0.1]="."
  res$signifCode[as.numeric(res$adj.p)<0.05]="*"
  res$signifCode[as.numeric(res$adj.p)<0.01]="**"
  res$signifCode[as.numeric(res$adj.p)<0.001]="***"
  res$signifCode[as.numeric(res$adj.p)<0.0001]="****"

  if(log10){
    # transform the adjusted p-value before overwriting the raw p-value it is based on
    res$adj.p=abs(log10(res$adj.p))
    res$p=abs(log10(res$p))
  }

  if(prettyNumbers){
    res$log.odds=prettyNum(signif(res$log.odds,2))
    res$p=prettyNum(signif(res$p,2))
    res$adj.p=prettyNum(signif(res$adj.p,2))
  }

  if(orderdata)res=res[order(res$featureA, as.numeric(res$adj.p), decreasing = F),]

  return(res)
}
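
# Example usage of fisher.wrapper() -- a minimal sketch, not run on source. fisher.2x2()
# is defined elsewhere in this repo; the two lists of named logical vectors below
# (e.g. mutation status and a clinical variable per sample) are invented for illustration.
if(FALSE){
  mut=list("TP53.mut"=c(TRUE, FALSE, TRUE, TRUE, FALSE),
           "KRAS.mut"=c(FALSE, FALSE, TRUE, FALSE, TRUE))
  clin=list("male"=c(TRUE, TRUE, FALSE, FALSE, TRUE))
  tab=fisher.wrapper(mut, clin, method="bonferroni", orderdata=TRUE)
}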
|
|
fisher.matrix.pairwise=function(E, plot=F, alternative="two.sided"){
  # perform a Fisher's exact test for each pair of genes and save the odds ratio and p-value
  res <- NULL
  for(i in seq(dim(E)[2])){
    for(j in seq(dim(E)[2])){
      if(i!=j){
        f <- fisher.test(E[,i], E[,j], alternative = alternative)
        res <- rbind(res, cbind(geneA=colnames(E)[i], geneB=colnames(E)[j], oddsRatio=f$estimate, pvalue=f$p.value))
      }
    }
  }

  # some formatting
  res <- as.data.frame(res)
  res$geneA <- factor(res$geneA, levels=unique(res$geneA))
  res$geneB <- factor(res$geneB, levels=unique(res$geneB))
  res$oddsRatio <- as.numeric(as.character(res$oddsRatio))

  # cap the odds ratio to avoid Inf and zero values
  res$oddsRatio[res$oddsRatio>10]=10
  res$oddsRatio[res$oddsRatio<=0]=0.01

  res$pvalue <- as.numeric(as.character(res$pvalue))
  # use p.adjust to correct for multiple testing using the FDR
  res <- cbind(res, fdr=p.adjust(res$pvalue,"fdr"))
  # convert the FDR into star labels for plotting
  res$stars <- cut(res$fdr, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), labels=c("***", "**", "*", ""))

  if(plot){
    # plot with ggplot2
    require(ggplot2)
    require(cowplot) # not necessary, but the plot is nicer
    p <- ggplot(res, aes(geneA, geneB)) +
      geom_tile(aes(fill = log2(oddsRatio+0.01)), colour = "white") +
      scale_fill_gradient2(midpoint=1) +
      geom_text(aes(label=stars), color="black", size=5)
    return(p)
  }

  return(res)
}
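
# Example usage of fisher.matrix.pairwise() -- a small self-contained sketch on simulated
# binary (TRUE/FALSE) data; the column names stand in for genes.
if(FALSE){
  set.seed(1)
  E=data.frame(GENE1=rbinom(100, 1, 0.3)==1,
               GENE2=rbinom(100, 1, 0.4)==1,
               GENE3=rbinom(100, 1, 0.2)==1)
  res=fisher.matrix.pairwise(E, plot=FALSE)
  head(res)
}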
|
|
fun.get.cox=function(i, logicalv, DATA, univariate=T, pretty=T, TIME, STATUS){
  library(survival)

  logicalvector=logicalv[[i]]
  n=names(logicalv)[i]

  # remove features that are all NA within this sample set
  DATA=DATA[,!apply(DATA[logicalvector,,drop=F], 2, function(v)all(is.na(v))),drop=F]

  if(univariate){
    # one Cox model per feature
    univ=do.call(rbind, lapply(seq(dim(DATA)[2]), function(j){
      y=Surv(TIME[logicalvector], STATUS[logicalvector])
      fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA)[j], collapse="+"))), DATA[logicalvector,,drop=F])

      b=cox.zph(fit)$table[1,"p"] # proportional-hazards test p-value

      a=summary(fit)

      pval=a$coefficients[1,5]
      coef=a$conf.int[c(1,3,4)] # exp(coef) and its 95% CI
      v=data.frame(colnames(DATA)[j], t(coef), pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
      rownames(v)=NULL
      return(v)
    }))

    # adjust P-value depending on number of genes tested:
    univ[,6]=p.adjust(univ[,5], method="BH")

  }else{
    # one multivariate Cox model with all features
    y=Surv(TIME[logicalvector], STATUS[logicalvector])
    fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA), collapse="+"))), DATA[logicalvector,])

    b=cox.zph(fit)$table[,"p"] # proportional-hazards test p-values
    b=b[!names(b)%in%"GLOBAL"]

    a=summary(fit)

    pval=a$coefficients[,5]
    coef=a$conf.int[,c(1,3,4)]
    univ=data.frame(rownames(a$coefficients), coef, pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
    rownames(univ)=NULL

    # no need to adjust P-value here:
    univ[,6]=univ[,5]
  }

  univ=univ[order(univ[,5]),]

  # significance codes based on the adjusted P-value (column 6)
  univ[,7][as.numeric(univ[,6])<0.1]="."
  univ[,7][as.numeric(univ[,6])<0.05]="*"
  univ[,7][as.numeric(univ[,6])<0.01]="**"
  univ[,7][as.numeric(univ[,6])<0.001]="***"
  univ[,7][as.numeric(univ[,6])<0.0001]="****"
  univ[,7][is.na(univ[,7])]=""

  val=data.frame(univ, n, stringsAsFactors = F)
  colnames(val)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "Adj.P", "Signif", "concordance", "zph P < 0.05", "Name")

  if(pretty){
    for(k in c(2:6,8)){
      val[,k]=prettyNum(signif(val[,k],2))
    }
  }

  return(val)
}
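
# Example usage of fun.get.cox() -- a hedged sketch on simulated data: one univariate Cox
# model per column of DATA, fitted within the sample set selected by the logical vector.
if(FALSE){
  library(survival)
  set.seed(1)
  DATA=data.frame(score1=rnorm(80), score2=rnorm(80))
  TIME=rexp(80, 0.1)
  STATUS=rbinom(80, 1, 0.6)
  logicalv=list("all_samples"=rep(TRUE, 80))
  cox.tab=fun.get.cox(1, logicalv, DATA, univariate=TRUE, pretty=TRUE, TIME=TIME, STATUS=STATUS)
}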
|
|
fun.cox.elasticnet=function(DATA_ORG, time, status, cores=8, nfold=10, min.elnet=0.01, max.elnet=0.99, summary.km="85th_15th_percentile", percentage=0.25, REPEATS=100){
  library(glmnet)
  library(survival)
  library(ggplot2)

  # run regularized regression
  # alpha grid for the elastic net (towards 0 = ridge, towards 1 = lasso)
  a <- seq(min.elnet, max.elnet, 0.05)

  if(percentage==0){
    # no test set: fit and evaluate on the full data
    y=Surv(time, status)

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(DATA_ORG), collapse="+")))
    x=model.matrix(formdf1, data=DATA_ORG)

    # go through the alpha sequence with nfold cross-validation, repeated (e.g. 100 times) on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x, y=y, family = "cox", nfolds = nfold, type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean cross-validated error and lambda per alpha using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # alpha/lambda with the minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha; use the lambda sequence here, not a single value
    md3 <- glmnet(x=x, y=y, family = "cox", alpha = cv3$alpha, standardize=F)

    # extract non-zero coefficients at the tuned lambda (drop the model-matrix intercept row)
    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    # prognostic index (linear predictor) per patient
    comprisk=DATA_ORG[,match(rownames(coefficients_all), colnames(DATA_ORG))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    print(ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue"))

    fun.kapplanMeier(time, status, CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")

    a=list(summary(coxph(y ~ PI.train, data.frame("PI.train"=risk_patient))))
    c=list(coefficients_all)

    out=c(a,c)
    names(out)=c("PI.test", "coefficients")
    return(out)

  }else{
    print("Splitting to test and train data")
    # split the data so that training and test sets contain equal proportions of events
    nr.samples=floor(percentage*dim(DATA_ORG)[1])
    nr.events=floor(table(status)*percentage)

    set.seed(0)
    x <- which(status==1)
    vf <- logical(length(status))
    vf[x[sample.int(length(x), nr.events[2])]] <- TRUE

    set.seed(0)
    x <- which(status==0)
    vf2 <- logical(length(status))
    vf2[x[sample.int(length(x), nr.events[1])]] <- TRUE

    find=vf|vf2

    # define training and testing datasets
    TRAIN=DATA_ORG[!find,]
    TEST=DATA_ORG[find,]

    ytrain=Surv(time[!find], status[!find])
    ytest=Surv(time[find], status[find])

    # sanity check: the survival of the TRAIN and TEST groups must look the same
    log=fun.kapplanMeier(time, status, GROUPS=(rownames(DATA_ORG)%in%rownames(TRAIN))*1, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, NAME = "These must be similar\nand high pval: 1 TRAIN, 0 TEST")

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(TRAIN), collapse="+")))
    x=model.matrix(formdf1, data=TRAIN)

    # go through the alpha sequence with cross-validation, repeated (e.g. 100 times) on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x, y=ytrain, family = "cox", nfolds = floor(dim(DATA_ORG)[1]/10), type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean cross-validated error and lambda per alpha using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # alpha/lambda with the minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha; use the lambda sequence here, not a single value
    md3 <- glmnet(x=x, y=ytrain, family = "cox", alpha = cv3$alpha, standardize=F)

    # extract non-zero coefficients at the tuned lambda (drop the model-matrix intercept row)
    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    # prognostic index (linear predictor) for training and test patients
    comprisk=TRAIN[,match(rownames(coefficients_all), colnames(TRAIN))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    comprisk=TEST[,match(rownames(coefficients_all), colnames(TEST))]
    risk_patient_test=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    print(ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue"))

    df=data.frame(x=risk_patient_test)
    print(ggplot(df, aes(x=x))+
      geom_density(color="indianred", fill="indianred"))

    fun.kapplanMeier(time[!find], status[!find], CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
    fun.kapplanMeier(time[find], status[find], CONTINUOUS=risk_patient_test, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "validated risk")

    a=list(summary(coxph(ytrain ~ PI.train, data.frame("PI.train"=risk_patient))))
    b=list(summary(coxph(ytest ~ PI.test, data.frame("PI.test"=risk_patient_test))))
    c=list(coefficients_all)

    out=c(a,b,c)
    names(out)=c("PI.train", "PI.test", "coefficients")
    return(out)
  }
}
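
# Example usage of fun.cox.elasticnet() -- a hedged sketch: fun.kapplanMeier() is defined
# elsewhere in this repo, so the call only works once the whole repository is sourced.
# Simulated data and few repeats keep the run short; real analyses use the defaults.
if(FALSE){
  set.seed(1)
  DATA_ORG=data.frame(matrix(rnorm(200*5), nrow=200, dimnames=list(NULL, paste0("feat", 1:5))))
  time=rexp(200, 0.1)
  status=rbinom(200, 1, 0.6)
  enet=fun.cox.elasticnet(DATA_ORG, time, status, cores=2, percentage=0.25, REPEATS=5)
  enet$coefficients
}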