|
a |
|
b/integration_unsupervised.R |
|
|
1 |
########################################################################## |
|
|
2 |
### Script to perform integrative Clustering of several omics datasets ### |
|
|
3 |
########################################################################## |
|
|
4 |
|
|
|
5 |
# get options |
|
|
6 |
library("optparse") |
|
|
7 |
|
|
|
8 |
option_list = list( |
|
|
9 |
make_option(c("-d", "--DNA"), type="character", default=NULL, help="file with mutation data [default= %default]", metavar="character"), |
|
|
10 |
make_option(c("-r", "--RNA"), type="character", default=NULL, help="file with expression data [default= %default]", metavar="character"), |
|
|
11 |
make_option(c("-k", "--Kmin"), type="numeric", default=2, help="minimum number of clusters [default= %default]", metavar="numeric"), |
|
|
12 |
make_option(c("-K", "--Kmax"), type="numeric", default=6, help="maximum number of clusters [default= %default]", metavar="numeric"), |
|
|
13 |
make_option(c("-m", "--methylation"), type="character", default=NULL, help="file with methylation data [default= %default]", metavar="character"), |
|
|
14 |
make_option(c("-C", "--CNV"), type="character", default=NULL, help="file with copy number data [default= %default]", metavar="character"), |
|
|
15 |
make_option(c("-o", "--out"), type="character", default="out", help="output directory name [default= %default]", metavar="character"), |
|
|
16 |
make_option(c("-c", "--cores"), type="numeric", default=1, help="number of cores for statistical computation [default= %default]", metavar="numeric") |
|
|
17 |
); |
|
|
18 |
|
|
|
19 |
opt_parser = OptionParser(option_list=option_list); |
|
|
20 |
opt = parse_args(opt_parser); |
|
|
21 |
|
|
|
22 |
Kmin = as.numeric(opt$Kmin) |
|
|
23 |
Kmax = as.numeric(opt$Kmax) |
|
|
24 |
cores = as.numeric(opt$cores) |
|
|
25 |
|
|
|
26 |
require(iClusterPlus) |
|
|
27 |
require(gplots) |
|
|
28 |
require(lattice) |
|
|
29 |
# read data |
|
|
30 |
Ddata = NULL |
|
|
31 |
Cdata = NULL |
|
|
32 |
Rdata = NULL |
|
|
33 |
Mdata = NULL |
|
|
34 |
|
|
|
35 |
|
|
|
36 |
dtl = vector("list",4) |
|
|
37 |
ntypes = 0 |
|
|
38 |
type = c() |
|
|
39 |
row.order = c() |
|
|
40 |
plot.chr = c() |
|
|
41 |
sparse = c() |
|
|
42 |
cap = c() |
|
|
43 |
if(!is.null(opt$DNA)){ |
|
|
44 |
ntypes = ntypes +1 |
|
|
45 |
Ddata = read.table(opt$DNA,h=T,row.names = 1) |
|
|
46 |
dtl[[ntypes]] = t(Ddata) |
|
|
47 |
type = c(type,"binomial") |
|
|
48 |
row.order = c(row.order,T) |
|
|
49 |
plot.chr = c(plot.chr,F) |
|
|
50 |
sparse = c(sparse,T) |
|
|
51 |
cap = c(cap,T) |
|
|
52 |
} |
|
|
53 |
if(!is.null(opt$CNV)){ |
|
|
54 |
ntypes = ntypes +1 |
|
|
55 |
dtl[[ntypes]] = t(Cdata) |
|
|
56 |
type = c(type,"gaussian") |
|
|
57 |
row.order = c(row.order,T) |
|
|
58 |
plot.chr = c(plot.chr,T) |
|
|
59 |
sparse = c(sparse,T) |
|
|
60 |
cap = c(cap,T) |
|
|
61 |
} |
|
|
62 |
if(!is.null(opt$RNA)){ |
|
|
63 |
ntypes = ntypes +1 |
|
|
64 |
Rdata = read.table(opt$RNA,h=T,row.names = 1,check.names = T) |
|
|
65 |
dtl[[ntypes]] = t(Rdata) |
|
|
66 |
type = c(type,"gaussian") |
|
|
67 |
row.order = c(row.order,T) |
|
|
68 |
plot.chr = c(plot.chr,F) |
|
|
69 |
sparse = c(sparse,T) |
|
|
70 |
cap = c(cap,T) |
|
|
71 |
} |
|
|
72 |
if(!is.null(opt$methylation)){ |
|
|
73 |
ntypes = ntypes +1 |
|
|
74 |
Mdata = read.table(opt$methylation,h=T,row.names = 1) |
|
|
75 |
dtl[[ntypes]] = t(Mdata) |
|
|
76 |
type = c(type,"gaussian") |
|
|
77 |
row.order = c(row.order,T) |
|
|
78 |
plot.chr = c(plot.chr,F) |
|
|
79 |
sparse = c(sparse,T) |
|
|
80 |
cap = c(cap,T) |
|
|
81 |
} |
|
|
82 |
|
|
|
83 |
print(dtl) |
|
|
84 |
print(type) |
|
|
85 |
|
|
|
86 |
# tune lambda |
|
|
87 |
cv.fitl = vector("list",Kmax-Kmin+1) |
|
|
88 |
for(k in (Kmin:Kmax-1) ){ |
|
|
89 |
cv.fitl[[k]] = tune.iClusterPlus(cpus=cores,dt1=dtl[[1]],dt2=dtl[[2]],dt3=dtl[[3]],dt4=dtl[[4]], type=type, n.lambda=NULL,K=k,maxiter=20) |
|
|
90 |
cv.fittmp = cv.fitl[[k]] |
|
|
91 |
save(cv.fittmp, file=paste(opt$out,"cv.fit.K",k+1,".Rdata",sep="")) |
|
|
92 |
} |
|
|
93 |
warnings() |
|
|
94 |
nLambda = nrow(cv.fitl[[1]]$lambda) |
|
|
95 |
nK = length(cv.fitl) |
|
|
96 |
BIC = getBIC(cv.fitl) |
|
|
97 |
devR = getDevR(cv.fitl) |
|
|
98 |
|
|
|
99 |
minBICid = apply(BIC,2,which.min) |
|
|
100 |
devRatMinBIC = rep(NA,nK) |
|
|
101 |
for(i in 1:nK){ |
|
|
102 |
devRatMinBIC[i] = devR[minBICid[i],i] |
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
#best.fitl=sapply( Kmin:Kmax-1 , function(k) cv.fitl[[k]]$fit[[which.min(BIC[,k])]] ) |
|
|
106 |
clusters=getClusters(cv.fitl) |
|
|
107 |
colnames(clusters)=paste("K=",Kmin:Kmax,sep="") |
|
|
108 |
clusters2 = clusters |
|
|
109 |
k = Kmin |
|
|
110 |
while(k<Kmax){ |
|
|
111 |
print(k) |
|
|
112 |
clusters2 = clusters |
|
|
113 |
for(i in 1:max(clusters[,k-1]) ){ |
|
|
114 |
print(i) |
|
|
115 |
tatmp = table(clusters[clusters[,k-1]==i,k]) |
|
|
116 |
newid = as.numeric(names(tatmp)[which.max(tatmp)]) |
|
|
117 |
clusters2[clusters[,k]==newid,k] = i |
|
|
118 |
clusters2[clusters[,k]==i,k] = newid |
|
|
119 |
print(clusters2) |
|
|
120 |
clusters = clusters2 |
|
|
121 |
} |
|
|
122 |
k = k+1 |
|
|
123 |
} |
|
|
124 |
|
|
|
125 |
svg(paste(opt$out,"iClusters.svg",sep=""),h=6,w=8*2) |
|
|
126 |
par(family="Times",mfrow=c(1,2)) |
|
|
127 |
plot(Kmin:Kmax,devRatMinBIC,type="b",xlab=expression(italic(K)),ylab=expression("Pseudo "~italic(R^2)),las=1) |
|
|
128 |
image(1:nrow(clusters),Kmin:Kmax,clusters,col=rainbow(Kmax),axes=F,xlab="",ylab=expression(italic(K)),las=2 ) |
|
|
129 |
axis(1,at = 1:nrow(clusters),labels = rownames(dtl[[1]]) ,las=2) |
|
|
130 |
axis(2,at = Kmin:Kmax,las=1) |
|
|
131 |
dev.off() |
|
|
132 |
|
|
|
133 |
# plot heatmap and write features |
|
|
134 |
features = lapply(dtl,colnames) |
|
|
135 |
for(k in (Kmin:Kmax-1) ){ |
|
|
136 |
print(k) |
|
|
137 |
best.cluster=clusters[,k] |
|
|
138 |
print(best.cluster) |
|
|
139 |
best.fit=cv.fitl[[k]]$fit[[which.min(BIC[,k])]] |
|
|
140 |
print(best.fit) |
|
|
141 |
col.scheme = alist() |
|
|
142 |
col.scheme[[1]] = bluered(256) |
|
|
143 |
col.scheme[[2]] = bluered(256) |
|
|
144 |
col.scheme[[3]] = bluered(256) |
|
|
145 |
col.scheme[[4]] = bluered(256) |
|
|
146 |
|
|
|
147 |
png(paste(opt$out,"Heatmap_K",k+1,".png",sep=""),h=4*200,w=4*300,bg=0) |
|
|
148 |
plotHeatmap(fit=best.fit,datasets=dtl[1:ntypes], type=type, col.scheme = col.scheme[1:ntypes], row.order=row.order,plot.chr=plot.chr,sparse=sparse,cap=cap) |
|
|
149 |
dev.off() |
|
|
150 |
|
|
|
151 |
# get best features |
|
|
152 |
sigfeatures=alist() |
|
|
153 |
idsigfeatures=alist() |
|
|
154 |
print("yop") |
|
|
155 |
print(sum(!sapply(features,is.null))) |
|
|
156 |
for(i in 1:sum(!sapply(features,is.null)) ){ |
|
|
157 |
print(i) |
|
|
158 |
rowsum=apply(abs(best.fit$beta[[i]]),1, sum) |
|
|
159 |
upper=quantile(rowsum,prob=0.75) |
|
|
160 |
sigfeatures[[i]]=(features[[i]])[which(rowsum>upper)] |
|
|
161 |
idsigfeatures[[i]]=which(rowsum>upper) |
|
|
162 |
write.table(sigfeatures[[i]],file = paste(opt$out,"_K",k+1,"_features",i,".txt",sep=""),col.names = F,quote = F) |
|
|
163 |
} |
|
|
164 |
#names(sigfeatures)=c("expression","methylation") |
|
|
165 |
} |
|
|
166 |
|
|
|
167 |
|
|
|
168 |
|