Diff of /R/LRAcluster.R [000000] .. [494cbf]

Switch to unified view

a b/R/LRAcluster.R
1
#' @name LRAcluster
2
#' @aliases LRAcluster
3
#' @title integrated analysis of cancer omics data by low-rank approximation
4
#' @description The LRAcluster function is the main interface of this package, it gets a list of matrix as input and outputs the coordinates of the samples in the reduced space and the explained potential.
5
#' @param data a list of data matrix,please keep the columns are the same order of samples
6
#' @param types a list of data types, can be binary, gaussian, or poisson
7
#' @param dimension the reduced dimension
8
#' @param names data names
9
#' @return A list contains the following component:
10
#'
11
#'         \code{coordinate} A matrix of the coordinates of all the samples in the reduced space
12
#'
13
#'         \code{potential}  ratio of explained variance
14
#' @keywords internal
15
#' @author Dingming Wu, Dongfang Wang
16
#' @references Wu D, Wang D, Zhang MQ, Gu J (2015). Fast dimension reduction and integrative clustering of multi-omics data using low-rank approximation: application to cancer molecular classification. BMC Genomics, 16(1):1022.
17
LRAcluster <- function(data, types, dimension = 2, names = as.character(1:length(data)))
18
{
19
  #--------#
20
  # binary #
21
  #--------#
22
  epsilon.binary<-2.0
23
  check.binary.row<-function(arr)
24
  {
25
    if (sum(!is.na(arr))==0)
26
    {
27
      return (F)
28
    }
29
    else
30
    {
31
      idx<-!is.na(arr)
32
      if (sum(arr[idx])==0 || sum(arr[idx])==sum(idx))
33
      {
34
        return (F)
35
      }
36
      else
37
      {
38
        return (T)
39
      }
40
    }
41
  }
42
  check.binary<-function(mat,name)
43
  {
44
    index<-apply(mat,1,check.binary.row)
45
    n<-sum(!index)
46
    if (n>0)
47
    {
48
      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
49
      warning(w)
50
    }
51
    mat_c<-mat[index,]
52
    rownames(mat_c)<-rownames(mat)[index]
53
    colnames(mat_c)<-colnames(mat)
54
    return (mat_c)
55
  }
56
57
  base.binary.row<-function(arr)
58
  {
59
    idx<-!is.na(arr)
60
    n<-sum(idx)
61
    m<-sum(arr[idx])
62
    return (log(m/(n-m)))
63
  }
64
65
  base.binary<-function(mat)
66
  {
67
    mat_b<-matrix(0,nrow(mat),ncol(mat))
68
    ar_b<-apply(mat,1,base.binary.row)
69
    mat_b[1:nrow(mat_b),]<-ar_b
70
    return (mat_b)
71
  }
72
73
  update.binary<-function(mat,mat_b,mat_now,eps)
74
  {
75
    mat_p<-mat_b+mat_now
76
    mat_u<-matrix(0,nrow(mat),ncol(mat))
77
    idx1<-!is.na(mat) & mat==1
78
    idx0<-!is.na(mat) & mat==0
79
    index<-is.na(mat)
80
    arr<-exp(mat_p)
81
    mat_u[index]<-mat_now[index]
82
    mat_u[idx1]<-mat_now[idx1]+eps*epsilon.binary/(1.0+arr[idx1])
83
    mat_u[idx0]<-mat_now[idx0]-eps*epsilon.binary*arr[idx0]/(1.0+arr[idx0])
84
    return (mat_u)
85
  }
86
87
  stop.binary<-function(mat,mat_b,mat_now,mat_u)
88
  {
89
    index<-!is.na(mat)
90
    mn<-mat_b+mat_now
91
    mu<-mat_b+mat_u
92
    arn<-exp(mn)
93
    aru<-exp(mu)
94
    idx1<-!is.na(mat) & mat==1
95
    idx0<-!is.na(mat) & mat==0
96
    lgn<-sum(log(arn[idx1]/(1+arn[idx1])))+sum(log(1/(1+arn[idx0])))
97
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
98
    return (lgu-lgn)
99
  }
100
101
  LL.binary<-function(mat,mat_b,mat_u)
102
  {
103
    index<-!is.na(mat)
104
    mu<-mat_b+mat_u
105
    aru<-exp(mu)
106
    idx1<-!is.na(mat) & mat==1
107
    idx0<-!is.na(mat) & mat==0
108
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
109
    return (lgu)
110
  }
111
112
  LLmax.binary<-function(mat)
113
  {
114
    return (0)
115
  }
116
117
  LLmin.binary<-function(mat,mat_b)
118
  {
119
    index<-!is.na(mat)
120
    aru<-exp(mat_b)
121
    idx1<-!is.na(mat) & mat==1
122
    idx0<-!is.na(mat) & mat==0
123
    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
124
    return (lgu)
125
  }
126
127
  binary_type_base <- function( data,dimension=2 ,name="test")
128
  {
129
    data<-check.binary(data,name)
130
    data_b<-base.binary(data)
131
    data_now<-matrix(0,nrow(data),ncol(data))
132
    data_u<-update.binary(data,data_b,data_now)
133
    data_u<-nuclear_approximation(data_u,dimension)
134
    while (T)
135
    {
136
      thr<-stop.binary(data,data_b,data_now,data_u)
137
      message(thr)
138
      if (thr<0.2)
139
      {
140
        break
141
      }
142
      data_now<-data_u
143
      data_u<-update.binary(data,data_b,data_now)
144
      data_u<-nuclear_approximation(data_u,dimension)
145
    }
146
    return (data_now)
147
  }
148
149
  #----------#
150
  # gaussian #
151
  #----------#
152
153
  epsilon.gaussian=0.5
154
155
  check.gaussian.row<-function(arr)
156
  {
157
    if (sum(!is.na(arr))==0)
158
    {
159
      return (F)
160
    }
161
    else
162
    {
163
      return (T)
164
    }
165
  }
166
  check.gaussian<-function(mat,name)
167
  {
168
    index<-array(T,nrow(mat))
169
    for(i in 1:nrow(mat))
170
    {
171
      if (sum(is.na(mat[i,])==ncol(mat)))
172
      {
173
        war<-paste("Warning: ",name,"'s ",as.character(i)," line is all NA. Delete this line",sep="")
174
        warning(war)
175
        index[i]<-F
176
      }
177
    }
178
    mat_c<-mat[index,]
179
    rownames(mat_c)<-rownames(mat)[index]
180
    colnames(mat_c)<-colnames(mat)
181
    return (mat_c)
182
  }
183
184
  base.gaussian.row<-function(arr)
185
  {
186
    idx<-!is.na(arr)
187
    return (mean(arr[idx]))
188
  }
189
190
  base.gaussian<-function(mat)
191
  {
192
    mat_b<-matrix(0,nrow(mat),ncol(mat))
193
    ar_b<-apply(mat,1,base.gaussian.row)
194
    mat_b[1:nrow(mat_b),]<-ar_b
195
    return (mat_b)
196
  }
197
198
  update.gaussian<-function(mat,mat_b,mat_now,eps)
199
  {
200
    mat_p<-mat_b+mat_now
201
    mat_u<-matrix(0,nrow(mat),ncol(mat))
202
    index<-!is.na(mat)
203
    mat_u[index]<-mat_now[index]+eps*epsilon.gaussian*(mat[index]-mat_p[index])
204
    index<-is.na(mat)
205
    mat_u[index]<-mat_now[index]
206
    return (mat_u)
207
  }
208
209
  stop.gaussian<-function(mat,mat_b,mat_now,mat_u)
210
  {
211
    index<-!is.na(mat)
212
    mn<-mat_b+mat_now
213
    mu<-mat_b+mat_u
214
    ren<-mat[index]-mn[index]
215
    reu<-mat[index]-mu[index]
216
    lgn<- -0.5*sum(ren*ren)
217
    lgu<- -0.5*sum(reu*reu)
218
    return (lgu-lgn)
219
  }
220
221
  LL.gaussian<-function(mat,mat_b,mat_u)
222
  {
223
    index<-!is.na(mat)
224
    mu<-mat_b+mat_u
225
    reu<-mat[index]-mu[index]
226
    lgu<- -0.5*sum(reu*reu)
227
    return (lgu)
228
  }
229
230
  LLmax.gaussian<-function(mat)
231
  {
232
    return (0.0)
233
  }
234
235
  LLmin.gaussian<-function(mat,mat_b)
236
  {
237
    index<-!is.na(mat)
238
    reu<-mat[index]-mat_b[index]
239
    lgu<- -0.5*sum(reu*reu)
240
    return (lgu)
241
  }
242
243
  gaussian_base<-function(data,dimension=2,name="test")
244
  {
245
    data<-check.gaussian(data,name)
246
    data_b<-base.gaussian(data)
247
    data_now<-matrix(0,nrow(data),ncol(data))
248
    data_u<-update.gaussian(data,data_b,data_now)
249
    data_u<-nuclear_approximation(data_u,dimension)
250
    while(T)
251
    {
252
      thr<-stop.gaussian(data,data_b,data_now,data_u)
253
      message(thr)
254
      if (thr<0.2)
255
      {
256
        break
257
      }
258
      data_now<-data_u
259
      data_u<-update.gaussian(data,data_b,data_now)
260
      data_u<-nuclear_approximation(data_u,dimension)
261
    }
262
    return (data_now)
263
  }
264
265
  #---------#
266
  # poisson #
267
  #---------#
268
269
  epsilon.poisson<-0.5
270
271
  check.poisson.row<-function(arr)
272
  {
273
    if (sum(!is.na(arr))==0)
274
    {
275
      return (F)
276
    }
277
    else
278
    {
279
      idx<-!is.na(arr)
280
      if (sum(arr[idx]<0)>0)
281
      {
282
        return (F)
283
      }
284
      else
285
      {
286
        return (T)
287
      }
288
    }
289
  }
290
291
  check.poisson<-function(mat,name)
292
  {
293
    w<-paste(name," is poisson type. Add 1 to all counts",sep="")
294
    message(w)
295
    index<-apply(mat,1,check.poisson.row)
296
    n<-sum(!index)
297
    if (n>0)
298
    {
299
      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
300
      warning(w)
301
    }
302
    mat_c<-mat[index,]+1
303
    rownames(mat_c)<-rownames(mat)[index]
304
    colnames(mat_c)<-colnames(mat)
305
    return (mat_c)
306
  }
307
308
  base.poisson.row<-function(arr)
309
  {
310
    idx<-!is.na(arr)
311
    m<-sum(log(arr[idx]))
312
    n<-sum(idx)
313
    return(m/n)
314
  }
315
316
  base.poisson<-function(mat)
317
  {
318
    mat_b<-matrix(0,nrow(mat),ncol(mat))
319
    ar_b<-apply(mat,1,base.poisson.row)
320
    mat_b[1:nrow(mat_b),]<-ar_b
321
    return (mat_b)
322
  }
323
324
  update.poisson<-function(mat,mat_b,mat_now,eps)
325
  {
326
    mat_p<-mat_b+mat_now
327
    mat_u<-matrix(0,nrow(mat),ncol(mat))
328
    index<-!is.na(mat)
329
    mat_u[index]<-mat_now[index]+eps*epsilon.poisson*(log(mat[index])-mat_p[index])
330
    index<-is.na(mat)
331
    mat_u[index]<-mat_now[index]
332
    return (mat_u)
333
  }
334
335
  stop.poisson<-function(mat,mat_b,mat_now,mat_u)
336
  {
337
    index<-!is.na(mat)
338
    mn<-mat_b+mat_now
339
    mu<-mat_b+mat_u
340
    lgn<-sum(mat[index]*mn[index]-exp(mn[index]))
341
    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
342
    return (lgu-lgn)
343
  }
344
345
  LL.poisson<-function(mat,mat_b,mat_u)
346
  {
347
    index<-!is.na(mat)
348
    mu<-mat_b+mat_u
349
    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
350
    return (lgu)
351
  }
352
353
  LLmax.poisson<-function(mat)
354
  {
355
    index<-!is.na(mat)
356
    lgu<-sum(mat[index]*log(mat[index])-mat[index])
357
    return (lgu)
358
  }
359
360
  LLmin.poisson<-function(mat,mat_b)
361
  {
362
    index<-!is.na(mat)
363
    lgu<-sum(mat[index]*mat_b[index]-exp(mat_b[index]))
364
    return (lgu)
365
  }
366
367
  poisson_type_base<-function(data,dimension=2,name="test")
368
  {
369
    data<-check.poisson(data,name)
370
    data_b<-base.poisson(data)
371
    data_now<-matrix(0,nrow(data),ncol(data))
372
    data_u<-update.poisson(data,data_b,data_now)
373
    data_u<-nuclear_approximation(data_u,dimension)
374
    while(T)
375
    {
376
      thr<-stop.poisson(data,data_b,data_now,data_u)
377
      message(thr)
378
      if (thr<0.2)
379
      {
380
        break
381
      }
382
      data_now<-data_u
383
      data_u<-update.poisson(data,data_b,data_now)
384
      data_u<-nuclear_approximation(data_u,dimension)
385
    }
386
    return (data_now)
387
  }
388
389
  #----#
390
  # na #
391
  #----#
392
393
  nuclear_approximation<-function(mat,dimension)
394
  {
395
    svd<-svd(mat,nu=0,nv=0)
396
    if (dimension<length(svd$d))
397
    {
398
      lambda<-svd$d[dimension+1]
399
      svd<-svd(mat,nu=dimension,nv=dimension)
400
      indexh<-svd$d>lambda
401
      indexm<-svd$d<lambda
402
      dia<-array(svd$d,length(svd$d))
403
      dia[indexh]<-dia[indexh]-lambda
404
      dia[indexm]<-0
405
      mat_low<-svd$u%*%diag(c(dia[1:dimension],0))[1:dimension,1:dimension]%*%t(svd$v)
406
    }
407
    else
408
    {
409
      mat_low<-mat
410
    }
411
    return (mat_low)
412
  }
413
414
  #------------#
415
  # LRAcluster #
416
  #------------#
417
  check.matrix.element<-function(x)
418
  {
419
    if (!is.matrix(x))
420
    {
421
      return (T)
422
    }
423
    else
424
    {
425
      return (F)
426
    }
427
  }
428
429
  ncol.element<-function(x)
430
  {
431
    return (ncol(x))
432
  }
433
434
  nrow.element<-function(x)
435
  {
436
    return (nrow(x))
437
  }
438
439
  check<-function(mat,type,name)
440
  {
441
    if (type=="binary")
442
    {
443
      return (check.binary(mat,name))
444
    }
445
    else if (type=="gaussian")
446
    {
447
      return (check.gaussian(mat,name))
448
    }
449
    else if (type=="poisson")
450
    {
451
      return (check.poisson(mat,name))
452
    }
453
    else
454
    {
455
      e<-paste("unknown type ",type,sep="")
456
      stop(e)
457
    }
458
  }
459
460
  eps<-0.0
461
  if (!is.list(data))
462
  {
463
    stop("the input data must be a list!")
464
  }
465
  c<-sapply(data,check.matrix.element)
466
  if (sum(c)>0)
467
  {
468
    stop("each element of input list must be a matrix!")
469
  }
470
  c<-sapply(data,ncol.element)
471
  if (length(levels(factor(c)))>1)
472
  {
473
    stop("each element of input list must have the same column number!")
474
  }
475
  if (length(data)!=length(types))
476
  {
477
    stop("data and types must be the same length!")
478
  }
479
  nSample<-c[1]
480
  loglmin<-0
481
  loglmax<-0
482
  loglu<-0.0
483
  nData<-length(data)
484
  for (i in 1:nData)
485
  {
486
    data[[i]]<-check(data[[i]],types[[i]],names[[i]])
487
  }
488
  nGeneArr<-sapply(data,nrow.element)
489
  nGene<-sum(nGeneArr)
490
  indexData<-list()
491
  k=1
492
  for(i in 1:nData)
493
  {
494
    indexData[[i]]<- (k):(k+nGeneArr[i]-1)
495
    k<-k+nGeneArr[i]
496
  }
497
  base<-matrix(0,nGene,nSample)
498
  now<-matrix(0,nGene,nSample)
499
  update<-matrix(0,nGene,nSample)
500
  thr<-array(0,nData)
501
  for (i in 1:nData)
502
  {
503
    if (types[[i]]=="binary")
504
    {
505
      base[indexData[[i]],]<-base.binary(data[[i]])
506
      loglmin<-loglmin+LLmin.binary(data[[i]],base[indexData[[i]],])
507
      loglmax<-loglmax+LLmax.binary(data[[i]])
508
    }
509
    else if (types[[i]]=="gaussian")
510
    {
511
      base[indexData[[i]],]<-base.gaussian(data[[i]])
512
      loglmin<-loglmin+LLmin.gaussian(data[[i]],base[indexData[[i]],])
513
      loglmax<-loglmax+LLmax.gaussian(data[[i]])
514
    }
515
    else if (types[[i]]=="poisson")
516
    {
517
      base[indexData[[i]],]<-base.poisson(data[[i]])
518
      loglmin<-loglmin+LLmin.poisson(data[[i]],base[indexData[[i]],])
519
      loglmax<-loglmax+LLmax.poisson(data[[i]])
520
    }
521
  }
522
  for (i in 1:nData)
523
  {
524
    if (types[[i]]=="binary")
525
    {
526
      update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
527
    }
528
    else if (types[[i]]=="gaussian")
529
    {
530
      update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
531
    }
532
    else if (types[[i]]=="poisson")
533
    {
534
      update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
535
    }
536
  }
537
  update<-nuclear_approximation(update,dimension)
538
  nIter<-0
539
  thres<-array(Inf,3)
540
  epsN<-array(Inf,2)
541
  while(T)
542
  {
543
    for (i in 1:nData)
544
    {
545
      if (types[[i]]=="binary")
546
      {
547
        thr[i]<-stop.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
548
      }
549
      else if (types[[i]]=="gaussian")
550
      {
551
        thr[i]<-stop.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
552
      }
553
      else if (types[[i]]=="poisson")
554
      {
555
        thr[i]<-stop.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
556
      }
557
    }
558
    nIter<-nIter+1
559
    thres[1]<-thres[2]
560
    thres[2]<-thres[3]
561
    thres[3]<-sum(thr)
562
    epsN[1]<-epsN[2]
563
    epsN[2]<-eps
564
    if (nIter>5)
565
    {
566
      if (runif(1)<thres[1]*thres[3]/(thres[2]*thres[2]+thres[1]*thres[3]))
567
      {
568
        eps<-epsN[1]+0.05*runif(1)-0.025
569
      }
570
      else
571
      {
572
        eps<-epsN[2]+0.05*runif(1)-0.025
573
      }
574
      if (eps< -0.7)
575
      {
576
        eps<- 0
577
        epsN<-c(0,0)
578
      }
579
      if (eps > 1.4)
580
      {
581
        eps<-0
582
        epsN<-c(0,0)
583
      }
584
    }
585
    if (sum(thr)<nData*0.2)
586
    {
587
      break
588
    }
589
    now<-update
590
    for (i in 1:nData)
591
    {
592
      if (types[[i]]=="binary")
593
      {
594
        update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
595
      }
596
      else if (types[[i]]=="gaussian")
597
      {
598
        update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
599
      }
600
      else if (types[[i]]=="poisson")
601
      {
602
        update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
603
      }
604
    }
605
    update<-nuclear_approximation(update,dimension)
606
  }
607
  for (i in 1:nData)
608
  {
609
    if (types[[i]]=="binary")
610
    {
611
      loglu<-loglu+LL.binary(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
612
    }
613
    else if (types[[i]]=="gaussian")
614
    {
615
      loglu<-loglu+LL.gaussian(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
616
    }
617
    else if (types[[i]]=="poisson")
618
    {
619
      loglu<-loglu+LL.poisson(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
620
    }
621
  }
622
  sv<-svd(update,nu=0,nv=dimension)
623
  coordinate<-diag(c(sv$d[1:dimension],0))[1:dimension,1:dimension]%*%t(sv$v)
624
  colnames(coordinate)<-colnames(data[[1]])
625
  rownames(coordinate)<-paste("PC ",as.character(1:dimension),sep="")
626
  ratio<-(loglu-loglmin)/(loglmax-loglmin)
627
  return (list("coordinate"=coordinate,"potential"=ratio))
628
}