MOVICS / Git / Diff of /R/LRAcluster.R

Models:
AlyssaS/
MOVICS
Downloads: 1
Diff of /R/LRAcluster.R [000000] .. [494cbf]
Switch to side-by-side view

--- a
+++ b/R/LRAcluster.R
@@ -0,0 +1,628 @@
+#' @name LRAcluster
+#' @aliases LRAcluster
+#' @title integrated analysis of cancer omics data by low-rank approximation
+#' @description The LRAcluster function is the main interface of this package, it gets a list of matrix as input and outputs the coordinates of the samples in the reduced space and the explained potential.
+#' @param data a list of data matrix,please keep the columns are the same order of samples
+#' @param types a list of data types, can be binary, gaussian, or poisson
+#' @param dimension the reduced dimension
+#' @param names data names
+#' @return A list contains the following component:
+#'
+#'         \code{coordinate} A matrix of the coordinates of all the samples in the reduced space
+#'
+#'         \code{potential}  ratio of explained variance
+#' @keywords internal
+#' @author Dingming Wu, Dongfang Wang
+#' @references Wu D, Wang D, Zhang MQ, Gu J (2015). Fast dimension reduction and integrative clustering of multi-omics data using low-rank approximation: application to cancer molecular classification. BMC Genomics, 16(1):1022.
+LRAcluster <- function(data, types, dimension = 2, names = as.character(1:length(data)))
+{
+  #--------#
+  # binary #
+  #--------#
+  epsilon.binary<-2.0
+  check.binary.row<-function(arr)
+  {
+    if (sum(!is.na(arr))==0)
+    {
+      return (F)
+    }
+    else
+    {
+      idx<-!is.na(arr)
+      if (sum(arr[idx])==0 || sum(arr[idx])==sum(idx))
+      {
+        return (F)
+      }
+      else
+      {
+        return (T)
+      }
+    }
+  }
+  check.binary<-function(mat,name)
+  {
+    index<-apply(mat,1,check.binary.row)
+    n<-sum(!index)
+    if (n>0)
+    {
+      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
+      warning(w)
+    }
+    mat_c<-mat[index,]
+    rownames(mat_c)<-rownames(mat)[index]
+    colnames(mat_c)<-colnames(mat)
+    return (mat_c)
+  }
+
+  base.binary.row<-function(arr)
+  {
+    idx<-!is.na(arr)
+    n<-sum(idx)
+    m<-sum(arr[idx])
+    return (log(m/(n-m)))
+  }
+
+  base.binary<-function(mat)
+  {
+    mat_b<-matrix(0,nrow(mat),ncol(mat))
+    ar_b<-apply(mat,1,base.binary.row)
+    mat_b[1:nrow(mat_b),]<-ar_b
+    return (mat_b)
+  }
+
+  update.binary<-function(mat,mat_b,mat_now,eps)
+  {
+    mat_p<-mat_b+mat_now
+    mat_u<-matrix(0,nrow(mat),ncol(mat))
+    idx1<-!is.na(mat) & mat==1
+    idx0<-!is.na(mat) & mat==0
+    index<-is.na(mat)
+    arr<-exp(mat_p)
+    mat_u[index]<-mat_now[index]
+    mat_u[idx1]<-mat_now[idx1]+eps*epsilon.binary/(1.0+arr[idx1])
+    mat_u[idx0]<-mat_now[idx0]-eps*epsilon.binary*arr[idx0]/(1.0+arr[idx0])
+    return (mat_u)
+  }
+
+  stop.binary<-function(mat,mat_b,mat_now,mat_u)
+  {
+    index<-!is.na(mat)
+    mn<-mat_b+mat_now
+    mu<-mat_b+mat_u
+    arn<-exp(mn)
+    aru<-exp(mu)
+    idx1<-!is.na(mat) & mat==1
+    idx0<-!is.na(mat) & mat==0
+    lgn<-sum(log(arn[idx1]/(1+arn[idx1])))+sum(log(1/(1+arn[idx0])))
+    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
+    return (lgu-lgn)
+  }
+
+  LL.binary<-function(mat,mat_b,mat_u)
+  {
+    index<-!is.na(mat)
+    mu<-mat_b+mat_u
+    aru<-exp(mu)
+    idx1<-!is.na(mat) & mat==1
+    idx0<-!is.na(mat) & mat==0
+    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
+    return (lgu)
+  }
+
+  LLmax.binary<-function(mat)
+  {
+    return (0)
+  }
+
+  LLmin.binary<-function(mat,mat_b)
+  {
+    index<-!is.na(mat)
+    aru<-exp(mat_b)
+    idx1<-!is.na(mat) & mat==1
+    idx0<-!is.na(mat) & mat==0
+    lgu<-sum(log(aru[idx1]/(1+aru[idx1])))+sum(log(1/(1+aru[idx0])))
+    return (lgu)
+  }
+
+  binary_type_base <- function( data,dimension=2 ,name="test")
+  {
+    data<-check.binary(data,name)
+    data_b<-base.binary(data)
+    data_now<-matrix(0,nrow(data),ncol(data))
+    data_u<-update.binary(data,data_b,data_now)
+    data_u<-nuclear_approximation(data_u,dimension)
+    while (T)
+    {
+      thr<-stop.binary(data,data_b,data_now,data_u)
+      message(thr)
+      if (thr<0.2)
+      {
+        break
+      }
+      data_now<-data_u
+      data_u<-update.binary(data,data_b,data_now)
+      data_u<-nuclear_approximation(data_u,dimension)
+    }
+    return (data_now)
+  }
+
+  #----------#
+  # gaussian #
+  #----------#
+
+  epsilon.gaussian=0.5
+
+  check.gaussian.row<-function(arr)
+  {
+    if (sum(!is.na(arr))==0)
+    {
+      return (F)
+    }
+    else
+    {
+      return (T)
+    }
+  }
+  check.gaussian<-function(mat,name)
+  {
+    index<-array(T,nrow(mat))
+    for(i in 1:nrow(mat))
+    {
+      if (sum(is.na(mat[i,])==ncol(mat)))
+      {
+        war<-paste("Warning: ",name,"'s ",as.character(i)," line is all NA. Delete this line",sep="")
+        warning(war)
+        index[i]<-F
+      }
+    }
+    mat_c<-mat[index,]
+    rownames(mat_c)<-rownames(mat)[index]
+    colnames(mat_c)<-colnames(mat)
+    return (mat_c)
+  }
+
+  base.gaussian.row<-function(arr)
+  {
+    idx<-!is.na(arr)
+    return (mean(arr[idx]))
+  }
+
+  base.gaussian<-function(mat)
+  {
+    mat_b<-matrix(0,nrow(mat),ncol(mat))
+    ar_b<-apply(mat,1,base.gaussian.row)
+    mat_b[1:nrow(mat_b),]<-ar_b
+    return (mat_b)
+  }
+
+  update.gaussian<-function(mat,mat_b,mat_now,eps)
+  {
+    mat_p<-mat_b+mat_now
+    mat_u<-matrix(0,nrow(mat),ncol(mat))
+    index<-!is.na(mat)
+    mat_u[index]<-mat_now[index]+eps*epsilon.gaussian*(mat[index]-mat_p[index])
+    index<-is.na(mat)
+    mat_u[index]<-mat_now[index]
+    return (mat_u)
+  }
+
+  stop.gaussian<-function(mat,mat_b,mat_now,mat_u)
+  {
+    index<-!is.na(mat)
+    mn<-mat_b+mat_now
+    mu<-mat_b+mat_u
+    ren<-mat[index]-mn[index]
+    reu<-mat[index]-mu[index]
+    lgn<- -0.5*sum(ren*ren)
+    lgu<- -0.5*sum(reu*reu)
+    return (lgu-lgn)
+  }
+
+  LL.gaussian<-function(mat,mat_b,mat_u)
+  {
+    index<-!is.na(mat)
+    mu<-mat_b+mat_u
+    reu<-mat[index]-mu[index]
+    lgu<- -0.5*sum(reu*reu)
+    return (lgu)
+  }
+
+  LLmax.gaussian<-function(mat)
+  {
+    return (0.0)
+  }
+
+  LLmin.gaussian<-function(mat,mat_b)
+  {
+    index<-!is.na(mat)
+    reu<-mat[index]-mat_b[index]
+    lgu<- -0.5*sum(reu*reu)
+    return (lgu)
+  }
+
+  gaussian_base<-function(data,dimension=2,name="test")
+  {
+    data<-check.gaussian(data,name)
+    data_b<-base.gaussian(data)
+    data_now<-matrix(0,nrow(data),ncol(data))
+    data_u<-update.gaussian(data,data_b,data_now)
+    data_u<-nuclear_approximation(data_u,dimension)
+    while(T)
+    {
+      thr<-stop.gaussian(data,data_b,data_now,data_u)
+      message(thr)
+      if (thr<0.2)
+      {
+        break
+      }
+      data_now<-data_u
+      data_u<-update.gaussian(data,data_b,data_now)
+      data_u<-nuclear_approximation(data_u,dimension)
+    }
+    return (data_now)
+  }
+
+  #---------#
+  # poisson #
+  #---------#
+
+  epsilon.poisson<-0.5
+
+  check.poisson.row<-function(arr)
+  {
+    if (sum(!is.na(arr))==0)
+    {
+      return (F)
+    }
+    else
+    {
+      idx<-!is.na(arr)
+      if (sum(arr[idx]<0)>0)
+      {
+        return (F)
+      }
+      else
+      {
+        return (T)
+      }
+    }
+  }
+
+  check.poisson<-function(mat,name)
+  {
+    w<-paste(name," is poisson type. Add 1 to all counts",sep="")
+    message(w)
+    index<-apply(mat,1,check.poisson.row)
+    n<-sum(!index)
+    if (n>0)
+    {
+      w<-paste("Warning: ",name," have ",as.character(n)," invalid lines!",sep="")
+      warning(w)
+    }
+    mat_c<-mat[index,]+1
+    rownames(mat_c)<-rownames(mat)[index]
+    colnames(mat_c)<-colnames(mat)
+    return (mat_c)
+  }
+
+  base.poisson.row<-function(arr)
+  {
+    idx<-!is.na(arr)
+    m<-sum(log(arr[idx]))
+    n<-sum(idx)
+    return(m/n)
+  }
+
+  base.poisson<-function(mat)
+  {
+    mat_b<-matrix(0,nrow(mat),ncol(mat))
+    ar_b<-apply(mat,1,base.poisson.row)
+    mat_b[1:nrow(mat_b),]<-ar_b
+    return (mat_b)
+  }
+
+  update.poisson<-function(mat,mat_b,mat_now,eps)
+  {
+    mat_p<-mat_b+mat_now
+    mat_u<-matrix(0,nrow(mat),ncol(mat))
+    index<-!is.na(mat)
+    mat_u[index]<-mat_now[index]+eps*epsilon.poisson*(log(mat[index])-mat_p[index])
+    index<-is.na(mat)
+    mat_u[index]<-mat_now[index]
+    return (mat_u)
+  }
+
+  stop.poisson<-function(mat,mat_b,mat_now,mat_u)
+  {
+    index<-!is.na(mat)
+    mn<-mat_b+mat_now
+    mu<-mat_b+mat_u
+    lgn<-sum(mat[index]*mn[index]-exp(mn[index]))
+    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
+    return (lgu-lgn)
+  }
+
+  LL.poisson<-function(mat,mat_b,mat_u)
+  {
+    index<-!is.na(mat)
+    mu<-mat_b+mat_u
+    lgu<-sum(mat[index]*mu[index]-exp(mu[index]))
+    return (lgu)
+  }
+
+  LLmax.poisson<-function(mat)
+  {
+    index<-!is.na(mat)
+    lgu<-sum(mat[index]*log(mat[index])-mat[index])
+    return (lgu)
+  }
+
+  LLmin.poisson<-function(mat,mat_b)
+  {
+    index<-!is.na(mat)
+    lgu<-sum(mat[index]*mat_b[index]-exp(mat_b[index]))
+    return (lgu)
+  }
+
+  poisson_type_base<-function(data,dimension=2,name="test")
+  {
+    data<-check.poisson(data,name)
+    data_b<-base.poisson(data)
+    data_now<-matrix(0,nrow(data),ncol(data))
+    data_u<-update.poisson(data,data_b,data_now)
+    data_u<-nuclear_approximation(data_u,dimension)
+    while(T)
+    {
+      thr<-stop.poisson(data,data_b,data_now,data_u)
+      message(thr)
+      if (thr<0.2)
+      {
+        break
+      }
+      data_now<-data_u
+      data_u<-update.poisson(data,data_b,data_now)
+      data_u<-nuclear_approximation(data_u,dimension)
+    }
+    return (data_now)
+  }
+
+  #----#
+  # na #
+  #----#
+
+  nuclear_approximation<-function(mat,dimension)
+  {
+    svd<-svd(mat,nu=0,nv=0)
+    if (dimension<length(svd$d))
+    {
+      lambda<-svd$d[dimension+1]
+      svd<-svd(mat,nu=dimension,nv=dimension)
+      indexh<-svd$d>lambda
+      indexm<-svd$d<lambda
+      dia<-array(svd$d,length(svd$d))
+      dia[indexh]<-dia[indexh]-lambda
+      dia[indexm]<-0
+      mat_low<-svd$u%*%diag(c(dia[1:dimension],0))[1:dimension,1:dimension]%*%t(svd$v)
+    }
+    else
+    {
+      mat_low<-mat
+    }
+    return (mat_low)
+  }
+
+  #------------#
+  # LRAcluster #
+  #------------#
+  check.matrix.element<-function(x)
+  {
+    if (!is.matrix(x))
+    {
+      return (T)
+    }
+    else
+    {
+      return (F)
+    }
+  }
+
+  ncol.element<-function(x)
+  {
+    return (ncol(x))
+  }
+
+  nrow.element<-function(x)
+  {
+    return (nrow(x))
+  }
+
+  check<-function(mat,type,name)
+  {
+    if (type=="binary")
+    {
+      return (check.binary(mat,name))
+    }
+    else if (type=="gaussian")
+    {
+      return (check.gaussian(mat,name))
+    }
+    else if (type=="poisson")
+    {
+      return (check.poisson(mat,name))
+    }
+    else
+    {
+      e<-paste("unknown type ",type,sep="")
+      stop(e)
+    }
+  }
+
+  eps<-0.0
+  if (!is.list(data))
+  {
+    stop("the input data must be a list!")
+  }
+  c<-sapply(data,check.matrix.element)
+  if (sum(c)>0)
+  {
+    stop("each element of input list must be a matrix!")
+  }
+  c<-sapply(data,ncol.element)
+  if (length(levels(factor(c)))>1)
+  {
+    stop("each element of input list must have the same column number!")
+  }
+  if (length(data)!=length(types))
+  {
+    stop("data and types must be the same length!")
+  }
+  nSample<-c[1]
+  loglmin<-0
+  loglmax<-0
+  loglu<-0.0
+  nData<-length(data)
+  for (i in 1:nData)
+  {
+    data[[i]]<-check(data[[i]],types[[i]],names[[i]])
+  }
+  nGeneArr<-sapply(data,nrow.element)
+  nGene<-sum(nGeneArr)
+  indexData<-list()
+  k=1
+  for(i in 1:nData)
+  {
+    indexData[[i]]<- (k):(k+nGeneArr[i]-1)
+    k<-k+nGeneArr[i]
+  }
+  base<-matrix(0,nGene,nSample)
+  now<-matrix(0,nGene,nSample)
+  update<-matrix(0,nGene,nSample)
+  thr<-array(0,nData)
+  for (i in 1:nData)
+  {
+    if (types[[i]]=="binary")
+    {
+      base[indexData[[i]],]<-base.binary(data[[i]])
+      loglmin<-loglmin+LLmin.binary(data[[i]],base[indexData[[i]],])
+      loglmax<-loglmax+LLmax.binary(data[[i]])
+    }
+    else if (types[[i]]=="gaussian")
+    {
+      base[indexData[[i]],]<-base.gaussian(data[[i]])
+      loglmin<-loglmin+LLmin.gaussian(data[[i]],base[indexData[[i]],])
+      loglmax<-loglmax+LLmax.gaussian(data[[i]])
+    }
+    else if (types[[i]]=="poisson")
+    {
+      base[indexData[[i]],]<-base.poisson(data[[i]])
+      loglmin<-loglmin+LLmin.poisson(data[[i]],base[indexData[[i]],])
+      loglmax<-loglmax+LLmax.poisson(data[[i]])
+    }
+  }
+  for (i in 1:nData)
+  {
+    if (types[[i]]=="binary")
+    {
+      update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+    }
+    else if (types[[i]]=="gaussian")
+    {
+      update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+    }
+    else if (types[[i]]=="poisson")
+    {
+      update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+    }
+  }
+  update<-nuclear_approximation(update,dimension)
+  nIter<-0
+  thres<-array(Inf,3)
+  epsN<-array(Inf,2)
+  while(T)
+  {
+    for (i in 1:nData)
+    {
+      if (types[[i]]=="binary")
+      {
+        thr[i]<-stop.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
+      }
+      else if (types[[i]]=="gaussian")
+      {
+        thr[i]<-stop.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
+      }
+      else if (types[[i]]=="poisson")
+      {
+        thr[i]<-stop.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],update[indexData[[i]],])
+      }
+    }
+    nIter<-nIter+1
+    thres[1]<-thres[2]
+    thres[2]<-thres[3]
+    thres[3]<-sum(thr)
+    epsN[1]<-epsN[2]
+    epsN[2]<-eps
+    if (nIter>5)
+    {
+      if (runif(1)<thres[1]*thres[3]/(thres[2]*thres[2]+thres[1]*thres[3]))
+      {
+        eps<-epsN[1]+0.05*runif(1)-0.025
+      }
+      else
+      {
+        eps<-epsN[2]+0.05*runif(1)-0.025
+      }
+      if (eps< -0.7)
+      {
+        eps<- 0
+        epsN<-c(0,0)
+      }
+      if (eps > 1.4)
+      {
+        eps<-0
+        epsN<-c(0,0)
+      }
+    }
+    if (sum(thr)<nData*0.2)
+    {
+      break
+    }
+    now<-update
+    for (i in 1:nData)
+    {
+      if (types[[i]]=="binary")
+      {
+        update[indexData[[i]],]<-update.binary(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+      }
+      else if (types[[i]]=="gaussian")
+      {
+        update[indexData[[i]],]<-update.gaussian(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+      }
+      else if (types[[i]]=="poisson")
+      {
+        update[indexData[[i]],]<-update.poisson(data[[i]],base[indexData[[i]],],now[indexData[[i]],],exp(eps))
+      }
+    }
+    update<-nuclear_approximation(update,dimension)
+  }
+  for (i in 1:nData)
+  {
+    if (types[[i]]=="binary")
+    {
+      loglu<-loglu+LL.binary(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
+    }
+    else if (types[[i]]=="gaussian")
+    {
+      loglu<-loglu+LL.gaussian(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
+    }
+    else if (types[[i]]=="poisson")
+    {
+      loglu<-loglu+LL.poisson(data[[i]],base[indexData[[i]],],update[indexData[[i]],])
+    }
+  }
+  sv<-svd(update,nu=0,nv=dimension)
+  coordinate<-diag(c(sv$d[1:dimension],0))[1:dimension,1:dimension]%*%t(sv$v)
+  colnames(coordinate)<-colnames(data[[1]])
+  rownames(coordinate)<-paste("PC ",as.character(1:dimension),sep="")
+  ratio<-(loglu-loglmin)/(loglmax-loglmin)
+  return (list("coordinate"=coordinate,"potential"=ratio))
+}