Omicsfold / Git / Diff of /OmicsFold/R/MixMC.R

Models:
AlyssaS/
Omicsfold
Downloads: 1
Diff of /OmicsFold/R/MixMC.R [000000] .. [e26484]
Switch to side-by-side view

--- a
+++ b/OmicsFold/R/MixMC.R
@@ -0,0 +1,237 @@
+#' Divide
+#'
+#' @description
+#' Internal function for dividing over complete samples in Total Sum Scaling
+#' (TSS).
+#'
+#' @param x Row to divide over.
+#'
+#' @return TSS scaled row.
+.TSS.divide = function(x){
+  if (sum(x) > 0)
+    return (x/sum(x))
+  else
+    return (x)
+}
+
+
+#' Remove features with low counts across samples
+#'
+#' @description
+#' Prefilter omics analysis input data in count form (e.g. OTUs) to remove
+#' features which have a total count less than a (small) proportion of the total
+#' measured counts. The default threshold is one part in 10,000 (0.01\%) - this
+#' is usually sufficient to remove very low-count variables, which will be
+#' unreliable features for model prediction.
+#'
+#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
+#' @param percent Cutoff chosen in percent, default to 0.01.
+#'
+#' @return Data frame of input data, filtered to omit features below the count
+#' proportion threshold.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' low.count.filter(raw.count)
+#' }
+low.count.removal = function(otu.counts, percent=0.01 ) {
+  keep.otu <-
+      which(colSums(otu.counts) * 100 / sum(colSums(otu.counts)) > percent)
+  data.filter <- otu.counts[, keep.otu]
+  return(data.filter)
+}
+
+
+#' Apply Total Sum Scaling normalisation
+#'
+#' @description
+#' Apply Total Sum Scaling (TSS) normalisation to count data, to account for
+#' differences in count (e.g. sequencing) depths between samples. Giving
+#' proportion of total sample counts, this is the conventional way of
+#' normalising OTU count data. Optionally include an offset to avoid
+#' division/log zero problems - this defaults to zero, but 1 (count) is usually
+#' appropriate for any count data with totals of thousands of counts or more.
+#'
+#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
+#' @param offset Offset to apply, defaulting to zero.
+#'
+#' @return Data frame containing count data normalised as proportion of total
+#' sample counts.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.tss(otu.count, offset=1)
+#' }
+normalise.tss = function(otu.counts, offset=0) {
+  offset <- otu.counts + offset
+  return(t(apply(offset, 1, .TSS.divide)))
+}
+
+
+#' Apply Cumulative Sum Scaling normalisation
+#'
+#' @description
+#' Alternate Cumulative Sum Scale (CSS) method for normalising count data for
+#' inter-sample depth. Relies upon the metagenomeSeq implementation.
+#'
+#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
+#'
+#' @return Data frame containing count data normalised cumulatively.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.css(otu.count)
+#' }
+normalise.css = function(otu.counts) {
+  data.metagenomeSeq <- metagenomeSeq::newMRexperiment(t(otu.counts),
+                                                       featureData=NULL,
+                                                       libSize=NULL,
+                                                       normFactors=NULL)
+  p <- metagenomeSeq::cumNormStat(data.metagenomeSeq)
+  data.cumnorm <- metagenomeSeq::cumNorm(data.metagenomeSeq, p=p)
+  otu.css <- t(metagenomeSeq::MRcounts(data.cumnorm, norm=TRUE, log=TRUE))
+  return(otu.css)
+}
+
+#' Apply the logit function to a single feature
+#'
+#' @description
+#' Internal function for "empirical" logit normalisation of a feature (column)
+#' of data. The empirical logit function differs for standard logit
+#' normalisation in that an epsilon factor is added to ensure that function does
+#' not tend to +/- infinity for input values close to 100\% and 0\%
+#' respectively.
+#'
+#' @param feature Feature column.
+#'
+#' @return Normalised feature column.
+.normalise.logit.feature = function(feature) {
+  epsilon.min <- min(feature)
+  epsilon.max <- 1-max(feature)
+  epsilon <- min(epsilon.min, epsilon.max)
+
+  # Set minimum and maximum values for the smoothing factor epsilon
+  epsilon <- max(epsilon, 0.01)
+  epsilon <- min(epsilon, 0.1)
+
+  return(log((feature + epsilon)/(1 - feature + epsilon)))
+}
+
+#' Normalise using the logit function in an empirical manner
+#'
+#' @description
+#' Apply the empirical logit normalisation to a data frame of omics input data.
+#' This is intended to convert compositional data, e.g. proportional data in the
+#' range 0..1, to Euclidean space which is most appropriate for the linear
+#' models. The empirical logit function differs for standard logit normalisation
+#' in that an epsilon factor is added to ensure that function does not tend to
+#' +/- infinity for input values close to 100\% and 0\% respectively. The logit
+#' or empirical logit function will be a more appropriate choice than centred
+#' log-ratio (CLR) for non-OTU data.
+#'
+#' @param input Data frame of input compositional data to normalise. Input data
+#' should be proportions 0-1.
+#'
+#' @return Data normalised using empirical logit. Proportions below 0.5 will be
+#' negative, but output will not tend to infinity for zero or 1 input.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.logit.empirical(data.proportional)
+#' }
+normalise.logit.empirical = function(input) {
+  normalised <- apply(input, 2, .normalise.logit.feature)
+  rownames(normalised) <- rownames(input)
+  return(normalised)
+}
+
+#' Normalise using the logit function
+#'
+#' @description
+#' Apply the standard logit normalisation to a data frame of omics input data.
+#' This is intended to convert compositional data, e.g. proportional data in the
+#' range 0..1, to Euclidean space which is most appropriate for the linear
+#' models. The logit function will tend to +/- infinity for input values close
+#' to 100% and 0% respectively. The logit or empirical logit function will be a
+#' more appropriate choice than centred log-ratio (CLR) for non-OTU data.
+#'
+#' @param input Data frame of input compositional data to normalise. Input data
+#' should be proportional 0-1.
+#'
+#' @return Data normalised using empirical logit. Proportions below 0.5 will be
+#' negative, and output will tend to -/+ infinity for zero or 1 input.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.logit(data.proportional)
+#' }
+normalise.logit = function(input) {
+  return(log(input/(1 - input)))
+}
+
+
+#' Apply centered log-ratio normalisation
+#'
+#' @description
+#' Apply centered log-ratio (CLR) normalisation to sum scaled OTU count data.
+#' This is another method for converting the compositional data, i.e.
+#' proportional data in the range 0..1 to Euclidean space which is most
+#' appropriate for the linear models. Note that this should only be applied to
+#' OTU data, as it applies another inter-sample normalisation.
+#'
+#' @param input Scaled OTU data as proportions 0-1, e.g. output by
+#' normalise.TSS().
+#' @param offset Optional offset to apply to raw data to avoid logging of zero
+#' values. Only needed if any zeroes are present - should generally be set very
+#' small, e.g. 0.000001.
+#'
+#' @return Data normalised by the CLR method.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.clr(otu.data.tss)
+#' }
+normalise.clr = function(input, offset=0) {
+  normalised.clr <- mixOmics::logratio.transfo(X = as.matrix(input),
+                                               logratio = 'CLR',
+                                               offset = offset)
+  # Annoyingly, the output object does not allow direct access the matrix of
+  # results. This is an easy way to return it.
+  return(normalised.clr[,])
+}
+
+#' Apply centered log-ratio normalisation within features only
+#'
+#' @description
+#' Apply centered log-ratio (CLR) normalisation to other compositional data, but
+#' restrict normalisation to *within* features only. This is another method for
+#' converting the compositional data, i.e. proportional data in the range 0..1
+#' to Euclidean space which is most appropriate for the linear models. The
+#' implementation is the same as CLR, but on transposed input data (which is
+#' then transposed back to the input orientation). Note this is experimental,
+#' though it does give a sensible normalisation.
+#'
+#' @param input Data as proportions 0-1.
+#' @param offset Optional offset to apply to raw data to avoid logging of zero
+#' values. Only needed if any zeroes are present - should generally be set very
+#' small, e.g. 0.000001.
+#'
+#' @return Data normalised by the within-feature CLR method
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' normalise.clr.within.features(data.proportional)
+#' }
+normalise.clr.within.features = function(input, offset=0) {
+  normalised.clr <- mixOmics::logratio.transfo(X = t(as.matrix(input)),
+                                               logratio = 'CLR',
+                                               offset = offset)
+  return(t(normalised.clr[,]))
+}