Diff of /OmicsFold/R/MixMC.R [000000] .. [e26484]

#' Divide a sample row by its total
#'
#' @description
#' Internal function for dividing a complete sample (row) by its total count in
#' Total Sum Scaling (TSS).
#'
#' @param x Row of counts to divide by its total.
#'
#' @return TSS-scaled row, or the row unchanged if its total is zero.
.TSS.divide = function(x){
  if (sum(x) > 0)
    return (x/sum(x))
  else
    return (x)
}
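
# Illustrative sketch (not part of the original file): how .TSS.divide() behaves
# on a single row of counts. The toy vectors below are hypothetical.
#
#   .TSS.divide(c(10, 30, 60))  # 0.1 0.3 0.6 - each count as a proportion of the row total
#   .TSS.divide(c(0, 0, 0))     # returned unchanged, avoiding division by zero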

#' Remove features with low counts across samples
#'
#' @description
#' Prefilter omics analysis input data in count form (e.g. OTUs) to remove
#' features whose total count is less than a (small) proportion of the total
#' measured counts. The default threshold is one part in 10,000 (0.01\%) - this
#' is usually sufficient to remove very low-count variables, which will be
#' unreliable features for model prediction.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#' @param percent Cutoff as a percentage of total counts, defaulting to 0.01.
#'
#' @return Data frame of input data, filtered to omit features below the count
#' proportion threshold.
#' @export
#'
#' @examples
#' \dontrun{
#' low.count.removal(raw.count)
#' }
low.count.removal = function(otu.counts, percent=0.01) {
  keep.otu <-
      which(colSums(otu.counts) * 100 / sum(colSums(otu.counts)) > percent)
  data.filter <- otu.counts[, keep.otu]
  return(data.filter)
}
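
# Illustrative sketch (not part of the original file): filtering a hypothetical
# OTU count table with low.count.removal(). otu2 contributes 1 of 11501 total
# counts (~0.009\%), below the default 0.01\% threshold, so it is dropped.
#
#   toy.counts <- data.frame(otu1 = c(5000, 6000),
#                            otu2 = c(0, 1),
#                            otu3 = c(300, 200))
#   filtered <- low.count.removal(toy.counts, percent = 0.01)
#   colnames(filtered)  # "otu1" "otu3"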

#' Apply Total Sum Scaling normalisation
#'
#' @description
#' Apply Total Sum Scaling (TSS) normalisation to count data, to account for
#' differences in count (e.g. sequencing) depths between samples. Expressing
#' each feature as a proportion of the total sample counts is the conventional
#' way of normalising OTU count data. Optionally include an offset to avoid
#' division/log zero problems - this defaults to zero, but 1 (count) is usually
#' appropriate for any count data with totals of thousands of counts or more.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#' @param offset Offset to apply, defaulting to zero.
#'
#' @return Data frame containing count data normalised as proportions of total
#' sample counts.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.tss(otu.count, offset=1)
#' }
normalise.tss = function(otu.counts, offset=0) {
  counts.offset <- otu.counts + offset
  return(t(apply(counts.offset, 1, .TSS.divide)))
}
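
# Illustrative sketch (not part of the original file): TSS normalisation of a
# hypothetical count table, with and without a pseudo-count offset of 1.
#
#   toy.counts <- data.frame(otu1 = c(900, 10), otu2 = c(100, 90))
#   normalise.tss(toy.counts)             # row 1 -> 0.9, 0.1; row 2 -> 0.1, 0.9
#   normalise.tss(toy.counts, offset = 1) # proportions of (counts + 1); rows still sum to 1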

#' Apply Cumulative Sum Scaling normalisation
#'
#' @description
#' Alternative Cumulative Sum Scaling (CSS) method for normalising count data
#' for inter-sample depth. Relies upon the metagenomeSeq implementation.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#'
#' @return Data frame containing CSS-normalised count data on a log scale.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.css(otu.count)
#' }
normalise.css = function(otu.counts) {
  data.metagenomeSeq <- metagenomeSeq::newMRexperiment(t(otu.counts),
                                                       featureData=NULL,
                                                       libSize=NULL,
                                                       normFactors=NULL)
  p <- metagenomeSeq::cumNormStat(data.metagenomeSeq)
  data.cumnorm <- metagenomeSeq::cumNorm(data.metagenomeSeq, p=p)
  otu.css <- t(metagenomeSeq::MRcounts(data.cumnorm, norm=TRUE, log=TRUE))
  return(otu.css)
}
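
# Illustrative sketch (not part of the original file): CSS-normalising a raw OTU
# count table. Requires the metagenomeSeq Bioconductor package to be installed;
# otu.count is a hypothetical n x p count data frame. Because MRcounts() is
# called with log=TRUE above, the returned values are on a log scale.
#
#   otu.css <- normalise.css(otu.count)
#   dim(otu.css)  # same n x p orientation as the input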

#' Apply the logit function to a single feature
#'
#' @description
#' Internal function for "empirical" logit normalisation of a feature (column)
#' of data. The empirical logit function differs from standard logit
#' normalisation in that an epsilon factor is added to ensure that the function
#' does not tend to +/- infinity for input values close to 100\% and 0\%
#' respectively.
#'
#' @param feature Feature column.
#'
#' @return Normalised feature column.
.normalise.logit.feature = function(feature) {
  epsilon.min <- min(feature)
  epsilon.max <- 1-max(feature)
  epsilon <- min(epsilon.min, epsilon.max)

  # Clamp the smoothing factor epsilon to the range [0.01, 0.1]
  epsilon <- max(epsilon, 0.01)
  epsilon <- min(epsilon, 0.1)

  return(log((feature + epsilon)/(1 - feature + epsilon)))
}
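
# Illustrative sketch (not part of the original file): the epsilon clamping in
# .normalise.logit.feature() on a hypothetical proportion column. The smallest
# distance from the 0/1 boundary here is 0 (the column contains an exact 0), so
# epsilon is clamped up to 0.01 and the logit stays finite.
#
#   feature <- c(0, 0.25, 0.5, 0.95)
#   .normalise.logit.feature(feature)
#   # equivalent to log((feature + 0.01) / (1 - feature + 0.01))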

#' Normalise using the logit function in an empirical manner
#'
#' @description
#' Apply the empirical logit normalisation to a data frame of omics input data.
#' This is intended to convert compositional data, e.g. proportional data in the
#' range 0..1, to Euclidean space, which is most appropriate for the linear
#' models. The empirical logit function differs from standard logit
#' normalisation in that an epsilon factor is added to ensure that the function
#' does not tend to +/- infinity for input values close to 100\% and 0\%
#' respectively. The logit or empirical logit function will be a more
#' appropriate choice than centred log-ratio (CLR) for non-OTU data.
#'
#' @param input Data frame of input compositional data to normalise. Input data
#' should be proportions 0-1.
#'
#' @return Data normalised using the empirical logit. Proportions below 0.5 will
#' be negative, but output will not tend to infinity for zero or 1 input.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.logit.empirical(data.proportional)
#' }
normalise.logit.empirical = function(input) {
  normalised <- apply(input, 2, .normalise.logit.feature)
  rownames(normalised) <- rownames(input)
  return(normalised)
}
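
# Illustrative sketch (not part of the original file): empirical logit
# normalisation of a hypothetical proportion table, e.g. TSS output. Values of
# exactly 0 or 1 remain finite because of the per-feature epsilon.
#
#   data.proportional <- normalise.tss(otu.count)
#   data.logit <- normalise.logit.empirical(data.proportional)
#   range(data.logit)  # finite even if the input contains exact 0s or 1s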

#' Normalise using the logit function
#'
#' @description
#' Apply the standard logit normalisation to a data frame of omics input data.
#' This is intended to convert compositional data, e.g. proportional data in the
#' range 0..1, to Euclidean space, which is most appropriate for the linear
#' models. The logit function will tend to +/- infinity for input values close
#' to 100\% and 0\% respectively. The logit or empirical logit function will be
#' a more appropriate choice than centred log-ratio (CLR) for non-OTU data.
#'
#' @param input Data frame of input compositional data to normalise. Input data
#' should be proportions 0-1.
#'
#' @return Data normalised using the standard logit. Proportions below 0.5 will
#' be negative, and output will tend to -/+ infinity for zero or 1 input.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.logit(data.proportional)
#' }
normalise.logit = function(input) {
  return(log(input/(1 - input)))
}
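
# Illustrative sketch (not part of the original file): the standard logit, shown
# here on plain numeric proportions for brevity, and its behaviour at the
# boundaries.
#
#   normalise.logit(0.5)      # 0
#   normalise.logit(0.9)      # log(0.9 / 0.1) ~ 2.197
#   normalise.logit(c(0, 1))  # -Inf, Inf - use the empirical variant instead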

#' Apply centred log-ratio normalisation
#'
#' @description
#' Apply centred log-ratio (CLR) normalisation to sum-scaled OTU count data.
#' This is another method for converting compositional data, i.e. proportional
#' data in the range 0..1, to Euclidean space, which is most appropriate for the
#' linear models. Note that this should only be applied to OTU data, as it
#' applies another inter-sample normalisation.
#'
#' @param input Scaled OTU data as proportions 0-1, e.g. output by
#' normalise.tss().
#' @param offset Optional offset to apply to raw data to avoid logging of zero
#' values. Only needed if any zeroes are present - should generally be set very
#' small, e.g. 0.000001.
#'
#' @return Data normalised by the CLR method.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.clr(otu.data.tss)
#' }
normalise.clr = function(input, offset=0) {
  normalised.clr <- mixOmics::logratio.transfo(X = as.matrix(input),
                                               logratio = 'CLR',
                                               offset = offset)
  # Annoyingly, the output object does not allow direct access to the matrix of
  # results. This is an easy way to return it.
  return(normalised.clr[,])
}
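
# Illustrative sketch (not part of the original file): CLR normalisation of
# TSS-scaled OTU proportions, with a small offset because zeros are present.
# otu.count is a hypothetical count data frame.
#
#   otu.tss <- normalise.tss(otu.count)
#   otu.clr <- normalise.clr(otu.tss, offset = 0.000001)
#   rowSums(otu.clr)  # approximately 0 per sample: CLR centres each row on its geometric mean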

#' Apply centred log-ratio normalisation within features only
#'
#' @description
#' Apply centred log-ratio (CLR) normalisation to other compositional data, but
#' restrict normalisation to *within* features only. This is another method for
#' converting compositional data, i.e. proportional data in the range 0..1, to
#' Euclidean space, which is most appropriate for the linear models. The
#' implementation is the same as CLR, but on transposed input data (which is
#' then transposed back to the input orientation). Note that this is
#' experimental, though it does give a sensible normalisation.
#'
#' @param input Data as proportions 0-1.
#' @param offset Optional offset to apply to raw data to avoid logging of zero
#' values. Only needed if any zeroes are present - should generally be set very
#' small, e.g. 0.000001.
#'
#' @return Data normalised by the within-feature CLR method.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.clr.within.features(data.proportional)
#' }
normalise.clr.within.features = function(input, offset=0) {
  normalised.clr <- mixOmics::logratio.transfo(X = t(as.matrix(input)),
                                               logratio = 'CLR',
                                               offset = offset)
  return(t(normalised.clr[,]))
}
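
# Illustrative sketch (not part of the original file): within-feature CLR of a
# hypothetical proportion table. Because the input is transposed before the CLR
# and transposed back afterwards, the centring happens down each column
# (feature) rather than across each row (sample).
#
#   data.clr.feat <- normalise.clr.within.features(data.proportional, offset = 0.000001)
#   colSums(data.clr.feat)  # approximately 0 per feature, mirroring rowSums() for normalise.clr()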