[e26484]: / OmicsFold / R / MixMC.R

Download this file

238 lines (223 with data), 8.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#' Divide
#'
#' @description
#' Internal function for dividing over complete samples in Total Sum Scaling
#' (TSS).
#'
#' @param x Row to divide over.
#'
#' @return TSS scaled row.
.TSS.divide = function(x){
if (sum(x) > 0)
return (x/sum(x))
else
return (x)
}
#' Remove features with low counts across samples
#'
#' @description
#' Prefilter omics analysis input data in count form (e.g. OTUs) to remove
#' features which have a total count less than a (small) proportion of the total
#' measured counts. The default threshold is one part in 10,000 (0.01\%) - this
#' is usually sufficient to remove very low-count variables, which will be
#' unreliable features for model prediction.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#' @param percent Cutoff chosen in percent, default to 0.01.
#'
#' @return Data frame of input data, filtered to omit features below the count
#' proportion threshold.
#' @export
#'
#' @examples
#' \dontrun{
#' low.count.filter(raw.count)
#' }
low.count.removal = function(otu.counts, percent=0.01 ) {
keep.otu <-
which(colSums(otu.counts) * 100 / sum(colSums(otu.counts)) > percent)
data.filter <- otu.counts[, keep.otu]
return(data.filter)
}
#' Apply Total Sum Scaling normalisation
#'
#' @description
#' Apply Total Sum Scaling (TSS) normalisation to count data, to account for
#' differences in count (e.g. sequencing) depths between samples. Giving
#' proportion of total sample counts, this is the conventional way of
#' normalising OTU count data. Optionally include an offset to avoid
#' division/log zero problems - this defaults to zero, but 1 (count) is usually
#' appropriate for any count data with totals of thousands of counts or more.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#' @param offset Offset to apply, defaulting to zero.
#'
#' @return Data frame containing count data normalised as proportion of total
#' sample counts.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.tss(otu.count, offset=1)
#' }
normalise.tss = function(otu.counts, offset=0) {
offset <- otu.counts + offset
return(t(apply(offset, 1, .TSS.divide)))
}
#' Apply Cumulative Sum Scaling normalisation
#'
#' @description
#' Alternate Cumulative Sum Scale (CSS) method for normalising count data for
#' inter-sample depth. Relies upon the metagenomeSeq implementation.
#'
#' @param otu.counts OTU count data frame of size n (sample) x p (OTU).
#'
#' @return Data frame containing count data normalised cumulatively.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.css(otu.count)
#' }
normalise.css = function(otu.counts) {
data.metagenomeSeq <- metagenomeSeq::newMRexperiment(t(otu.counts),
featureData=NULL,
libSize=NULL,
normFactors=NULL)
p <- metagenomeSeq::cumNormStat(data.metagenomeSeq)
data.cumnorm <- metagenomeSeq::cumNorm(data.metagenomeSeq, p=p)
otu.css <- t(metagenomeSeq::MRcounts(data.cumnorm, norm=TRUE, log=TRUE))
return(otu.css)
}
#' Apply the logit function to a single feature
#'
#' @description
#' Internal function for "empirical" logit normalisation of a feature (column)
#' of data. The empirical logit function differs for standard logit
#' normalisation in that an epsilon factor is added to ensure that function does
#' not tend to +/- infinity for input values close to 100\% and 0\%
#' respectively.
#'
#' @param feature Feature column.
#'
#' @return Normalised feature column.
.normalise.logit.feature = function(feature) {
epsilon.min <- min(feature)
epsilon.max <- 1-max(feature)
epsilon <- min(epsilon.min, epsilon.max)
# Set minimum and maximum values for the smoothing factor epsilon
epsilon <- max(epsilon, 0.01)
epsilon <- min(epsilon, 0.1)
return(log((feature + epsilon)/(1 - feature + epsilon)))
}
#' Normalise using the logit function in an empirical manner
#'
#' @description
#' Apply the empirical logit normalisation to a data frame of omics input data.
#' This is intended to convert compositional data, e.g. proportional data in the
#' range 0..1, to Euclidean space which is most appropriate for the linear
#' models. The empirical logit function differs for standard logit normalisation
#' in that an epsilon factor is added to ensure that function does not tend to
#' +/- infinity for input values close to 100\% and 0\% respectively. The logit
#' or empirical logit function will be a more appropriate choice than centred
#' log-ratio (CLR) for non-OTU data.
#'
#' @param input Data frame of input compositional data to normalise. Input data
#' should be proportions 0-1.
#'
#' @return Data normalised using empirical logit. Proportions below 0.5 will be
#' negative, but output will not tend to infinity for zero or 1 input.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.logit.empirical(data.proportional)
#' }
normalise.logit.empirical = function(input) {
normalised <- apply(input, 2, .normalise.logit.feature)
rownames(normalised) <- rownames(input)
return(normalised)
}
#' Normalise using the logit function
#'
#' @description
#' Apply the standard logit normalisation to a data frame of omics input data.
#' This is intended to convert compositional data, e.g. proportional data in the
#' range 0..1, to Euclidean space which is most appropriate for the linear
#' models. The logit function will tend to +/- infinity for input values close
#' to 100% and 0% respectively. The logit or empirical logit function will be a
#' more appropriate choice than centred log-ratio (CLR) for non-OTU data.
#'
#' @param input Data frame of input compositional data to normalise. Input data
#' should be proportional 0-1.
#'
#' @return Data normalised using empirical logit. Proportions below 0.5 will be
#' negative, and output will tend to -/+ infinity for zero or 1 input.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.logit(data.proportional)
#' }
normalise.logit = function(input) {
return(log(input/(1 - input)))
}
#' Apply centered log-ratio normalisation
#'
#' @description
#' Apply centered log-ratio (CLR) normalisation to sum scaled OTU count data.
#' This is another method for converting the compositional data, i.e.
#' proportional data in the range 0..1 to Euclidean space which is most
#' appropriate for the linear models. Note that this should only be applied to
#' OTU data, as it applies another inter-sample normalisation.
#'
#' @param input Scaled OTU data as proportions 0-1, e.g. output by
#' normalise.TSS().
#' @param offset Optional offset to apply to raw data to avoid logging of zero
#' values. Only needed if any zeroes are present - should generally be set very
#' small, e.g. 0.000001.
#'
#' @return Data normalised by the CLR method.
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.clr(otu.data.tss)
#' }
normalise.clr = function(input, offset=0) {
normalised.clr <- mixOmics::logratio.transfo(X = as.matrix(input),
logratio = 'CLR',
offset = offset)
# Annoyingly, the output object does not allow direct access the matrix of
# results. This is an easy way to return it.
return(normalised.clr[,])
}
#' Apply centered log-ratio normalisation within features only
#'
#' @description
#' Apply centered log-ratio (CLR) normalisation to other compositional data, but
#' restrict normalisation to *within* features only. This is another method for
#' converting the compositional data, i.e. proportional data in the range 0..1
#' to Euclidean space which is most appropriate for the linear models. The
#' implementation is the same as CLR, but on transposed input data (which is
#' then transposed back to the input orientation). Note this is experimental,
#' though it does give a sensible normalisation.
#'
#' @param input Data as proportions 0-1.
#' @param offset Optional offset to apply to raw data to avoid logging of zero
#' values. Only needed if any zeroes are present - should generally be set very
#' small, e.g. 0.000001.
#'
#' @return Data normalised by the within-feature CLR method
#' @export
#'
#' @examples
#' \dontrun{
#' normalise.clr.within.features(data.proportional)
#' }
normalise.clr.within.features = function(input, offset=0) {
normalised.clr <- mixOmics::logratio.transfo(X = t(as.matrix(input)),
logratio = 'CLR',
offset = offset)
return(t(normalised.clr[,]))
}