voomNSC.train =
function (counts, conditions, n.threshold = 30, offset.percent = 50, remove.zeros = TRUE,
normalization = c("TMM", "deseq", "none"))
{
normalization = match.arg(normalization)
this.call = match.call()
#### voom transformation steps
{
if (normalization == "TMM"){
design = model.matrix(~conditions)
rownames(design) = colnames(counts)
dge = DGEList(counts = counts)
dge = calcNormFactors(dge, method = "TMM")
vm = voom(dge, design, plot=F)
x = vm $ E #pxn dim gene expression matrix
w = vm $ weights #pxn dim weight matrix
dimnames(w) = dimnames(x)
}
else if (normalization == "deseq"){
design = model.matrix(~conditions)
rownames(design) = colnames(counts)
dge = DGEList(counts = counts)
dge = calcNormFactors(dge, method = "RLE")
vm = voom(dge, design, plot=F)
x = vm $ E #pxn dim gene expression matrix
w = vm $ weights #pxn dim weight matrix
dimnames(w) = dimnames(x)
}
else {
design = model.matrix(~conditions)
rownames(design) = colnames(counts)
vm = voom(counts, design, plot=F)
x = vm $ E #pxn dim gene expression matrix
w = vm $ weights #pxn dim weight matrix
dimnames(w) = dimnames(x)
}
}
y = as.factor(conditions)
n.class = table(y)
prior = n.class/length(y)
# soft.shrink(...)
soft.shrink =
function (delta, threshold)
{
dif = abs(delta) - threshold
delta = sign(delta) * dif * (dif > 0)
nonzero = sum(drop((dif > 0) %*% rep(1, ncol(delta))) > 0)
attr(delta, "nonzero") = nonzero
delta
}
# diag.disc(...)
diag.disc =
function (x, centroids, prior, weight)
{
if (!missing(weight)) {
posid = (weight > 0)
if (any(posid)) {
weight = sqrt(weight[posid])
centroids = centroids[posid, , drop = FALSE] * weight
x = x[posid, , drop = FALSE] * weight
}
else {
mat = outer(rep(1, ncol(x)), log(prior), "*")
dimnames(mat) = list(NULL, dimnames(centroids)[[2]])
return(mat)
}
}
dd = t(x) %*% centroids
dd0 = drop(rep(1, nrow(centroids)) %*% (centroids^2))/2 -
log(prior)
names(dd0) = NULL
scale(dd, dd0, FALSE)
}
# safe.exp(...)
safe.exp = function (x)
{
xx = sign(x) * pmin(abs(x), 500)
return(exp(xx))
}
#soft.max(...)
softmax =
function (x, gap = FALSE)
{
d = dim(x)
maxdist = x[, 1]
pclass = rep(1, d[1])
for (i in seq(2, d[2])) {
l = x[, i] > maxdist
pclass[l] = i
maxdist[l] = x[l, i]
}
dd = dimnames(x)[[2]]
if (gap) {
x = abs(maxdist - x)
x[cbind(seq(d[1]), pclass)] = drop(x %*% rep(1, d[2]))
gaps = do.call("pmin", data.frame(x))
}
pclass = if (is.null(dd) || !length(dd))
pclass
else factor(pclass, levels = seq(d[2]), labels = dd)
if (gap)
list(class = pclass, gaps = gaps)
else pclass
}
## wnsc(....)
wnsc =
function (x, y, n.threshold = 30, offset.percent = 50, prior = NULL, remove.zeros = TRUE, weights = NULL)
{
selected.genes = selected.genesIndex = list()
this.call = match.call()
Y = model.matrix(~factor(y) - 1, data = list(y = y))
xtest = x
ytest = y
wStats = weighted.stats(x, weights, y)
if (min(n.class) == 1) {
stop(warning("Warning: a class contains only 1 sample"))
}
n = sum(n.class)
ntest = ncol(xtest)
K = length(prior)
p = nrow(x)
dimnames(Y) = list(NULL, names(n.class))
centroids = wStats$weightedMean.C
sd = wStats$weightedSD.pooled
offset = quantile(sd, offset.percent/100)
sd = sd + offset
centroid.overall = wStats$weightedMean
se.scale = wStats$se.scale
delta = wStats$delta
threshold = seq(0, max(abs(delta)), length = n.threshold)
nonzero = seq(n.threshold)
errors = threshold
pred.conditions = as.list(seq(n.threshold))
prob = array(0, c(ntest, K, n.threshold))
dshrunkAll <- list()
for (ii in 1:n.threshold) {
delta.shrunk = soft.shrink(delta, threshold[ii])
delta.shrunk = t(t(delta.shrunk) * as.numeric(se.scale))
dshrunkAll[[ii]] <- delta.shrunk
nonzero[ii] = attr(delta.shrunk, "nonzero")
posid = drop(abs(delta.shrunk) %*% rep(1, K)) > 0
selected.genes[[ii]] = rownames(wStats$weightedMean.C)[posid]
selected.genesIndex[[ii]] = which(posid == TRUE)
dd = diag.disc((xtest - centroid.overall)/sd, delta.shrunk,
prior, weight = posid)
pred.conditions[[ii]] = softmax(dd)
dd = safe.exp(dd)
prob[, , ii] = dd/drop(dd %*% rep(1, K))
if (!is.null(ytest)) {
errors[ii] = sum(pred.conditions[[ii]] != ytest)
}
}
thresh.names = format(round(threshold, 3))
names(pred.conditions) = names(selected.genes) = thresh.names
attr(pred.conditions, "row.names") = paste(seq(ntest))
class(pred.conditions) = "data.frame"
if (remove.zeros)
n.threshold = match(0, nonzero, n.threshold)
dimnames(prob) = list(paste(seq(ntest)), names(n.class),
thresh.names)
object = list(counts = counts, conditions = factor(conditions), pred.conditions = pred.conditions, prob = prob[, , seq(n.threshold)],
weightedMean.C = centroids, weightedMean = centroid.overall, delta = delta, normalization = normalization,
weightedSD.pooled = sd, threshold = threshold[seq(n.threshold)], nonzero = nonzero[seq(n.threshold)],
se.scale = se.scale, call = this.call, prior = prior, offset = offset, SelectedGenes = selected.genes,
SelectedGenesIndex = selected.genesIndex)
object$errors = errors
opt.threshold = max(object$threshold[which(object$errors == min(object$errors))])
model.res = as.data.frame(cbind(object$threshold, object$nonzero, object$errors))
colnames(model.res) = c("threshold", "nonzero", "errors")
object$modelRes = model.res
object$delta.shrunk <- dshrunkAll[[which(object$threshold == opt.threshold)]]
#min.error = model.res[model.res$errors == min(model.res$errors),]
#min.error.gene = min.error[min.error$nonzero == min(min.error$nonzero),]
#opt.threshold = min.error.gene$threshold
object$opt.threshold = opt.threshold
object
}
junk = wnsc(x, y = conditions, offset.percent = offset.percent, n.threshold = n.threshold, prior = prior,
remove.zeros = remove.zeros, weights = w)
junk$call = this.call
class(junk) = "pamrtrained"
junk
}