|
a |
|
b/voomNSC.train.R |
|
|
1 |
voomNSC.train = |
|
|
2 |
function (counts, conditions, n.threshold = 30, offset.percent = 50, remove.zeros = TRUE, |
|
|
3 |
normalization = c("TMM", "deseq", "none")) |
|
|
4 |
{ |
|
|
5 |
normalization = match.arg(normalization) |
|
|
6 |
this.call = match.call() |
|
|
7 |
|
|
|
8 |
#### voom transformation steps |
|
|
9 |
{ |
|
|
10 |
if (normalization == "TMM"){ |
|
|
11 |
design = model.matrix(~conditions) |
|
|
12 |
rownames(design) = colnames(counts) |
|
|
13 |
dge = DGEList(counts = counts) |
|
|
14 |
dge = calcNormFactors(dge, method = "TMM") |
|
|
15 |
vm = voom(dge, design, plot=F) |
|
|
16 |
x = vm $ E #pxn dim gene expression matrix |
|
|
17 |
w = vm $ weights #pxn dim weight matrix |
|
|
18 |
dimnames(w) = dimnames(x) |
|
|
19 |
} |
|
|
20 |
else if (normalization == "deseq"){ |
|
|
21 |
design = model.matrix(~conditions) |
|
|
22 |
rownames(design) = colnames(counts) |
|
|
23 |
dge = DGEList(counts = counts) |
|
|
24 |
dge = calcNormFactors(dge, method = "RLE") |
|
|
25 |
vm = voom(dge, design, plot=F) |
|
|
26 |
x = vm $ E #pxn dim gene expression matrix |
|
|
27 |
w = vm $ weights #pxn dim weight matrix |
|
|
28 |
dimnames(w) = dimnames(x) |
|
|
29 |
} |
|
|
30 |
else { |
|
|
31 |
design = model.matrix(~conditions) |
|
|
32 |
rownames(design) = colnames(counts) |
|
|
33 |
vm = voom(counts, design, plot=F) |
|
|
34 |
x = vm $ E #pxn dim gene expression matrix |
|
|
35 |
w = vm $ weights #pxn dim weight matrix |
|
|
36 |
dimnames(w) = dimnames(x) |
|
|
37 |
} |
|
|
38 |
} |
|
|
39 |
|
|
|
40 |
y = as.factor(conditions) |
|
|
41 |
n.class = table(y) |
|
|
42 |
prior = n.class/length(y) |
|
|
43 |
|
|
|
44 |
# soft.shrink(...) |
|
|
45 |
soft.shrink = |
|
|
46 |
function (delta, threshold) |
|
|
47 |
{ |
|
|
48 |
dif = abs(delta) - threshold |
|
|
49 |
delta = sign(delta) * dif * (dif > 0) |
|
|
50 |
nonzero = sum(drop((dif > 0) %*% rep(1, ncol(delta))) > 0) |
|
|
51 |
attr(delta, "nonzero") = nonzero |
|
|
52 |
delta |
|
|
53 |
} |
|
|
54 |
|
|
|
55 |
# diag.disc(...) |
|
|
56 |
diag.disc = |
|
|
57 |
function (x, centroids, prior, weight) |
|
|
58 |
{ |
|
|
59 |
if (!missing(weight)) { |
|
|
60 |
posid = (weight > 0) |
|
|
61 |
if (any(posid)) { |
|
|
62 |
weight = sqrt(weight[posid]) |
|
|
63 |
centroids = centroids[posid, , drop = FALSE] * weight |
|
|
64 |
x = x[posid, , drop = FALSE] * weight |
|
|
65 |
} |
|
|
66 |
else { |
|
|
67 |
mat = outer(rep(1, ncol(x)), log(prior), "*") |
|
|
68 |
dimnames(mat) = list(NULL, dimnames(centroids)[[2]]) |
|
|
69 |
return(mat) |
|
|
70 |
} |
|
|
71 |
} |
|
|
72 |
dd = t(x) %*% centroids |
|
|
73 |
dd0 = drop(rep(1, nrow(centroids)) %*% (centroids^2))/2 - |
|
|
74 |
log(prior) |
|
|
75 |
names(dd0) = NULL |
|
|
76 |
scale(dd, dd0, FALSE) |
|
|
77 |
} |
|
|
78 |
|
|
|
79 |
# safe.exp(...) |
|
|
80 |
safe.exp = function (x) |
|
|
81 |
{ |
|
|
82 |
xx = sign(x) * pmin(abs(x), 500) |
|
|
83 |
return(exp(xx)) |
|
|
84 |
} |
|
|
85 |
|
|
|
86 |
#soft.max(...) |
|
|
87 |
softmax = |
|
|
88 |
function (x, gap = FALSE) |
|
|
89 |
{ |
|
|
90 |
d = dim(x) |
|
|
91 |
maxdist = x[, 1] |
|
|
92 |
pclass = rep(1, d[1]) |
|
|
93 |
for (i in seq(2, d[2])) { |
|
|
94 |
l = x[, i] > maxdist |
|
|
95 |
pclass[l] = i |
|
|
96 |
maxdist[l] = x[l, i] |
|
|
97 |
} |
|
|
98 |
dd = dimnames(x)[[2]] |
|
|
99 |
if (gap) { |
|
|
100 |
x = abs(maxdist - x) |
|
|
101 |
x[cbind(seq(d[1]), pclass)] = drop(x %*% rep(1, d[2])) |
|
|
102 |
gaps = do.call("pmin", data.frame(x)) |
|
|
103 |
} |
|
|
104 |
pclass = if (is.null(dd) || !length(dd)) |
|
|
105 |
pclass |
|
|
106 |
else factor(pclass, levels = seq(d[2]), labels = dd) |
|
|
107 |
if (gap) |
|
|
108 |
list(class = pclass, gaps = gaps) |
|
|
109 |
else pclass |
|
|
110 |
} |
|
|
111 |
|
|
|
112 |
## wnsc(....) |
|
|
113 |
wnsc = |
|
|
114 |
function (x, y, n.threshold = 30, offset.percent = 50, prior = NULL, remove.zeros = TRUE, weights = NULL) |
|
|
115 |
{ |
|
|
116 |
selected.genes = selected.genesIndex = list() |
|
|
117 |
this.call = match.call() |
|
|
118 |
Y = model.matrix(~factor(y) - 1, data = list(y = y)) |
|
|
119 |
|
|
|
120 |
xtest = x |
|
|
121 |
ytest = y |
|
|
122 |
|
|
|
123 |
wStats = weighted.stats(x, weights, y) |
|
|
124 |
|
|
|
125 |
if (min(n.class) == 1) { |
|
|
126 |
stop(warning("Warning: a class contains only 1 sample")) |
|
|
127 |
} |
|
|
128 |
|
|
|
129 |
n = sum(n.class) |
|
|
130 |
ntest = ncol(xtest) |
|
|
131 |
K = length(prior) |
|
|
132 |
p = nrow(x) |
|
|
133 |
dimnames(Y) = list(NULL, names(n.class)) |
|
|
134 |
centroids = wStats$weightedMean.C |
|
|
135 |
sd = wStats$weightedSD.pooled |
|
|
136 |
offset = quantile(sd, offset.percent/100) |
|
|
137 |
sd = sd + offset |
|
|
138 |
centroid.overall = wStats$weightedMean |
|
|
139 |
se.scale = wStats$se.scale |
|
|
140 |
delta = wStats$delta |
|
|
141 |
threshold = seq(0, max(abs(delta)), length = n.threshold) |
|
|
142 |
nonzero = seq(n.threshold) |
|
|
143 |
errors = threshold |
|
|
144 |
pred.conditions = as.list(seq(n.threshold)) |
|
|
145 |
prob = array(0, c(ntest, K, n.threshold)) |
|
|
146 |
|
|
|
147 |
dshrunkAll <- list() |
|
|
148 |
for (ii in 1:n.threshold) { |
|
|
149 |
|
|
|
150 |
delta.shrunk = soft.shrink(delta, threshold[ii]) |
|
|
151 |
delta.shrunk = t(t(delta.shrunk) * as.numeric(se.scale)) |
|
|
152 |
dshrunkAll[[ii]] <- delta.shrunk |
|
|
153 |
|
|
|
154 |
nonzero[ii] = attr(delta.shrunk, "nonzero") |
|
|
155 |
posid = drop(abs(delta.shrunk) %*% rep(1, K)) > 0 |
|
|
156 |
selected.genes[[ii]] = rownames(wStats$weightedMean.C)[posid] |
|
|
157 |
selected.genesIndex[[ii]] = which(posid == TRUE) |
|
|
158 |
dd = diag.disc((xtest - centroid.overall)/sd, delta.shrunk, |
|
|
159 |
prior, weight = posid) |
|
|
160 |
pred.conditions[[ii]] = softmax(dd) |
|
|
161 |
dd = safe.exp(dd) |
|
|
162 |
prob[, , ii] = dd/drop(dd %*% rep(1, K)) |
|
|
163 |
if (!is.null(ytest)) { |
|
|
164 |
errors[ii] = sum(pred.conditions[[ii]] != ytest) |
|
|
165 |
} |
|
|
166 |
} |
|
|
167 |
|
|
|
168 |
thresh.names = format(round(threshold, 3)) |
|
|
169 |
names(pred.conditions) = names(selected.genes) = thresh.names |
|
|
170 |
attr(pred.conditions, "row.names") = paste(seq(ntest)) |
|
|
171 |
class(pred.conditions) = "data.frame" |
|
|
172 |
|
|
|
173 |
if (remove.zeros) |
|
|
174 |
n.threshold = match(0, nonzero, n.threshold) |
|
|
175 |
|
|
|
176 |
dimnames(prob) = list(paste(seq(ntest)), names(n.class), |
|
|
177 |
thresh.names) |
|
|
178 |
|
|
|
179 |
object = list(counts = counts, conditions = factor(conditions), pred.conditions = pred.conditions, prob = prob[, , seq(n.threshold)], |
|
|
180 |
weightedMean.C = centroids, weightedMean = centroid.overall, delta = delta, normalization = normalization, |
|
|
181 |
weightedSD.pooled = sd, threshold = threshold[seq(n.threshold)], nonzero = nonzero[seq(n.threshold)], |
|
|
182 |
se.scale = se.scale, call = this.call, prior = prior, offset = offset, SelectedGenes = selected.genes, |
|
|
183 |
SelectedGenesIndex = selected.genesIndex) |
|
|
184 |
|
|
|
185 |
object$errors = errors |
|
|
186 |
|
|
|
187 |
opt.threshold = max(object$threshold[which(object$errors == min(object$errors))]) |
|
|
188 |
|
|
|
189 |
model.res = as.data.frame(cbind(object$threshold, object$nonzero, object$errors)) |
|
|
190 |
colnames(model.res) = c("threshold", "nonzero", "errors") |
|
|
191 |
object$modelRes = model.res |
|
|
192 |
|
|
|
193 |
object$delta.shrunk <- dshrunkAll[[which(object$threshold == opt.threshold)]] |
|
|
194 |
#min.error = model.res[model.res$errors == min(model.res$errors),] |
|
|
195 |
#min.error.gene = min.error[min.error$nonzero == min(min.error$nonzero),] |
|
|
196 |
#opt.threshold = min.error.gene$threshold |
|
|
197 |
|
|
|
198 |
object$opt.threshold = opt.threshold |
|
|
199 |
object |
|
|
200 |
} |
|
|
201 |
|
|
|
202 |
junk = wnsc(x, y = conditions, offset.percent = offset.percent, n.threshold = n.threshold, prior = prior, |
|
|
203 |
remove.zeros = remove.zeros, weights = w) |
|
|
204 |
|
|
|
205 |
junk$call = this.call |
|
|
206 |
class(junk) = "pamrtrained" |
|
|
207 |
junk |
|
|
208 |
} |