# b/4x/physical.py
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
import itertools
from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing
from statsmodels.regression.mixed_linear_model import MixedLM


def normalise(df, skip = []) :
    for i in df.columns :
        if i not in skip :
            df[i] -= df[i].mean()
            df[i] /= df[i].std()
    return df

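# Usage sketch (hypothetical data, not from the analysis below): z-score every
# column except those listed in `skip`, in place.
#   df = pd.DataFrame({"Sheep" : [1, 2, 3], "Weight" : [40., 55., 61.]})
#   normalise(df, skip = ["Sheep"])   # "Weight" -> mean 0, std 1; "Sheep" untouched
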
# Remove one layer of nesting from a list
def delayer(m) :
    out = []
    for i in m :
        if isinstance(i, list) :
            for j in i :
                out.append(j)
        else :
            out.append(i)
    return out

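# Usage sketch: delayer removes exactly one level of nesting.
#   >>> delayer([1, [2, 3], 4])
#   [1, 2, 3, 4]
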
# Remove all layers of nesting from a list
def flatten(m) :
    out = m[:]

    while out != delayer(out) :
        out = delayer(out)

    return out

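# Usage sketch: flatten applies delayer until the list stops changing.
#   >>> flatten([1, [2, [3, [4]]]])
#   [1, 2, 3, 4]
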
# Generate all non-empty combinations of the elements of a list
def combinatorial(l) :
    out = []

    for numel in range(len(l)) :
        for i in itertools.combinations(l, numel+1) :
            out.append(list(i))

    return out

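# Usage sketch: every non-empty combination of elements, shortest first.
#   >>> combinatorial(["a", "b"])
#   [['a'], ['b'], ['a', 'b']]
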
def pcaplot(df) :

    # PCA
    pca = decomposition.PCA(whiten = True)
    pca.fit(df)
    p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
    p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2

    # Normalise
    norms = np.max([np.sqrt((np.array(zip(p1, p2)[i])**2).sum()) for i in range(len(p1))])
    c = plt.Circle((0, 0), radius = 1, alpha = 0.2)
    plt.axes(aspect = 1)
    plt.gca().add_artist(c)

    plt.scatter(p1 / norms, p2 / norms)
    plt.xlim([-1, 1])
    plt.ylim([-1, 1])

    for i, text in enumerate(df.columns) :
        plt.annotate(text, xy = [p1[i], p2[i]])

    plt.tight_layout()

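# Usage sketch (hypothetical call, assuming a complete-case dataframe of the
# image metrics built further down): draw a unit-circle loading plot of the
# first two principal components, one labelled point per column.
#   pcaplot(sheepdata[imagecols].dropna())
#   plt.show()
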
def test_all_linear(df, y, x, return_significant = True, group = None) :

    # All possible combinations of independent variables
    independent = combinatorial(x)

    fits = {}
    pval = {}
    linmodels = {}
    qsum = {}

    # For all dependent variables, one at a time
    for dependent in y :

        print "Fitting for %s." % dependent

        # For all combinations of independent variables
        for covariate in independent :

            # Standard linear model
            if group is None :

                # Fit a linear model
                ols = pd.stats.api.ols(y = df[dependent], x = df[covariate])

                # Save the results
                if (return_significant and ols.f_stat["p-value"] < 0.05) or (not return_significant) :
                    linmodels.setdefault(dependent, []).append(ols)
                    fits.setdefault(dependent, []).append(ols.r2)
                    pval.setdefault(dependent, []).append(ols.f_stat["p-value"])

            # Mixed effects model
            else :
                subset = delayer([covariate, dependent, group])
                df2 = df[delayer(subset)].dropna()

                # Fit a mixed effects model
                ols = MixedLM(endog = df2[dependent], exog = df2[covariate], groups = df2[group]).fit()

                # Calculate AIC
                linmodels.setdefault(dependent, []).append(ols)
                fits.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
                pval.setdefault(dependent, []).append(ols.pvalues)

    if group is not None :
        for i in y :
            f = np.array(fits[i])
            models = np.array(linmodels[i])
            idx = np.where(f - f.min() <= 2)[0]
            bestmodelDoF = [j.k_fe for j in np.array(linmodels[i])[idx]]
            bestmodels = np.array([idx[j] for j in np.where(bestmodelDoF == np.min(bestmodelDoF))[0]])
            qsum[i] = models[bestmodels[np.where(f[bestmodels] == np.min(f[bestmodels]))]]

        return linmodels, fits, pval, qsum

    return linmodels, fits, pval

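# Return values, for reference: with group = None the function returns
# (linmodels, fits, pval), where fits holds the R^2 of each retained OLS fit;
# with a grouping column it returns (linmodels, fits, pval, qsum), where fits
# holds mixed-model AICs and qsum the preferred model(s) per dependent variable
# (lowest AIC among the simplest models within 2 AIC of the best).
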
def summary(models) :

    # Generate flat lists of fits, p-values, models and dependent variables
    r2 = np.array([m.r2 for dependent in models.keys() for m in models[dependent]])
    p = np.array([m.f_stat["p-value"] for dependent in models.keys() for m in models[dependent]])
    mod = np.array([m for dependent in models.keys() for m in models[dependent]])
    dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]])

    # Sort by R2
    idx = np.argsort(r2)[::-1]

    # Output string
    s = "%d significant regressions.\n\n" % len(r2)
    s += "Ten most correlated :\n\n"

    # Print a summary of the top ten correlations
    for i in idx[:10] :
        s += ("%s ~ %s\n" % (dependent[i], " + ".join(mod[i].x.columns[:-1])))
        s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i]))

    print s
    #return s

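# Usage sketch (hypothetical, using the dataframes built below): print the ten
# best fits by R^2. Note that summary() reads OLS-specific attributes
# (r2, f_stat), so it expects models fitted with group = None.
#   models, fits, pvals = test_all_linear(sheepdata, physcols, imagecols)
#   summary(models)
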
# Read physical data
physical = pd.read_csv("../physical.csv").drop(["CurrTag", "DeathDate", "Category"], 1)


# Read improc data
ent = pd.read_csv("results/entropy.csv").drop(["Unnamed: 0"], 1)
foci = pd.read_csv("results/foci.csv").drop(["Unnamed: 0"], 1)
lac = pd.read_csv("results/normalised_lacunarity.csv").drop(["Unnamed: 0"], 1)
gabor = pd.read_csv("results/gabor_filters.csv").drop(["Unnamed: 0"], 1)
ts = pd.read_csv("results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], 1)

# Aggregate the image-level metrics to one row (mean and variance) per sheep
sheep = np.unique(ent.Sheep)

Ent = [np.mean(ent.loc[ent.Sheep == i]).Entropy for i in sheep]
EntVar = [np.var(ent.loc[ent.Sheep == i]).Entropy for i in sheep]

Focicount = [np.mean(foci.loc[foci.Sheep == i]).Count for i in sheep]
FocicountVar = [np.var(foci.loc[foci.Sheep == i]).Count for i in sheep]
Focisize = [np.mean(foci.loc[foci.Sheep == i]).meanSize for i in sheep]
FocisizeVar = [np.var(foci.loc[foci.Sheep == i]).meanSize for i in sheep]

Lac = [np.mean(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]
LacVar = [np.var(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]

Scale = [np.mean(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
ScaleVar = [np.var(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
Dir = [np.mean(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]
DirVar = [np.var(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]

TS = [np.mean(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]
TSVar = [np.var(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]

improc = pd.DataFrame({ "Sheep" : sheep,
                        "Entropy" : Ent,
                        "EntropyVar" : EntVar,
                        "FociCount" : Focicount,
                        "FociCountVar" : FocicountVar,
                        "FociSize" : Focisize,
                        "FociSizeVar" : FocisizeVar,
                        "Lacunarity" : Lac,
                        "LacunarityVar" : LacVar,
                        "Scale" : Scale,
                        "ScaleVar" : ScaleVar,
                        "Directionality" : Dir,
                        "DirectionalityVar" : DirVar,
                        "TissueToSinusoid" : TS,
                        "TissueToSinusoidVar" : TSVar })

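# Aggregation note (sketch): the per-sheep lists above could also be computed
# with a groupby, e.g. for entropy (variance ddof conventions may differ slightly):
#   ent.groupby("Sheep")["Entropy"].agg([np.mean, np.var])
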
physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg"]
imagecols = ["Entropy", "Lacunarity", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid"]

# Merges :

# Sheep-centred dataframe
rawsheepdata = pd.merge(physical, improc, on="Sheep", how="outer")
sheepdata = normalise(rawsheepdata, skip = ["Sheep"])

# Image-centred dataframe
rawimagedata = pd.merge(lac, ent,
    on=["Sheep", "Image"]).merge(foci.drop("stdSize", 1),
    on=["Sheep", "Image"]).merge(gabor,
    on=["Sheep", "Image"]).merge(ts,
    on=["Sheep", "Image"]).merge(normalise(physical, skip = ["Sheep"]),
    on="Sheep")
rawimagedata.rename(columns = { "meanSize" : "FociSize",
                                "TSRatio" : "TissueToSinusoid",
                                "Count" : "FociCount" }, inplace=True)
imagedata = normalise(rawimagedata, skip = physcols + ["Sheep"])

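# Usage sketch (hypothetical, not run by the script): regress each physical
# trait on the image metrics with Sheep as the random-effects grouping factor.
#   linmodels, fits, pvals, qsum = test_all_linear(imagedata, physcols, imagecols, group = "Sheep")
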
# TODO : add a column with the number of images per sheep

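# Possible sketch for the TODO above (assumes one row per image in `ent`):
#   improc["ImageCount"] = [(ent.Sheep == i).sum() for i in sheep]
# The new column would still need to be merged into sheepdata / imagedata before use.
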
"""
# CV

m = [ [[linear_model.ElasticNet(max_iter=1000000, alpha = i, l1_ratio = j)
        for i in np.linspace(0.1, 1.0, 100)]
        for j in np.linspace(0., 1.0, 100)],
      [[[[linear_model.BayesianRidge(n_iter = 10000, alpha_1 = a1, alpha_2 = a2, lambda_1 = l1, lambda_2 = l2)
          for a1 in np.logspace(-8, -4, 20)]
          for a2 in np.logspace(-8, -4, 20)]
          for l1 in np.logspace(-8, -4, 20)]
          for l2 in np.logspace(-8, -4, 20)],
      [ensemble.RandomForestRegressor(n_estimators = i) for i in np.unique(np.logspace(1, 4, 100).astype(int))],
      [[[[ensemble.GradientBoostingRegressor(n_estimators = e, loss=l, learning_rate = r, max_depth = m)
          for l in ["ls", "lad", "huber"]]
          for e in np.unique(np.logspace(1, 4, 100).astype(int))]
          for r in np.logspace(-4, 0, 200)]
          for m in np.arange(1, 20)],
      #ensemble.AdaBoostRegressor(n_estimators = 100),
      #ensemble.AdaBoostRegressor(base_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100), n_estimators = 100),
      #ensemble.AdaBoostRegressor(base_estimator = ensemble.RandomForestRegressor(n_estimators = 50), n_estimators = 100),
      [ensemble.BaggingRegressor(n_estimators = e) for e in np.unique(np.logspace(1, 4, 100).astype(int))]
    ]

methods = flatten(m)


# For each physical measurement, perform fits using all above methods
scores = {}

for col in physcols :
    for q, method in enumerate(methods[10000:10100]) :
        if not scores.has_key(col) :
            scores[col] = []
        scores[col].append(
            cross_validation.cross_val_score(
                method, d[imagecols], d[col], cv=cross_validation.ShuffleSplit(len(d), 100, test_size=0.1),
                n_jobs = -1, scoring = "r2").mean())
        print "Done %d" % q

"""
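# Hypothetical single-model check mirroring the disabled grid above, with `d`
# standing in for a complete-case image-level dataframe (an assumption, e.g.
# d = imagedata.dropna()):
#   d = imagedata.dropna()
#   score = cross_validation.cross_val_score(
#       ensemble.RandomForestRegressor(n_estimators = 100),
#       d[imagecols], d["Weight"],
#       cv = cross_validation.ShuffleSplit(len(d), 100, test_size = 0.1),
#       scoring = "r2", n_jobs = -1).mean()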