Diff of /4x/physical.py [000000] .. [171cba]

import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
import itertools
from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing
from statsmodels.regression.mixed_linear_model import MixedLM


# Normalise the columns of a dataframe to zero mean and unit variance,
# skipping any column listed in skip
def normalise(df, skip = []) :
    for i in df.columns :
        if i not in skip :
            df[i] -= df[i].mean()
            df[i] /= df[i].std()
    return df

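# Example usage (hypothetical toy data) : centre and scale every column
# except the sheep identifier, in place.
#
#   toy = pd.DataFrame({ "Sheep" : [1, 2, 3], "Weight" : [40., 55., 62.] })
#   toy = normalise(toy, skip = ["Sheep"])

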
# Remove one layer of nesting from a list
def delayer(m) :
    out = []
    for i in m :
        if isinstance(i, list) :
            for j in i :
                out.append(j)
        else :
            out.append(i)
    return out

# Remove all layers of nesting from a list
def flatten(m) :
    out = m[:]

    while out != delayer(out) :
        out = delayer(out)

    return out

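# Example (hypothetical) : delayer peels off a single level of nesting,
# delayer([1, [2, [3, 4]], 5]) -> [1, 2, [3, 4], 5], whereas
# flatten([1, [2, [3, 4]], 5]) -> [1, 2, 3, 4, 5].

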
# Generate all non-empty combinations of the elements in a list
def combinatorial(l) :
    out = []

    for numel in range(len(l)) :
        for i in itertools.combinations(l, numel+1) :
            out.append(list(i))

    return out

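# Example (hypothetical) : combinatorial(["a", "b", "c"]) returns the 7
# non-empty subsets [["a"], ["b"], ["c"], ["a", "b"], ["a", "c"],
# ["b", "c"], ["a", "b", "c"]] ; this is how test_all_linear enumerates
# candidate sets of covariates below.

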
# Plot the first two principal components of a dataframe, drawing each column
# as a labelled point inside the unit circle (a correlation-circle style plot)
def pcaplot(df) :

    # PCA
    pca = decomposition.PCA(whiten = True)
    pca.fit(df)
    p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
    p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2

    # Normalise so that the longest loading vector has unit length
    norms = np.sqrt(p1**2 + p2**2).max()
    c = plt.Circle( (0, 0), radius = 1, alpha = 0.2)
    plt.axes(aspect = 1)
    plt.gca().add_artist(c)

    plt.scatter(p1 / norms, p2 / norms)
    plt.xlim([-1, 1])
    plt.ylim([-1, 1])

    # Label each point with its column name
    for i, text in enumerate(df.columns) :
        plt.annotate(text, xy = [p1[i] / norms, p2[i] / norms])

    plt.tight_layout()

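# Example usage (hypothetical) : draw the loading circle for the image
# metrics of the sheep-centred dataframe built further down, then show it.
#
#   pcaplot(sheepdata[imagecols].dropna())
#   plt.show()

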
def test_all_linear(df, y, x, return_significant = True, group = None) :

    # All possible combinations of independent variables
    independent = combinatorial(x)

    fits = {}
    pval = {}
    linmodels = {}
    qsum = {}

    # For all dependent variables, one at a time
    for dependent in y :

        print "Fitting for %s." % dependent

        # For all combinations of independent variables
        for covariate in independent :

            # Standard linear model
            if group is None :

                # Fit a linear model
                ols = pd.stats.api.ols(y = df[dependent], x = df[covariate])

                # Save the results
                if (return_significant and ols.f_stat["p-value"] < 0.05) or (not return_significant) :
                    linmodels.setdefault(dependent, []).append(ols)
                    fits.setdefault(dependent, []).append(ols.r2)
                    pval.setdefault(dependent, []).append(ols.f_stat["p-value"])

            # Mixed effects model
            else :
                subset = delayer([covariate, dependent, group])
                df2 = df[delayer(subset)].dropna()

                # Fit a mixed effects model
                ols = MixedLM(endog = df2[dependent], exog = df2[covariate], groups = df2[group]).fit()

                # Save the results, using AIC as the fit measure
                linmodels.setdefault(dependent, []).append(ols)
                fits.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
                pval.setdefault(dependent, []).append(ols.pvalues)

    if group is not None :

        # Model selection : among models within 2 AIC of the best, keep those
        # with the fewest fixed effects, then take the lowest-AIC of these
        for i in y :
            f = np.array(fits[i])
            models = np.array(linmodels[i])
            idx = np.where(f - f.min() <= 2)[0]
            bestmodelDoF = np.array([j.k_fe for j in models[idx]])
            bestmodels = idx[np.where(bestmodelDoF == bestmodelDoF.min())[0]]
            qsum[i] = models[bestmodels[np.where(f[bestmodels] == f[bestmodels].min())]]

        return linmodels, fits, pval, qsum

    return linmodels, fits, pval

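# Example usage (hypothetical) : regress each physical trait on every
# combination of image metrics, with sheep identity as the random-effects
# grouping ; qsum then holds the preferred model per trait.
#
#   models, fits, pvals, qsum = test_all_linear(imagedata, physcols, imagecols, group = "Sheep")

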
# Print a summary of a dictionary of OLS fits, as returned by test_all_linear
def summary(models) :

    # Flatten the fit statistics across all dependent variables
    r2 = np.array([m.r2 for dependent in models.keys() for m in models[dependent]])
    p = np.array([m.f_stat["p-value"] for dependent in models.keys() for m in models[dependent]])
    mod = np.array([m for dependent in models.keys() for m in models[dependent]])
    dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]])

    # Sort by R^2, descending
    idx = np.argsort(r2)[::-1]

    # Output string
    s = "%d significant regressions.\n\n" % len(r2)
    s += "Ten most correlated :\n\n"

    # Summarise the top ten correlations
    for i in idx[:10] :
        s += ("%s ~ %s\n" % (dependent[i], " + ".join(mod[i].x.columns[:-1])))
        s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i]))

    print s
    #return s

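# Example usage (hypothetical) : fit ordinary least squares (no grouping) on
# the sheep-centred data built below and print the ten strongest significant fits.
#
#   models, fits, pvals = test_all_linear(sheepdata, physcols, imagecols)
#   summary(models)

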
# Read physical data
physical = pd.read_csv("../physical.csv").drop(["CurrTag", "DeathDate", "Category"], 1)


# Read improc data
ent = pd.read_csv("results/entropy.csv").drop(["Unnamed: 0"], 1)
foci = pd.read_csv("results/foci.csv").drop(["Unnamed: 0"], 1)
lac = pd.read_csv("results/normalised_lacunarity.csv").drop(["Unnamed: 0"], 1)
gabor = pd.read_csv("results/gabor_filters.csv").drop(["Unnamed: 0"], 1)
ts = pd.read_csv("results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], 1)

# Aggregate the per-image measurements into per-sheep means and variances
sheep = np.unique(ent.Sheep)

Ent = [np.mean(ent.loc[ent.Sheep == i]).Entropy for i in sheep]
EntVar = [np.var(ent.loc[ent.Sheep == i]).Entropy for i in sheep]

Focicount = [np.mean(foci.loc[foci.Sheep == i]).Count for i in sheep]
FocicountVar = [np.var(foci.loc[foci.Sheep == i]).Count for i in sheep]
Focisize = [np.mean(foci.loc[foci.Sheep == i]).meanSize for i in sheep]
FocisizeVar = [np.var(foci.loc[foci.Sheep == i]).meanSize for i in sheep]

Lac = [np.mean(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]
LacVar = [np.var(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]

Scale = [np.mean(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
ScaleVar = [np.var(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
Dir = [np.mean(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]
DirVar = [np.var(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]

TS = [np.mean(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]
TSVar = [np.var(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]

improc = pd.DataFrame({ "Sheep" : sheep,
                        "Entropy" : Ent,
                        "EntropyVar" : EntVar,
                        "FociCount" : Focicount,
                        "FociCountVar" : FocicountVar,
                        "FociSize" : Focisize,
                        "FociSizeVar" : FocisizeVar,
                        "Lacunarity" : Lac,
                        "LacunarityVar" : LacVar,
                        "Scale" : Scale,
                        "ScaleVar" : ScaleVar,
                        "Directionality" : Dir,
                        "DirectionalityVar" : DirVar,
                        "TissueToSinusoid" : TS,
                        "TissueToSinusoidVar" : TSVar
                        })

# Column groups : physical traits and image-derived metrics
physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg"]
imagecols = ["Entropy", "Lacunarity", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid"]

# Merges :

# Sheep-centred dataframe : one row per sheep, physical traits plus
# per-sheep image metrics
rawsheepdata = pd.merge(physical, improc, on="Sheep", how="outer")
sheepdata = normalise(rawsheepdata, skip = ["Sheep"])

# Image-centred dataframe : one row per image, with the (normalised)
# physical traits of the corresponding sheep attached
rawimagedata = pd.merge(lac, ent,
    on=["Sheep", "Image"]).merge(foci.drop("stdSize", 1),
    on=["Sheep", "Image"]).merge(gabor,
    on=["Sheep", "Image"]).merge(ts,
    on=["Sheep", "Image"]).merge(normalise(physical, skip = ["Sheep"]),
    on="Sheep")
rawimagedata.rename(columns = { "meanSize" : "FociSize",
                                "TSRatio" : "TissueToSinusoid",
                                "Count" : "FociCount" }, inplace=True)
imagedata = normalise(rawimagedata, skip = physcols + ["Sheep"])


# TODO : add a column with the number of images per sheep

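# A minimal sketch of the TODO above (an assumption about the intent, untested) :
# count the images available per sheep in one of the per-image tables and
# attach that count to the sheep-centred dataframe.
#
#   nimages = ent.groupby("Sheep")["Image"].count()
#   sheepdata["ImageCount"] = sheepdata["Sheep"].map(nimages)

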
"""
345
# CV
346
347
m = [   [[linear_model.ElasticNet(max_iter=1000000, alpha = i, l1_ratio = j) 
348
            for i in np.linspace(0.1, 1.0, 100)]
349
            for j in np.linspace(0., 1.0, 100)],
350
        [[[[linear_model.BayesianRidge(n_iter = 10000, alpha_1 = a1, alpha_2 = a2, lambda_1 = l1, lambda_2 = l2) 
351
            for a1 in np.logspace(1e-8, 1e-4, 20)] 
352
            for a2 in np.logspace(1e-8, 1e-4, 20)]
353
            for l1 in np.logspace(1e-8, 1e-4, 20)]
354
            for l2 in np.logspace(1e-8, 1e-4, 20)],
355
        [ensemble.RandomForestRegressor(n_estimators = i) for i in np.unique(np.logspace(1, 4, 100).astype(int))],
356
        [[[[ensemble.GradientBoostingRegressor(n_estimators = e, loss=l, learning_rate = r, max_depth = m) 
357
            for l in ["ls", "lad", "huber"]] 
358
            for e in np.unique(np.logspace(1, 4, 100).astype(int))]
359
            for r in np.logspace(-4, 0, 200)]
360
            for m in np.arange(1, 20)],
361
        #ensemble.AdaBoostRegressor(n_estimators = 100),
362
        #ensemble.AdaBoostRegressor(base_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100), n_estimators = 100),
363
        #ensemble.AdaBoostRegressor(base_estimator = ensemble.RandomForestRegressor(n_estimators = 50), n_estimators = 100),
364
        [ensemble.BaggingRegressor(n_estimators = e) for e in np.unique(np.logspace(1, 4, 100).astype(int))]
365
    ]
366
367
methods = flatten(m)
368
369
370
371
372
373
# For each physical measurement, perform fits using all above methods
374
scores = {}
375
376
377
378
for col in physcols :
379
    for q, method in enumerate(methods[10000:10100]) :
380
        if not scores.has_key(col) :
381
            scores[col] = []
382
        scores[col].append(
383
            cross_validation.cross_val_score(
384
                method, d[imagecols], d[col], cv=cross_validation.ShuffleSplit(len(d), 100, test_size=0.1), 
385
            n_jobs = -1, scoring = "r2").mean())
386
        print "Done %d" % q
387
388
"""
389
390
391