Diff of /4x/Data_analysis.py [000000] .. [171cba]

Switch to unified view

a b/4x/Data_analysis.py
1
# -*- coding: utf-8 -*-
2
# <nbformat>3.0</nbformat>
3
4
# <codecell>
5
6
import numpy as np
7
from matplotlib import rcParams
8
import matplotlib.pyplot as plt
9
import seaborn
10
import pandas as pd
11
import itertools
12
import os
13
from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing
14
from statsmodels.regression.mixed_linear_model import MixedLM
15
import statsmodels
16
import statsmodels.api as sm
17
from statsmodels.regression.linear_model import OLSResults
18
from statsmodels.tools.tools import add_constant
19
from sklearn.neighbors import KernelDensity
20
21
from mpl_toolkits.mplot3d import Axes3D, proj3d
22
23
print statsmodels.__version__
24
25
%matplotlib inline
26
rcParams["figure.figsize"] = (14, 8)
27
28
rcParams["text.usetex"] = False
29
rcParams["xtick.labelsize"] = 12
30
rcParams["ytick.labelsize"] = 12
31
rcParams["font.size"] = 14
32
rcParams["axes.titlesize"] = 16
33
#rcParams["text.usetex"] = False
34
rcParams["font.family"] = "Serif"
35
rcParams["figure.dpi"] = 600
36
37
# <codecell>
38
39
# RAW DATA
40
41
raw_physical = pd.read_csv("../data/physical.csv")
42
raw_histo = pd.read_csv("../data/tawfik.csv")
43
ent = pd.read_csv("results/entropy.csv").drop(["Unnamed: 0"], 1)
44
foci = pd.read_csv("results/foci.csv").drop(["Unnamed: 0"], 1)
45
lac = pd.read_csv("results/normalised_lacunarity.csv").drop(["Unnamed: 0"], 1)
46
gabor = pd.read_csv("results/gabor_filters.csv").drop(["Unnamed: 0"], 1)
47
ts = pd.read_csv("results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], 1)
48
blur = pd.read_csv("results/blur.csv").drop(["Unnamed: 0"], 1)
49
distances = pd.read_csv("results/interfoci_dist.csv").drop(["Unnamed: 0"], 1)
50
51
raw_image = pd.merge(lac, ent,
52
    on=["Sheep", "Image"]).merge(foci, 
53
    on=["Sheep", "Image"]).merge(gabor,
54
    on=["Sheep", "Image"]).merge(ts, 
55
    on=["Sheep", "Image"]).merge(blur, 
56
    on=["Sheep", "Image"]).merge(distances, 
57
    on=["Sheep", "Image"])
58
raw_image.rename(columns = {    "meanSize" : "FociSize", 
59
                                "TSRatio" : "TissueToSinusoid",
60
                                "Count" : "FociCount" }, inplace=True)
61
62
# <codecell>
63
64
def normalise(df, skip = []) :
65
    for i in df.columns :
66
        if i not in skip :
67
            df[i] -= df[i].mean()
68
            df[i] /= df[i].std()
69
    return df
70
71
72
73
74
75
76
def rescale(df, skip = []) :
77
    for i in df.columns :
78
        if i not in skip :
79
            df[i] -= df[i].min()
80
            df[i] /= df[i].max()
81
    return df
82
83
84
85
# Remove a layer from a list
86
def delayer(m) :
87
    out = []
88
    for i in m :
89
        if isinstance(i, list) :
90
            for j in i :
91
                out.append(j)
92
        else :
93
            out.append(i)
94
    return out
95
96
97
98
99
100
101
102
# Remove all layers from a list
103
def flatten(m) :
104
    out = m[:]
105
106
    while out != delayer(out) :
107
        out = delayer(out)
108
109
    return out
110
111
112
113
114
115
116
117
118
# Generate all combinations of objects in a list
119
def combinatorial(l, short = np.inf) :
120
    out = []
121
122
    for numel in range(len(l)) :
123
        for i in itertools.combinations(l, numel+1) :
124
            if len(list(i)) < short :
125
                out.append(list(i))
126
127
    return out
128
129
130
131
132
133
134
135
136
137
138
def pcaplot(df) :
139
140
    # PCA
141
    pca = decomposition.PCA(whiten = True)
142
    pca.fit(df)
143
    p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
144
    p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2
145
146
    # Normalise
147
    norms = np.max([np.sqrt((np.array(zip(p1, p2)[i])**2).sum()) for i in range(len(p1))])
148
    c = plt.Circle( (0, 0), radius = 1, alpha = 0.2)
149
    plt.axes(aspect = 1)
150
    plt.gca().add_artist(c)
151
152
    plt.scatter(p1 / norms, p2 / norms)
153
    plt.xlim([-1, 1])
154
    plt.ylim([-1, 1])
155
156
    for i, text in enumerate(df.columns) :
157
        plt.annotate(text, xy = [p1[i], p2[i]])
158
159
    plt.tight_layout()
160
161
162
163
164
165
166
167
168
169
def big_ass_matrix(df, y, x, group = None, short = True) :
170
171
    independent = combinatorial(x, short)
172
    
173
    models = {}
174
    p = {}
175
    aic = {}
176
    r2 = {}
177
    best = {}
178
    dfs = {}
179
    bestdf = {}
180
    
181
    for dependent in y :
182
        
183
        print "Regressing for %s" % dependent
184
        
185
        for covariate in independent :
186
            
187
            if group is None :
188
                
189
                subset = delayer([covariate, dependent])
190
                df2 = df[subset].dropna()
191
                df2["Intercept"] = np.ones(len(df2))
192
                dfs.setdefault(dependent, []).append(df2)
193
                
194
                ols = sm.GLS(endog=df2[dependent], exog=df2[delayer([covariate, "Intercept"])]).fit()
195
                
196
                models.setdefault(dependent, []).append(ols)
197
                p.setdefault(dependent, []).append(ols.pvalues[:-1].values)
198
                aic.setdefault(dependent, []).append(ols.aic)
199
                r2.setdefault(dependent, []).append(ols.rsquared)
200
            
201
            else :
202
                
203
                subset = delayer([covariate, dependent, group])
204
                df2 = df[subset].dropna()
205
                dfs.setdefault(dependent, []).append(df2)
206
                
207
                ols = MixedLM.from_formula(rstr(y=dependent, x=covariate), data=df2, groups=df2[group]).fit()
208
                
209
                models.setdefault(dependent, []).append(ols)
210
                aic.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
211
                p.setdefault(dependent, []).append(ols.pvalues[1:-1].values)
212
                r2.setdefault(dependent, []).append(mmR2(df2, ols))
213
214
    
215
       
216
        bestAIC = np.min(aic[dependent])
217
        
218
        for i, val in enumerate(models[dependent]) :
219
            
220
            if aic[dependent][i] < 2 + bestAIC :
221
                
222
                if np.sum(p[dependent][i] > 0.05) == 0 :
223
                    
224
                    if group is None :
225
                        
226
                        best.setdefault(dependent, []).append(val)
227
                        bestdf.setdefault(dependent, []).append(dfs[dependent][i])
228
                        
229
                    else :
230
                        
231
                        if val.random_effects.abs().mean()[0] > 0.01 :
232
                            
233
                            best.setdefault(dependent, []).append(val)
234
                            bestdf.setdefault(dependent, []).append(dfs[dependent][i])
235
       
236
            
237
        if best.has_key(dependent) :
238
            for i, model in enumerate(best[dependent]) :
239
                
240
                if not os.path.exists("regressions/%s" % dependent) :                  
241
                    os.mkdir("regressions/%s" % dependent)
242
                    
243
                if not os.path.exists("../talk/figures/regressions/%s" % dependent) :                  
244
                    os.mkdir("../talk/figures/regressions/%s" % dependent)                
245
246
                if group is None :
247
248
                    dfx = bestdf[dependent][i]
249
                    plt.scatter(model.fittedvalues.values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[0])
250
                    plt.plot(dfx[model.model.endog_names].values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[2])
251
                    plt.ylabel(model.model.endog_names)
252
                    yl = model.model.exog_names[:]
253
                    yl.remove("Intercept")
254
                    plt.xlabel("Estimate using " + ", ".join(yl))
255
                    plt.title(rstr(dependent, model.model.exog_names).replace(" + Intercept", ""))
256
                    #plt.title(r"$R^2$ = %.02f" % model.rsquared)
257
                    st = ("$R^2$ = %.03f\n\n"% model.rsquared)
258
                    for coefnum, coef in enumerate(yl) :
259
                        st += ("%s" % coef)
260
                        st += (" : %.03f\n" % model.params[coef])
261
                        st += ("$p$ = %.01e\n\n" % model.pvalues[coefnum])
262
                    #plt.suptitle(st)
263
                    plt.text(0.01, .99, st, va="top", ha="left")
264
                    plt.xlim([-0.05, 1.05])
265
                    plt.ylim([-0.05, 1.05])
266
                    plt.savefig("regressions/%s/lm-%d.pdf" % (dependent, i))
267
                    plt.savefig("../talk/figures/regressions/%s/lm-%d.png" % (dependent, i), dpi=300, jpeg_quality=90)
268
                    plt.close()
269
270
                else :
271
                    dfx = bestdf[dependent][i]
272
                    y, yhat = mmPredict(model.model.data.frame, model)
273
                    plt.scatter(yhat, y, c=seaborn.color_palette("deep", 8)[0])
274
                    plt.plot(y, y, c=seaborn.color_palette("deep", 8)[2])
275
                    plt.ylabel(model.model.endog_names)
276
                    yl = model.model.exog_names[:]
277
                    yl.remove("Intercept")
278
                    plt.xlabel("Estimate using " + ", ".join(yl))
279
                    plt.title(rstr(dependent, model.model.exog_names).replace("Intercept + ", ""))
280
                    
281
                    #plt.title(r"$R^2$ = %.02f" % mmR2(dfx, model))
282
                    st = ("$R^2$ = %.03f\n\n" % mmR2(dfx, model))
283
                    for coefnum, coef in enumerate(yl) :
284
                        st += coef
285
                        st += " : %.03f\n" % model.fe_params[1+coefnum]
286
                        st += "$p$ = %.01e\n\n" % model.pvalues[coef]
287
                    st += ("Avg. abs. RE coef. : %.03f" % model.random_effects.abs().mean())
288
                    plt.text(0.01, .99, st, va="top", ha="left")
289
                    
290
                    plt.xlim([-0.05, 1.05])
291
                    plt.ylim([-0.05, 1.05])
292
                    plt.savefig("regressions/%s/mm_%d.pdf" % (dependent, i))
293
                    plt.savefig("../talk/figures/regressions/%s/mm_%d.png" % (dependent, i), dpi=300, jpeg_quality=90)
294
                    plt.close()
295
        
296
    return best, (models, p, r2, aic)
297
    
298
    
299
    
300
    
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
def summary(models) :
316
317
    # Generate list of everything
318
    r2 = np.array([m.r2 for dependent in models.keys() for m in models[dependent]])
319
    p = np.array([m.f_stat["p-value"] for dependent in models.keys() for m in models[dependent]])
320
    mod = np.array([m for dependent in models.keys() for m in models[dependent]])
321
    dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]])
322
323
    # Sort by R2
324
    idx = np.argsort(r2)[::-1]
325
326
    # Output string
327
    s = "%d significant regressions.\n\n" % len(r2)
328
    s += "Ten most correlated :\n\n"
329
330
    # Print a summary of the top ten correlations
331
    for i in idx[:10] :
332
        s += ("%s ~ %s\n" % (dependent[i], " + ".join(mod[i].x.columns[:-1])))
333
        s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i]))
334
335
    print s
336
    
337
    
338
    
339
    
340
    
341
    
342
    
343
def rstr(y, x) :
344
    formatstr = "%s ~ " % y
345
    for i in x[:-1] :
346
        formatstr += str(i)
347
        formatstr += " + "
348
    formatstr += str(x[-1])
349
    return formatstr
350
351
352
353
354
355
356
357
358
359
def mmR2(df, ols) :
360
361
    y = df[ols.model.endog_names]
362
    sigma_a = ols.random_effects.values.var()
363
364
    yhat = np.zeros_like(y)
365
    for i, coef in enumerate(ols.fe_params) :
366
        yhat += ols.model.data.exog[:, i] * coef
367
368
    sigma_f = yhat.var()
369
370
    idxx = []
371
    
372
    grouplabel = list(df.columns)
373
    grouplabel.remove(ols.model.endog_names)
374
    remove_these = ols.model.exog_names[:]
375
    remove_these.remove("Intercept")
376
    for i in remove_these :
377
        grouplabel.remove(i)
378
379
    for j, u in enumerate(ols.model.group_labels) :
380
        idx = np.where(df[grouplabel] == u)[0]
381
        yhat[idx] += ols.random_effects.values[j]
382
383
    sigma_e = np.var(yhat - y)
384
385
    return (sigma_f + sigma_a) / (sigma_f + sigma_a + sigma_e)
386
387
388
389
390
391
def mmPredict(df, ols) :
392
393
    y = df[ols.model.endog_names]
394
    sigma_a = ols.random_effects.values.var()
395
396
    yhat = np.zeros_like(y)
397
    for i, coef in enumerate(ols.fe_params) :
398
        yhat += ols.model.data.exog[:, i] * coef
399
400
    sigma_f = yhat.var()
401
402
    idxx = []
403
    
404
    grouplabel = list(df.columns)
405
    grouplabel.remove(ols.model.endog_names)
406
    remove_these = ols.model.exog_names[:]
407
    remove_these.remove("Intercept")
408
    for i in remove_these :
409
        grouplabel.remove(i)
410
411
    for j, u in enumerate(ols.model.group_labels) :
412
        idx = np.where(df[grouplabel] == u)[0]
413
        yhat[idx] += ols.random_effects.values[j]
414
        
415
    return y, yhat
416
417
418
419
420
#best, stuff = big_ass_matrix(df=sheep, y=["Weight"], x=imagecols, group="AgeAtDeath", short=True)
421
422
# <codecell>
423
424
raw_image.columns
425
426
# <codecell>
427
428
# CLEAN DATA
429
430
physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg", "E", "CES", "RES"]
431
imagecols = ["Entropy", "Lacunarity", "Inflammation", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid", "Blur", "MinDist", "IFDist"]
432
histcols = ["LobularCollapse", "InterfaceHepatitis", "ConfluentNecrosis", "LnApRi", "PortalInflammation", "BDHyperplasia", "Fibrosis", "TawfikTotal", "MeanHepSize", "MinHepSize", "MaxHepSize"]
433
434
435
436
437
438
# IMAGE
439
440
# Set FociSize to zero if FociCount is zero
441
# Drop stdSize
442
image = raw_image
443
image = image.drop("stdSize", 1)
444
image.FociSize[raw_image.FociCount == 0] = 0
445
446
447
448
# HISTO
449
450
histo = raw_histo
451
histo = histo.drop(["Vessels", "Vacuol", "Pigment", "Std_hep_size"], 1)
452
453
454
455
# PHYSICAL
456
physical = raw_physical
457
physical = physical.drop(["CurrTag", "DeathDate", "Category"], 1)
458
exposure = pd.read_csv("../data/exposure.csv")
459
exposure.rename(columns={"BirthYear" : "BirthYear", "AvgOfLambWS" : "E"}, inplace=True)
460
exposure["AgeAtDeath"] = 2011 - exposure.BirthYear
461
exposure.E = np.round(1. - exposure.E, 6)
462
exposure["CES"] = np.flipud(np.flipud(exposure.E).cumsum())
463
exposure["RES"] = np.flipud(np.flipud(exposure.E - exposure.CES / (exposure.AgeAtDeath + 1)).cumsum())
464
physical = pd.merge(physical, exposure, on="AgeAtDeath", how="inner")
465
466
467
468
469
# COMPLETE DATASET
470
471
raw_data = pd.merge(pd.merge(image, histo, on="Sheep", how="outer"), physical, on="Sheep", how="outer")
472
raw_data.to_csv("../data/tentative_complete.csv")
473
474
475
476
477
# AVERAGED BY SHEEP
478
data = raw_data
479
data["Inflammation"] = data.FociCount * data.FociSize
480
481
sheep = np.round(rescale(data.groupby("Sheep").mean()), 9)
482
483
484
485
sheep.rename(columns = {"Lobular_collapse" : "LobularCollapse", 
486
                        "Interface_hepatitis" : "InterfaceHepatitis",
487
                        "Confluent_necrosis" : "ConfluentNecrosis",
488
                        "Ln_ap_ri" : "LnApRi",
489
                        "Portal_inflammation" : "PortalInflammation",
490
                        "BD_hyperplasia" : "BDHyperplasia",
491
                        "Mean_hep_size" : "MeanHepSize",
492
                        "Min_hep_size" : "MinHepSize",
493
                        "Max_hep_size" : "MaxHepSize"}, inplace=True)
494
495
496
497
data.rename(columns = {"Lobular_collapse" : "LobularCollapse", 
498
                        "Interface_hepatitis" : "InterfaceHepatitis",
499
                        "Confluent_necrosis" : "ConfluentNecrosis",
500
                        "Ln_ap_ri" : "LnApRi",
501
                        "Portal_inflammation" : "PortalInflammation",
502
                        "BD_hyperplasia" : "BDHyperplasia",
503
                        "Mean_hep_size" : "MeanHepSize",
504
                        "Min_hep_size" : "MinHepSize",
505
                        "Max_hep_size" : "MaxHepSize"}, inplace=True)
506
507
#age = rescale(data.groupby("Sheep").mean().dropna(subset=delayer([imagecols, histcols, "AgeAtDeath"])).groupby("AgeAtDeath").mean())
508
509
# <codecell>
510
511
pcols = physcols[:]
512
pcols.remove("AgeAtDeath")
513
514
print " -------- 1"
515
best_lm_phys, stuff_lm_phys = big_ass_matrix(df=sheep, y=physcols, x=imagecols, group=None, short=5)
516
517
print " -------- 2"
518
best_lm_hist, stuff_lm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group=None, short=5)
519
520
print " -------- 3"
521
best_mm_phys, stuff_mm_phys = big_ass_matrix(df=sheep, y=pcols, x=imagecols, group="AgeAtDeath", short=5)
522
523
print " -------- 4"
524
best_mm_hist, stuff_mm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group="AgeAtDeath", short=5)
525
526
# <codecell>
527
528
y = "BDHyperplasia"
529
x = ["Inflammation", "Scale", "Directionality"]
530
531
dfx = sheep[delayer([x, y, "AgeAtDeath"])].dropna()
532
model = MixedLM.from_formula(rstr(y, x), data=dfx, groups="AgeAtDeath").fit()
533
#model = sm.GLS(endog=dfx.Portal_inflammation, exog=dfx[["FociSize", "AgeAtDeath"]]).fit()
534
535
dfx = sheep[["BDHyperplasia", "Inflammation", "AgeAtDeath"]].dropna()
536
model2 = MixedLM.from_formula(rstr(y, ["Inflammation"]), data=dfx, groups="AgeAtDeath").fit()
537
538
dfx = sheep[["BDHyperplasia", "FociSize", "AgeAtDeath"]].dropna()
539
model3 = MixedLM.from_formula(rstr(y, ["FociSize"]), data=dfx, groups="AgeAtDeath").fit()
540
541
# <codecell>
542
543
ss = "E"
544
s = np.array([sheep[sheep.AgeAtDeath == model.random_effects.index.values[i]][ss].iloc[0] for i in range(len(model.random_effects.index.values))])
545
s -= s.min()
546
s /= s.max()
547
548
idx = np.round(s * 2000).astype(int)
549
C = seaborn.color_palette("coolwarm", 2001)
550
CC = []
551
for i in idx :
552
    CC.append(C[i])
553
554
555
fig = plt.figure()
556
ax = fig.add_subplot(111, projection='3d')
557
plt.scatter(model.random_effects, model2.random_effects, zs=model3.random_effects, s=450, c=CC, alpha=1)#, c=np.array(C)[idx.astype(int)], alpha=1)
558
plt.title("Biliary Hyperplasia : Coefficients of Birth Year Random Effect\nFirst component of PCA explains 98.3%% of variance.\nWarmer colours - higher %s" % ss, y=0.9)
559
560
for i, sheep_age in enumerate(model.random_effects.index) :
561
    sheep_e = sheep[sheep.AgeAtDeath == sheep_age].E.iloc[0]
562
    x2, y2, _ = proj3d.proj_transform(model.random_effects.values[i], model2.random_effects.values[i], model3.random_effects.values[i], ax.get_proj())
563
    plt.annotate(int(np.round(2011 - sheep_age * 11)), xy=(x2, y2), xytext=(x2-0.0105, y2-0.002))
564
565
ax.set_axis_bgcolor("white")
566
plt.savefig("../talk/figures/regressions/BDHyperplasia/mm_coefs_color_%s.png" % ss, dpi=600, jpeg_quality=100)
567
568
# <codecell>
569
570
plt.scatter(exposure.BirthYear, exposure.CES)
571
572
# <codecell>
573
574
plt.scatter(model.random_effects.index.values, idx)
575
576
# <codecell>
577
578
579
#plt.plot([sheep[np.round(sheep.AgeAtDeath * 11) == i].E.iloc[0] for i in range(11)])
580
#plt.plot(exposure.AgeAtDeath, exposure.E)
581
C2 = []
582
for i in (np.unique(sheep[["AgeAtDeath", "BD_hyperplasia"]].dropna().AgeAtDeath) * 11).astype(int) :
583
    C2.append(sheep[np.round(sheep.AgeAtDeath * 11) == i].E.iloc[0])
584
#sheep[["AgeAtDeath", "E"]].dropna()
585
586
# <codecell>
587
588
plt.plot(exposure.BirthYear, exposure.E)
589
590
# <codecell>
591
592
exposure
593
594
# <codecell>
595
596
plt.text()
597
598
# <codecell>
599
600
df = sheep[["Interface_hepatitis", "AgeAtDeath", "Entropy"]].dropna()
601
ols = MixedLM.from_formula(rstr("Interface_hepatitis", ["Entropy"]), data=df, groups=df.AgeAtDeath).fit()
602
ols.summary()
603
604
# <codecell>
605
606
print mmR2(df, ols)
607
print ols.random_effects.abs().mean()
608
609
# <codecell>
610
611
print ols.summary()
612
613
print ols.bse
614
print ols.bse_re
615
print ols.bse_fe
616
617
# <codecell>
618
619
histo.columns
620
621
# <codecell>
622
623
y1, y2 = mmPredict(df, ols)
624
plt.scatter(y1, y2)
625
plt.plot(y1,y1)
626
627
# <codecell>
628
629
630
# <codecell>
631
632
ols = pd.stats.api.ols(y = avesc.Portal_inflammation, x = avesc.Inflammation)
633
print ols.summary
634
635
# <codecell>
636
637
plt.scatter(avesc.FociSize, avesc.Portal_inflammation)
638
plt.plot(avesc.FociSize, avesc.FociSize * ols.beta[0] + ols.beta[1])
639
plt.title("p = %.02e" % ols.p_value[0])
640
641
# <codecell>
642
643
ols2 = pd.stats.api.ols(y = avesc.BD_hyperplasia, x = avesc[["FociSize", "Directionality", "Scale"]])
644
print ols2.summary
645
646
# <codecell>
647
648
X = ["FociSize", "Directionality", "Scale"]
649
x = (ols2.beta[-1] + 
650
    ols2.beta[0] * avesc[X[0]] +
651
    ols2.beta[1] * avesc[X[1]] + 
652
    ols2.beta[2] * avesc[X[2]])
653
y = avesc.BD_hyperplasia
654
655
plt.scatter(x, y)
656
#plt.plot([0, 1], [- 0.0504, 1- 0.0504] )
657
#plt.plot(y, x)
658
659
#plt.scatter(avesc.FociSize, avesc.Portal_inflammation)
660
#plt.plot(avesc.FociSize, avesc.FociSize * ols.beta[0] + ols.beta[1])
661
#plt.title("p = %.02e" % ols.p_value[0])
662
663
# <codecell>
664
665
ols3 = pd.stats.api.ols(y = avesc.Portal_inflammation, x = avesc.Inflammation)
666
plt.scatter(avesc.Inflammation, avesc.Portal_inflammation)
667
plt.plot(avesc.Inflammation, avesc.Inflammation * ols3.beta[0] + ols3.beta[1])
668
print ols3.summary
669
670
# <codecell>
671
672
ols4 = pd.stats.api.ols(y = avesc.QTotal, x = avesc.TawfikTotal)
673
plt.scatter(avesc.TawfikTotal, avesc.QTotal)
674
plt.plot(avesc.TawfikTotal, avesc.TawfikTotal * ols4.beta[0] + ols4.beta[1])
675
print ols4.summary
676
677
# <codecell>
678
679
df2 = sheep[["TawfikTotal", "Entropy", "AgeAtDeath"]]
680
df2.dropna(inplace=True)
681
ols = MixedLM.from_formula(rstr("TawfikTotal", ["Entropy"]), data=df2, groups="AgeAtDeath").fit()
682
683
ols.pvalues[1:-1].values
684
685
# <codecell>
686
687
df = sheep[["Inflammation", "Mean_hep_size", "FociSize"]].dropna()
688
df["Intercept"] = np.ones(len(df))
689
ols = sm.GLS(endog=df.Mean_hep_size, exog=df[["Inflammation", "FociSize"]]).fit()
690
ols.rsquared
691
692
# <codecell>
693
694
mmR2(df, ols)
695
696
697
698
699
700
701
print "YO"
702
703
# <codecell>
704
705
print "YO"
706
707
# <codecell>
708
709
y = pca.fit_transform(s[histcols])
710
711
# <codecell>
712
713
mm, mmfits, mmpvals, mmqsum = test_all_linear(sheep, ["TawfikTotal"], imagecols, group="AgeAtDeath")
714
715
# <codecell>
716
717
df = sheep[delayer(["TawfikTotal", "Inflammation", "ResidualES"])].dropna()
718
df["Intercept"] = np.ones(len(df))
719
tt = MixedLM(endog = df.TawfikTotal, exog = df[["Inflammation"]], groups=df.ResidualES).fit()
720
tt.summary()
721
#del fibrosis.tables[2]
722
723
724
# <codecell>
725
726
models, fits, pvals, blah = test_all_linear(sheep, ["Ln_ap_ri"], imagecols, group="AgeAtDeath")
727
728
# <codecell>
729
730
np.where(np.array(fits["Ln_ap_ri"]) < (2 + np.min(fits["Ln_ap_ri"])))
731
732
# <codecell>
733
734
idx = 0
735
print models["Ln_ap_ri"][idx].summary()
736
print fits["Ln_ap_ri"][idx]
737
738
# <codecell>
739
740
a.model.endog_names
741
742
plt.scatter(y, yhat, c=seaborn.color_palette("deep", 8)[0])
743
plt.plot(y, y, c=seaborn.color_palette("deep", 8)[2])
744
plt.xlabel(a.model.endog_names)
745
yl = a.model.exog_names
746
#yl.remove("Intercept")
747
plt.ylabel(", ".join(yl))
748
plt.title("R2 = %.02f")
749
st = "%s : %.03f, p = %.03e.\n" * len(yl)
750
stl = []
751
for i in range(len(yl)) :
752
    stl.append(yl[i])
753
    stl.append(p[dependent][i])
754
plt.suptitle(st % tuple(delayer([yl[i], p[dependent][i] for i in range(len(yl))
755
756
# <codecell>
757
758
a.model.exog_names
759
760
# <codecell>
761
762
763
# <codecell>
764
765
raw_data[raw_data.TissueToSinusoid == raw_data.TissueToSinusoid.max()]
766
767
# <codecell>
768
769
df = sheep[["MeanHepSize", "Directionality"]].dropna()
770
df["Intercept"] = np.ones(len(df))
771
ols = sm.GLS(endog=df.MeanHepSize, exog=df[["Directionality", "Intercept"]]).fit()
772
ols.summary()
773
774
# <codecell>
775
776