a b/talk/GrahamGroup.py
1
# -*- coding: utf-8 -*-
2
# <nbformat>3.0</nbformat>
3
4
# <headingcell level=1>
5
6
# Robust Extraction of Quantitative Information from Histology Images
7
8
# <headingcell level=4>
9
10
# Quentin Caudron
11
# <br /><br />
12
# 
13
# Romain Garnier
14
# <br /><br />
15
# 
16
# *with Bryan Grenfell and Andrea Graham*
17
18
# <headingcell level=3>
19
20
# Outline
21
22
# <markdowncell>
23
24
# - Image processing
25
# - Extracted measures
26
# - Preliminary analysis
27
# - Future directions
28
29
# <markdowncell>
30
31
# 4. Age as random effect <---
32
# 
33
# ["interface_hepatitis", "confluent_necrosis", "portal_inflammation", "ln_ap_ri"]
34
35
# <codecell>
36
37
def normalise(df, skip = []) :
38
    for i in df.columns :
39
        if i not in skip :
40
            df[i] -= df[i].mean()
41
            df[i] /= df[i].std()
42
    return df
43
44
45
46
47
48
49
def rescale(df, skip = []) :
50
    for i in df.columns :
51
        if i not in skip :
52
            df[i] -= df[i].min()
53
            df[i] /= df[i].max()
54
    return df
55
56
57
58
# Remove a layer from a list
59
def delayer(m) :
60
    out = []
61
    for i in m :
62
        if isinstance(i, list) :
63
            for j in i :
64
                out.append(j)
65
        else :
66
            out.append(i)
67
    return out
68
69
70
71
72
73
74
75
# Remove all layers from a list
76
def flatten(m) :
77
    out = m[:]
78
79
    while out != delayer(out) :
80
        out = delayer(out)
81
82
    return out
83
84
85
86
87
88
89
90
91
# Generate all combinations of objects in a list
92
def combinatorial(l) :
93
    out = []
94
95
    for numel in range(len(l)) :
96
        for i in itertools.combinations(l, numel+1) :
97
            out.append(list(i))
98
99
    return out
100
101
102
103
104
105
106
107
108
109
110
def pcaplot(df) :
111
112
    # PCA
113
    pca = decomposition.PCA(whiten = True)
114
    pca.fit(df)
115
    p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
116
    p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2
117
118
    # Normalise
119
    norms = np.max([np.sqrt((np.array(zip(p1, p2)[i])**2).sum()) for i in range(len(p1))])
120
    c = plt.Circle( (0, 0), radius = 1, alpha = 0.2)
121
    plt.axes(aspect = 1)
122
    plt.gca().add_artist(c)
123
124
    plt.scatter(p1 / norms, p2 / norms)
125
    plt.xlim([-1, 1])
126
    plt.ylim([-1, 1])
127
128
    for i, text in enumerate(df.columns) :
129
        plt.annotate(text, xy = [p1[i], p2[i]])
130
131
    plt.tight_layout()
132
133
134
135
136
137
138
139
140
141
142
143
def test_all_linear(df, y, x, return_significant = False, group = None) :
144
145
    # All possible combinations of independent variables
146
    independent = combinatorial(x)
147
148
    fits = {}
149
    pval = {}
150
    linmodels = {}
151
    qsum = {}
152
    aic = {}
153
154
    # For all dependent variables, one at a time
155
    for dependent in y :
156
157
        print "Fitting for %s." % dependent
158
159
        # For all combinations of independent variables
160
        for covariate in independent :
161
162
            # Standard mixed model
163
            if group is None :
164
165
                # Fit a linear model
166
                subset = delayer([covariate, dependent])
167
                df2 = df[delayer(subset)].dropna()
168
                df2["Intercept"] = np.ones(len(df2))
169
                
170
                ols = sm.GLS(endog = df2[dependent], exog = df2[delayer([covariate, "Intercept"])]).fit()
171
172
                # Save the results
173
                if (return_significant and ols.f_pvalue < 0.05) or (not return_significant) :
174
                    linmodels.setdefault(dependent, []).append(ols)
175
                    fits.setdefault(dependent, []).append(ols.rsquared)
176
                    pval.setdefault(dependent, []).append(ols.f_pvalue)
177
                    aic.setdefault(dependent, []).append(ols.aic)
178
179
180
            # Mixed effects model
181
            else :
182
                subset = delayer([covariate, dependent, group])
183
                df2 = df[delayer(subset)].dropna()
184
185
                # Fit a mixed effects model
186
                ols = MixedLM(endog = df2[dependent], exog = df2[covariate], groups = df2[group]).fit()
187
188
                # Calculate AIC
189
                linmodels.setdefault(dependent, []).append(ols)
190
                fits.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
191
                pval.setdefault(dependent, []).append(ols.pvalues)
192
193
    if group is not None :
194
        for i in y :
195
            f = np.array(fits[i])
196
            models = np.array(linmodels[i])
197
            idx = np.where(f - f.min() <= 2)[0]
198
            bestmodelDoF = [j.k_fe for j in np.array(linmodels[i])[idx]]
199
            bestmodels = [idx[j] for j in np.where(bestmodelDoF == np.min(bestmodelDoF))[0]]
200
            qsum[i] = models[idx[np.where(f[bestmodels] == np.min(f[bestmodels]))]]
201
202
203
        return linmodels, fits, pval, qsum
204
205
    return linmodels, fits, pval, aic
206
207
    
208
        
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def summary(models) :
225
226
    # Generate list of everything
227
    r2 = np.array([m.r2 for dependent in models.keys() for m in models[dependent]])
228
    p = np.array([m.f_stat["p-value"] for dependent in models.keys() for m in models[dependent]])
229
    mod = np.array([m for dependent in models.keys() for m in models[dependent]])
230
    dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]])
231
232
    # Sort by R2
233
    idx = np.argsort(r2)[::-1]
234
235
    # Output string
236
    s = "%d significant regressions.\n\n" % len(r2)
237
    s += "Ten most correlated :\n\n"
238
239
    # Print a summary of the top ten correlations
240
    for i in idx[:10] :
241
        s += ("%s ~ %s\n" % (dependent[i], " + ".join(mod[i].x.columns[:-1])))
242
        s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i]))
243
244
    print s
245
    
246
    
247
    
248
    
249
    
250
    
251
    
252
def rstr(y, x) :
253
    formatstr = "%s ~ " % y
254
    for i in x[:-1] :
255
        formatstr += str(i)
256
        formatstr += " + "
257
    formatstr += str(x[-1])
258
    return formatstr
259
260
261
262
263
264
265
266
267
# <codecell>
268
269
import numpy as np
270
from sklearn.neighbors import KernelDensity
271
from matplotlib import rcParams
272
import matplotlib.pyplot as plt
273
import seaborn
274
import pandas as pd
275
import itertools
276
from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing
277
from statsmodels.regression.mixed_linear_model import MixedLM
278
import statsmodels.api as sm
279
from statsmodels.regression.linear_model import OLSResults
280
from statsmodels.tools.tools import add_constant
281
282
283
%matplotlib inline
284
rcParams["figure.figsize"] = (14, 8)
285
286
287
# RAW DATA
288
289
raw_physical = pd.read_csv("../data/physical.csv")
290
raw_histo = pd.read_csv("../data/tawfik.csv")
291
ent = pd.read_csv("../4x/results/entropy.csv").drop(["Unnamed: 0"], 1)
292
foci = pd.read_csv("../4x/results/foci.csv").drop(["Unnamed: 0"], 1)
293
lac = pd.read_csv("../4x/results/normalised_lacunarity.csv").drop(["Unnamed: 0"], 1)
294
gabor = pd.read_csv("../4x/results/gabor_filters.csv").drop(["Unnamed: 0"], 1)
295
ts = pd.read_csv("../4x/results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], 1)
296
297
raw_image = pd.merge(lac, ent,
298
    on=["Sheep", "Image"]).merge(foci, 
299
    on=["Sheep", "Image"]).merge(gabor,
300
    on=["Sheep", "Image"]).merge(ts, 
301
    on=["Sheep", "Image"])
302
raw_image.rename(columns = {    "meanSize" : "FociSize", 
303
                                "TSRatio" : "TissueToSinusoid",
304
                                "Count" : "FociCount" }, inplace=True)
305
306
307
308
# CLEAN DATA
309
310
physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg"]
311
imagecols = ["Entropy", "Lacunarity", "Inflammation", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid"]
312
histcols = ["Lobular_collapse", "Interface_hepatitis", "Confluent_necrosis", "Ln_ap_ri", "Portal_inflammation", "BD_hyperplasia", "Fibrosis", "TawfikTotal", "Mean_hep_size", "Min_hep_size", "Max_hep_size"]
313
314
315
316
317
318
# IMAGE
319
320
# Set FociSize to zero if FociCount is zero
321
# Drop stdSize
322
image = raw_image
323
image = image.drop("stdSize", 1)
324
image.FociSize[raw_image.FociCount == 0] = 0
325
326
327
328
# HISTO
329
330
histo = raw_histo
331
histo = histo.drop(["Vessels", "Vacuol", "Pigment", "Std_hep_size"], 1)
332
333
334
335
# PHYSICAL
336
337
physical = raw_physical
338
physical = physical.drop(["CurrTag", "DeathDate", "Category"], 1)
339
physical
340
341
342
343
344
# COMPLETE DATASET
345
346
raw_data = pd.merge(pd.merge(image, histo, on="Sheep", how="outer"), physical, on="Sheep", how="outer")
347
raw_data.to_csv("../data/tentative_complete.csv")
348
349
350
351
352
# AVERAGED BY SHEEP
353
data = raw_data
354
data["Inflammation"] = data.FociCount * data.FociSize
355
356
sheep = rescale(data.groupby("Sheep").mean())
357
age = rescale(data.groupby("AgeAtDeath").mean())
358
359
360
361
362
363
364
365
# REGRESSIONS : fixed effects, grouped by sheep
366
367
df = sheep[["Portal_inflammation", "FociSize"]].dropna()
368
df["Intercept"] = np.ones(len(df))
369
portal_inflammation = sm.GLS(endog = df.Portal_inflammation, exog = df[["FociSize", "Intercept"]]).fit().summary()
370
#portal_inflammation = portal_inflammation.summary()
371
del portal_inflammation.tables[2]
372
373
374
375
df = sheep[["BD_hyperplasia", "Scale", "Directionality", "FociSize"]].dropna()
376
df["Intercept"] = np.ones(len(df))
377
hyperplasia = sm.GLS(endog = df.BD_hyperplasia, exog = df[["FociSize", "Scale", "Directionality", "Intercept"]]).fit().summary()
378
#hyperplasia.summary()
379
del hyperplasia.tables[2]
380
381
382
383
384
385
386
# REGRESSIONS : fixed effects, grouped by age
387
388
df = age[["Max_hep_size", "Entropy", "Directionality"]].dropna()
389
df["Intercept"] = np.ones(len(df))
390
maxhepsize = sm.GLS(endog = df.Max_hep_size, exog = df[["Entropy", "Directionality", "Intercept"]]).fit().summary()
391
del maxhepsize.tables[2]
392
393
394
395
396
df = age[["Lobular_collapse", "FociSize"]].dropna()
397
df["Intercept"] = np.ones(len(df))
398
lobular_collapse = sm.GLS(endog = df.Lobular_collapse, exog = df[["FociSize", "Intercept"]]).fit().summary()
399
del lobular_collapse.tables[2]
400
401
402
df = age[["Interface_hepatitis", "Lacunarity"]].dropna()
403
df["Intercept"] = np.ones(len(df))
404
interface_hepatitis = sm.GLS(endog = df.Interface_hepatitis, exog = df[["Lacunarity", "Intercept"]]).fit().summary()
405
del interface_hepatitis.tables[2]
406
407
408
df = age[["Fibrosis", "Inflammation"]].dropna()
409
df["Intercept"] = np.ones(len(df))
410
fibrosis = sm.GLS(endog = df.Fibrosis, exog = df[["Inflammation", "Intercept"]]).fit().summary()
411
del fibrosis.tables[2]
412
413
414
415
416
# PCA
417
418
s = sheep.dropna(subset=delayer([imagecols, histcols]))
419
pca = decomposition.PCA(n_components=1)
420
pcax = pca.fit_transform(s[imagecols])
421
pcay = pca.fit_transform(s[histcols])
422
pca = sm.GLS(endog = pcay[:, 0][:, np.newaxis], exog = add_constant(pcax)).fit().summary()
423
del pca.tables[2]
424
425
426
427
428
429
# REGRESSIONS : mixed effects, intercept on age at death
430
431
df = age[["Fibrosis", "Inflammation"]].dropna()
432
df["Intercept"] = np.ones(len(df))
433
fibrosis = sm.GLS(endog = df.Fibrosis, exog = df[["Inflammation", "Intercept"]]).fit().summary()
434
del fibrosis.tables[2]
435
436
# <codecell>
437
438
a = portal_inflammation.summary()
439
del a.tables[2]
440
a
441
442
# <headingcell level=2>
443
444
# Image Processing
445
446
# <markdowncell>
447
448
# <img src="figures/sheep.jpg"></img>
449
450
# <markdowncell>
451
452
# <img src="figures/processed.jpg"></img>
453
454
# <headingcell level=3>
455
456
# Extraction
457
458
# <markdowncell>
459
460
# - Automagical
461
# - Reasonably quick
462
463
# <headingcell level=3>
464
465
# Robust
466
467
# <markdowncell>
468
469
# - Invariant to staining, slicing, field-related variation
470
# - Capture intersample variation
471
472
# <markdowncell>
473
474
# ![image](figures/robust3.jpg)
475
476
# <markdowncell>
477
478
# ![image](figures/robust4.jpg)
479
480
# <markdowncell>
481
482
# ![image](figures/robust1.jpg)
483
484
# <markdowncell>
485
486
# ![image](figures/robust2.jpg)
487
488
# <headingcell level=2>
489
490
# Structural and Textural Measures
491
492
# <markdowncell>
493
494
# - characteristic **scale** of sinusoid widths
495
# - **directional** amplitude of preferred sinusoid alignment
496
# - **tissue to sinusoid** ratio
497
# - **count** of inflammatory foci per image
498
# - **mean size** of inflammatory foci per image
499
# - information **entropy** of sinusoid distribution
500
# - **lacunarity** ( clustering ) of sinusoids
501
502
# <markdowncell>
503
504
# <img src="figures/gif.gif"></img>
505
506
# <markdowncell>
507
508
# ![image](figures/intra.png)
509
510
# <markdowncell>
511
512
# ![image](figures/inter2.png)
513
514
# <headingcell level=2>
515
516
# Exploratory Analysis
517
518
# <headingcell level=3>
519
520
# by individual
521
522
# <codecell>
523
524
portal_inflammation
525
526
# <markdowncell>
527
528
# ![image](figures/portal_inflammation.png)
529
530
# <codecell>
531
532
hyperplasia
533
534
# <markdowncell>
535
536
# ![image](figures/hyperplasia.png)
537
538
# <codecell>
539
540
pca
541
542
# <markdowncell>
543
544
# ![image](figures/pca.png)
545
546
# <headingcell level=2>
547
548
# Exploratory Analysis
549
550
# <headingcell level=3>
551
552
# by age class
553
554
# <codecell>
555
556
fibrosis
557
558
# <markdowncell>
559
560
# ![image](figures/fibrosis.png)
561
562
# <codecell>
563
564
lobular_collapse
565
566
# <markdowncell>
567
568
# ![image](figures/lobular_collapse.png)
569
570
# <codecell>
571
572
interface_hepatitis
573
574
# <markdowncell>
575
576
# ![image](figures/interface_hepatitis.png)
577
578
# <headingcell level=2>
579
580
# Exploratory analysis
581
582
# <headingcell level=3>
583
584
# with a random effect on age at death
585
586
# <markdowncell>
587
588
# | Dependent variable                       | Models<br />AIC < 2 + AIC<sub>min</sub> | Primary explanatory variables                           |
589
# |------------------------------------------|:----------------------------------:|---------------------------------------------------------------------|
590
# | Ishak score                              |                  7                 | entropy, tissue-to-sinusoid, focus count, focus size                |
591
# | Lobular collapse                         |                  5                 | entropy, lacunarity, tissue-to-sinusoid, focus count                |
592
# | Confluent necrosis                       |                  1                 | entropy                                                             |
593
# | Interface hepatitis                      |                  2                 | entropy, tissue-to-sinusoid                                         |
594
# | Portal inflammation                      |                  4                 | entropy, focus size, lacunarity, focus count, scale, directionality |
595
# | Fibrosis                                 |                  2                 | entropy, lacunarity, tissue-to-sinusoid                             |
596
# | Biliary hyperplasia                      |                  1                 | focus size                                                          |
597
# | Necrosis, apoptosis, random inflammation |    <font color="white">This_is_bla</font>2<font color="white">This_is_bla</font>           | entropy, lacunarity                                                 |
598
599
# <markdowncell>
600
601
# - entropy consistently explains histological measures when controlled for age
602
# - also important : tissue to sinusoid ratio, focus count and size, lacunarity
603
604
# <markdowncell>
605
606
# - biological / historical reasoning for this potential cohort effect
607
# - interpretation of these models
608
# - quality of fit
609
610
# <headingcell level=2>
611
612
# Conclusions
613
614
# <markdowncell>
615
616
# - our **semi-educated guess** measures may capture relevant information
617
# - underlying **structure** in the data needs thought
618
# - still no **map** from image or histological measures to condition of individual
619
620
# <headingcell level=2>
621
622
# Future directions
623
624
# <headingcell level=3>
625
626
# Further exploration of the dataset
627
628
# <markdowncell>
629
630
# - 145 sheep ( 89 females )
631
# - 11 age classes
632
# - potential redundancy in various measures
633
634
# <markdowncell>
635
636
# - 4460 entries across 27 variables
637
# - 3330 with full image and histological information
638
# - 1196 for which **complete** information is available
639
640
# <headingcell level=3>
641
642
# More data
643
644
# <markdowncell>
645
646
# - nutritional information
647
# - immunity data
648
649
# <headingcell level=3>
650
651
# Narrow-field images
652
653
# <markdowncell>
654
655
# - 12536 images
656
# - spatial distribution of nuclei
657
658
# <markdowncell>
659
660
# ![image](figures/10.jpg)
661
662
# <markdowncell>
663
664
# ![image](figures/Processed2.jpg)
665
666
# <markdowncell>
667
668
# ![image](figures/Segmented.jpg)
669
670
# <markdowncell>
671
672
# <img src="figures/10x.png" width=100%></src>
673