Robust Extraction of Quantitative Information from Histology Images¶

Quentin Caudron

Romain Garnier

with Bryan Grenfell and Andrea Graham¶

Outline¶

Image processing
Extracted measures
Preliminary analysis
Future directions

In [3]:

def normalise(df, skip = []) :
    """
    Standardise the columns of `df` in place.

    Each column whose label is not in `skip` has its mean subtracted and is
    then divided by the standard deviation of the centred values.

    Parameters
    ----------
    df : object exposing `.columns` and column get/set by label
        (used with pandas DataFrames in this notebook).
    skip : container of column labels to leave untouched. It is only read,
        never modified.

    Returns
    -------
    The same `df` object that was passed in; its columns are overwritten.
    """
    for column in df.columns :
        if column in skip :
            continue
        centred = df[column] - df[column].mean()
        df[column] = centred / centred.std()
    return df






def rescale(df, skip = []) :
    """
    Rescale the columns of `df` in place so that each starts at zero.

    Each column whose label is not in `skip` has its minimum subtracted and is
    then divided by the maximum of the shifted values (i.e. by its range).

    Parameters
    ----------
    df : object exposing `.columns` and column get/set by label
        (used with pandas DataFrames in this notebook).
    skip : container of column labels to leave untouched. It is only read,
        never modified.

    Returns
    -------
    The same `df` object that was passed in; its columns are overwritten.
    """
    for column in df.columns :
        if column in skip :
            continue
        shifted = df[column] - df[column].min()
        df[column] = shifted / shifted.max()
    return df



# Remove a layer from a list
def delayer(m) :
    """
    Return a new list with one level of list nesting removed.

    Elements of `m` that are lists are replaced by their own elements, in
    order; every other element (including tuples and strings) is kept as is.
    `m` itself is not modified.
    """
    out = []
    for item in m :
        if isinstance(item, list) :
            out.extend(item)
        else :
            out.append(item)
    return out







# Remove all layers from a list
def flatten(m) :
    """
    Return a fully flattened copy of the (possibly nested) list `m`.

    `delayer` is applied repeatedly to a shallow copy of `m` until a pass
    leaves the list unchanged. `m` itself is not modified.
    """
    current = m[:]
    peeled = delayer(current)

    # Keep peeling one layer at a time until nothing changes
    while peeled != current :
        current = peeled
        peeled = delayer(current)

    return current








# Generate all combinations of objects in a list
def combinatorial(l) :
    """
    Return every non-empty combination of the elements of `l`.

    Combinations are listed by increasing size (1 up to len(l)), and within a
    size in the order produced by itertools.combinations. Each combination is
    returned as a list.
    """
    sizes = range(1, len(l) + 1)
    return [list(combo) for size in sizes for combo in itertools.combinations(l, size)]










def pcaplot(df) :
    """
    Scatter the loadings of the first two principal components of `df`.

    A whitened PCA (sklearn `decomposition.PCA`) is fitted to `df`. The first
    two component vectors are each scaled so that their largest absolute
    loading is sqrt(2)/2, then both are divided by the largest length of any
    (p1, p2) pair so that every point lies within the unit circle that is
    drawn behind the scatter. Each point is labelled with the matching column
    name of `df`.

    Parameters
    ----------
    df : table with a `.columns` attribute, accepted by `PCA.fit`. It must
        yield at least two components.

    Returns
    -------
    None. The plot is drawn on a new matplotlib axes.
    """

    # PCA
    pca = decomposition.PCA(whiten = True)
    pca.fit(df)
    p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
    p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2

    # Normalise by the longest (p1, p2) vector so all points fit the unit circle
    norms = np.hypot(p1, p2).max()
    x = p1 / norms
    y = p2 / norms

    c = plt.Circle( (0, 0), radius = 1, alpha = 0.2)
    plt.axes(aspect = 1)
    plt.gca().add_artist(c)

    plt.scatter(x, y)
    plt.xlim([-1, 1])
    plt.ylim([-1, 1])

    # Label each point at the same normalised coordinates it is plotted at
    for i, text in enumerate(df.columns) :
        plt.annotate(text, xy = (x[i], y[i]))

    plt.tight_layout()











def test_all_linear(df, y, x, return_significant = False, group = None) :
    """
    Fit every dependent variable in `y` against every non-empty combination
    of the independent variables in `x`.

    Parameters
    ----------
    df : DataFrame holding all the columns named in `y`, `x` and `group`.
    y : iterable of dependent-variable column names, fitted one at a time.
    x : list of independent-variable column names; all combinations from
        `combinatorial(x)` are tried.
    return_significant : only used when `group` is None. If true, a model is
        recorded only when its `f_pvalue` is below 0.05.
    group : None for plain linear models, or the name of the grouping column
        for mixed-effects models.

    Returns
    -------
    When `group` is None : `(linmodels, fits, pval, aic)`, dicts keyed by
    dependent variable whose values are parallel lists of fitted `sm.GLS`
    results, their `rsquared`, `f_pvalue` and `aic`.

    Otherwise : `(linmodels, fits, pval, qsum)`. `fits` holds
    `2 * (k_fe + 1) - 2 * llf` for each `MixedLM` fit, `pval` holds each
    fit's `pvalues`, and `qsum[dep]` is an array of the selected model(s) :
    among the models whose `fits` value is within 2 of the minimum, those
    with the smallest `k_fe`, and among those the one(s) with the lowest
    `fits` value.

    Rows with missing values in the columns used by a given fit are dropped
    for that fit only.
    """

    # All possible combinations of independent variables
    independent = combinatorial(x)

    fits = {}
    pval = {}
    linmodels = {}
    qsum = {}
    aic = {}

    # For all dependent variables, one at a time
    for dependent in y :

        # Single-argument print(...) behaves the same on Python 2 and 3
        print("Fitting for %s." % dependent)

        # For all combinations of independent variables
        for covariate in independent :

            # Plain linear model (no grouping)
            if group is None :

                # Fit a linear model with an explicit intercept column
                subset = delayer([covariate, dependent])
                df2 = df[delayer(subset)].dropna()
                df2["Intercept"] = np.ones(len(df2))

                ols = sm.GLS(endog = df2[dependent], exog = df2[delayer([covariate, "Intercept"])]).fit()

                # Save the results
                if (return_significant and ols.f_pvalue < 0.05) or (not return_significant) :
                    linmodels.setdefault(dependent, []).append(ols)
                    fits.setdefault(dependent, []).append(ols.rsquared)
                    pval.setdefault(dependent, []).append(ols.f_pvalue)
                    aic.setdefault(dependent, []).append(ols.aic)

            # Mixed effects model
            else :
                subset = delayer([covariate, dependent, group])
                df2 = df[delayer(subset)].dropna()

                # Fit a mixed effects model
                ols = MixedLM(endog = df2[dependent], exog = df2[covariate], groups = df2[group]).fit()

                # Calculate AIC
                linmodels.setdefault(dependent, []).append(ols)
                fits.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
                pval.setdefault(dependent, []).append(ols.pvalues)

    if group is not None :
        for i in y :
            f = np.array(fits[i])
            models = np.array(linmodels[i])

            # Candidate models : within 2 of the lowest value in `fits`
            idx = np.where(f - f.min() <= 2)[0]

            # Among the candidates, keep those with the smallest k_fe
            dof = np.array([j.k_fe for j in models[idx]])
            bestmodels = idx[dof == dof.min()]

            # Among those, keep the lowest value. The mask is positional within
            # `bestmodels`, so it must index `bestmodels` itself, not `idx`.
            qsum[i] = models[bestmodels[f[bestmodels] == f[bestmodels].min()]]

        return linmodels, fits, pval, qsum

    return linmodels, fits, pval, aic

	
		















def summary(models) :
    """
    Print a short report of the ten fitted models with the highest R^2.

    Parameters
    ----------
    models : dict mapping a dependent-variable name to a list of fitted
        models. Each model must expose `r2`, `f_stat["p-value"]` and
        `x.columns`; the last entry of `x.columns` is left out of the printed
        formula (presumably the intercept -- confirm against the fitting code).

    Returns
    -------
    None. The report is written to standard output.
    """

    # Flatten {dependent : [model, ...]} into one list of (dependent, model)
    pairs = [(dependent, m) for dependent in models for m in models[dependent]]
    r2 = np.array([m.r2 for _, m in pairs])

    # Sort by R2, highest first
    idx = np.argsort(r2)[::-1]

    # Output pieces, joined once at the end
    lines = ["%d significant regressions.\n\n" % len(pairs),
             "Ten most correlated :\n\n"]

    # Summarise the top ten correlations
    for i in idx[:10] :
        dependent, m = pairs[i]
        lines.append("%s ~ %s\n" % (dependent, " + ".join(m.x.columns[:-1])))
        lines.append("R^2 = %f\tp = %f\n\n" % (r2[i], m.f_stat["p-value"]))

    # Single-argument print(...) behaves the same on Python 2 and 3
    print("".join(lines))
    
    
    
    
    
    
    
def rstr(y, x) :
    """
    Build a formula string of the form "y ~ x1 + x2 + ...".

    Parameters
    ----------
    y : left-hand side, formatted with "%s".
    x : non-empty sequence of right-hand-side terms; each is passed through
        `str`. An empty `x` raises IndexError.

    Returns
    -------
    The formula as a string.
    """
    leading, final = x[:-1], x[-1]
    terms = [str(term) for term in leading]
    terms.append(str(final))
    return ("%s ~ " % y) + " + ".join(terms)

In [14]:

import numpy as np
from sklearn.neighbors import KernelDensity
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
import itertools
from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing
from statsmodels.regression.mixed_linear_model import MixedLM
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLSResults
from statsmodels.tools.tools import add_constant


# IPython magic (not plain Python) : draw matplotlib figures inline in the notebook output
%matplotlib inline
# Default size applied to figures created after this point
rcParams["figure.figsize"] = (14, 8)


# RAW DATA

# Per-sheep tables
raw_physical = pd.read_csv("../data/physical.csv")
raw_histo = pd.read_csv("../data/tawfik.csv")

# Image-derived measures, one results file per measure. The "Unnamed: 0"
# column of each file is dropped on load; `axis` is passed by keyword because
# recent pandas versions no longer accept it positionally.
ent = pd.read_csv("../4x/results/entropy.csv").drop(["Unnamed: 0"], axis=1)
foci = pd.read_csv("../4x/results/foci.csv").drop(["Unnamed: 0"], axis=1)
lac = pd.read_csv("../4x/results/normalised_lacunarity.csv").drop(["Unnamed: 0"], axis=1)
gabor = pd.read_csv("../4x/results/gabor_filters.csv").drop(["Unnamed: 0"], axis=1)
ts = pd.read_csv("../4x/results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], axis=1)

# Combine all image measures into one table, matching rows on (Sheep, Image)
raw_image = (
    pd.merge(lac, ent, on=["Sheep", "Image"])
    .merge(foci, on=["Sheep", "Image"])
    .merge(gabor, on=["Sheep", "Image"])
    .merge(ts, on=["Sheep", "Image"])
)

# Give the image measures the column names used in the rest of the notebook
raw_image.rename(
    columns={
        "meanSize": "FociSize",
        "TSRatio": "TissueToSinusoid",
        "Count": "FociCount",
    },
    inplace=True,
)



# CLEAN DATA

# Column names, grouped by the kind of measurement they hold

# Physical measurements of each sheep
physcols = [
    "Weight",
    "Sex",
    "AgeAtDeath",
    "Foreleg",
    "Hindleg",
]

# Measures extracted from the images
imagecols = [
    "Entropy",
    "Lacunarity",
    "Inflammation",
    "Scale",
    "Directionality",
    "FociCount",
    "FociSize",
    "TissueToSinusoid",
]

# Histological scores and hepatocyte sizes
histcols = [
    "Lobular_collapse",
    "Interface_hepatitis",
    "Confluent_necrosis",
    "Ln_ap_ri",
    "Portal_inflammation",
    "BD_hyperplasia",
    "Fibrosis",
    "TawfikTotal",
    "Mean_hep_size",
    "Min_hep_size",
    "Max_hep_size",
]





# IMAGE

# Drop stdSize, then set FociSize to zero wherever FociCount is zero.
# The assignment goes through .loc on `image` itself : writing through
# `image.FociSize[mask] = 0` is chained assignment, which pandas does not
# guarantee will reach the frame.
image = raw_image.drop("stdSize", axis=1)
image.loc[image.FociCount == 0, "FociSize"] = 0



# HISTO

# Keep every histology column except the four dropped here
histo = raw_histo.drop(["Vessels", "Vacuol", "Pigment", "Std_hep_size"], axis=1)



# PHYSICAL

# Keep every physical column except the three dropped here
physical = raw_physical.drop(["CurrTag", "DeathDate", "Category"], axis=1)




# COMPLETE DATASET

# Outer-join the three cleaned tables on Sheep, so sheep missing from any one
# table are still kept
raw_data = image.merge(histo, on="Sheep", how="outer").merge(physical, on="Sheep", how="outer")

# Saved before the Inflammation column is added below
raw_data.to_csv("../data/tentative_complete.csv")




# AVERAGED BY SHEEP

# `data` is another name for `raw_data`, not a copy : the new column appears on both
data = raw_data
data["Inflammation"] = data["FociCount"] * data["FociSize"]

# Group means, with every column then rescaled by `rescale`
sheep = rescale(data.groupby("Sheep").mean())
age = rescale(data.groupby("AgeAtDeath").mean())







def _gls_summary(frame, dependent, covariates) :
    """
    Fit `dependent ~ covariates + Intercept` with sm.GLS and return its summary.

    Rows of `frame` with a missing value in any of the columns used are
    dropped, a column of ones named "Intercept" is appended as the last
    regressor, and the third table of the statsmodels summary is removed
    before it is returned.
    """
    covariates = list(covariates)
    df = frame[[dependent] + covariates].dropna()
    df["Intercept"] = np.ones(len(df))
    result = sm.GLS(endog = df[dependent], exog = df[covariates + ["Intercept"]]).fit().summary()
    del result.tables[2]
    return result



# REGRESSIONS : fixed effects, grouped by sheep

portal_inflammation = _gls_summary(sheep, "Portal_inflammation", ["FociSize"])
hyperplasia = _gls_summary(sheep, "BD_hyperplasia", ["FociSize", "Scale", "Directionality"])



# REGRESSIONS : fixed effects, grouped by age

maxhepsize = _gls_summary(age, "Max_hep_size", ["Entropy", "Directionality"])
lobular_collapse = _gls_summary(age, "Lobular_collapse", ["FociSize"])
interface_hepatitis = _gls_summary(age, "Interface_hepatitis", ["Lacunarity"])
fibrosis = _gls_summary(age, "Fibrosis", ["Inflammation"])



# PCA

# Regress the first principal component of the histological measures on the
# first principal component of the image measures, using only sheep with
# complete image and histology columns.
s = sheep.dropna(subset=delayer([imagecols, histcols]))
projector = decomposition.PCA(n_components=1)
pcax = projector.fit_transform(s[imagecols])
pcay = projector.fit_transform(s[histcols])
# `pca` holds the regression summary (this is what the notebook displays)
pca = sm.GLS(endog = pcay[:, 0][:, np.newaxis], exog = add_constant(pcax)).fit().summary()
del pca.tables[2]



# REGRESSIONS : mixed effects, intercept on age at death

# NOTE : no mixed-effects model is fitted in this cell. `fibrosis` above is
# the ordinary GLS fit of Fibrosis on Inflammation over the age-class means.

Image Processing¶

Extraction¶

Automagical
Reasonably quick

Robust¶

Invariant to staining, slicing, field-related variation
Capture intersample variation

Structural and Textural Measures¶

characteristic scale of sinusoid widths
directional amplitude of preferred sinusoid alignment
tissue to sinusoid ratio
count of inflammatory foci per image
mean size of inflammatory foci per image
information entropy of sinusoid distribution
lacunarity ( clustering ) of sinusoids

Exploratory Analysis¶

by individual¶

In [29]:

portal_inflammation

Out[29]:

GLS Regression Results
Dep. Variable:	Portal_inflammation	R-squared:	0.280
Model:	GLS	Adj. R-squared:	0.273
Method:	Least Squares	F-statistic:	37.34
Date:	Tue, 28 Oct 2014	Prob (F-statistic):	2.12e-08
Time:	23:40:10	Log-Likelihood:	14.996
No. Observations:	98	AIC:	-25.99
Df Residuals:	96	BIC:	-20.82
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
FociSize	0.5627	0.092	6.111	0.000	0.380 0.746
Intercept	0.3368	0.043	7.855	0.000	0.252 0.422

In [31]:

hyperplasia

Out[31]:

GLS Regression Results
Dep. Variable:	BD_hyperplasia	R-squared:	0.306
Model:	GLS	Adj. R-squared:	0.284
Method:	Least Squares	F-statistic:	13.83
Date:	Tue, 28 Oct 2014	Prob (F-statistic):	1.52e-07
Time:	23:40:10	Log-Likelihood:	-3.9632
No. Observations:	98	AIC:	15.93
Df Residuals:	94	BIC:	26.27
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
FociSize	0.6698	0.113	5.902	0.000	0.444 0.895
Scale	0.5811	0.243	2.394	0.019	0.099 1.063
Directionality	-0.4419	0.190	-2.330	0.022	-0.819 -0.065
Intercept	-0.0504	0.079	-0.642	0.523	-0.206 0.105

In [15]:

pca

Out[15]:

GLS Regression Results
Dep. Variable:	y	R-squared:	0.075
Model:	GLS	Adj. R-squared:	0.065
Method:	Least Squares	F-statistic:	7.723
Date:	Wed, 29 Oct 2014	Prob (F-statistic):	0.00657
Time:	14:38:47	Log-Likelihood:	-70.082
No. Observations:	97	AIC:	144.2
Df Residuals:	95	BIC:	149.3
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
const	-2.949e-17	0.051	-5.77e-16	1.000	-0.102 0.102
x1	0.3865	0.139	2.779	0.007	0.110 0.663

Exploratory Analysis¶

by age class¶

In [6]:

fibrosis

Out[6]:

GLS Regression Results
Dep. Variable:	Fibrosis	R-squared:	0.800
Model:	GLS	Adj. R-squared:	0.778
Method:	Least Squares	F-statistic:	36.07
Date:	Wed, 29 Oct 2014	Prob (F-statistic):	0.000201
Time:	11:13:48	Log-Likelihood:	7.8003
No. Observations:	11	AIC:	-11.60
Df Residuals:	9	BIC:	-10.80
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
Inflammation	1.0159	0.169	6.006	0.000	0.633 1.399
Intercept	-0.0105	0.083	-0.126	0.902	-0.198 0.177

In [7]:

lobular_collapse

Out[7]:

GLS Regression Results
Dep. Variable:	Lobular_collapse	R-squared:	0.586
Model:	GLS	Adj. R-squared:	0.540
Method:	Least Squares	F-statistic:	12.73
Date:	Wed, 29 Oct 2014	Prob (F-statistic):	0.00605
Time:	11:13:48	Log-Likelihood:	2.2626
No. Observations:	11	AIC:	-0.5252
Df Residuals:	9	BIC:	0.2706
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
FociSize	1.1379	0.319	3.567	0.006	0.416 1.860
Intercept	0.0460	0.159	0.289	0.779	-0.314 0.406

In [8]:

interface_hepatitis

Out[8]:

GLS Regression Results
Dep. Variable:	Interface_hepatitis	R-squared:	0.659
Model:	GLS	Adj. R-squared:	0.621
Method:	Least Squares	F-statistic:	17.38
Date:	Wed, 29 Oct 2014	Prob (F-statistic):	0.00242
Time:	11:13:48	Log-Likelihood:	2.3063
No. Observations:	11	AIC:	-0.6126
Df Residuals:	9	BIC:	0.1832
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>|t|	[95.0% Conf. Int.]
Lacunarity	-1.0224	0.245	-4.168	0.002	-1.577 -0.468
Intercept	0.9504	0.143	6.669	0.000	0.628 1.273

Exploratory analysis¶

with a random effect on age at death¶

Dependent variable	Models AIC < 2 + AIC_min	Primary explanatory variables
Ishak score	7	entropy, tissue-to-sinusoid, focus count, focus size
Lobular collapse	5	entropy, lacunarity, tissue-to-sinusoid, focus count
Confluent necrosis	1	entropy
Interface hepatitis	2	entropy, tissue-to-sinusoid
Portal inflammation	4	entropy, focus size, lacunarity, focus count, scale, directionality
Fibrosis	2	entropy, lacunarity, tissue-to-sinusoid
Biliary hyperplasia	1	focus size
Necrosis, apoptosis, random inflammation	2	entropy, lacunarity

entropy consistently explains histological measures when controlled for age
also important : tissue to sinusoid ratio, focus count and size, lacunarity

biological / historical reasoning for this potential cohort effect
interpretation of these models
quality of fit

Conclusions¶

our measures, chosen by semi-educated guesswork, may capture relevant information
underlying structure in the data needs thought
still no map from image or histological measures to condition of individual

Future directions¶

Further exploration of the dataset¶

145 sheep ( 89 females )
11 age classes
potential redundancy in various measures

4460 entries across 27 variables
3330 with full image and histological information
1196 for which complete information is available

More data¶

nutritional information
immunity data

Narrow-field images¶

12536 images
spatial distribution of nuclei

Robust Extraction of Quantitative Information from Histology Images¶

Quentin Caudron Romain Garnier with Bryan Grenfell and Andrea Graham¶

Outline¶

Image Processing¶

Extraction¶

Robust¶

Structural and Textural Measures¶

Exploratory Analysis¶

by individual¶

Exploratory Analysis¶

by age class¶

Exploratory analysis¶

with a random effect on age at death¶

Conclusions¶

Future directions¶

Further exploration of the dataset¶

More data¶

Narrow-field images¶

Quentin Caudron

Romain Garnier

with Bryan Grenfell and Andrea Graham¶