Diff of /4x/physical.py [000000] .. [171cba]


--- a
+++ b/4x/physical.py
@@ -0,0 +1,391 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn
+import pandas as pd
+import itertools
+from sklearn import linear_model, ensemble, decomposition, model_selection, preprocessing
+from statsmodels.regression.mixed_linear_model import MixedLM
+import statsmodels.api as sm
+
+
+
+
+
+
+
+
+
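+# Z-score each column of a dataframe in place, skipping any columns
+# named in skip ; e.g. normalise(df, skip = ["Sheep"])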
+def normalise(df, skip = []) :
+	for i in df.columns :
+		if i not in skip :
+			df[i] -= df[i].mean()
+			df[i] /= df[i].std()
+	return df
+
+
+
+
+
+
+# Remove one layer of nesting from a list
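+# e.g. delayer([1, [2, 3], [4, [5]]]) -> [1, 2, 3, 4, [5]]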
+def delayer(m) :
+	out = []
+	for i in m :
+		if isinstance(i, list) :
+			for j in i :
+				out.append(j)
+		else :
+			out.append(i)
+	return out
+
+
+
+
+
+
+
+# Flatten a list completely, removing all nesting
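+# e.g. flatten([1, [2, [3, [4]]]]) -> [1, 2, 3, 4]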
+def flatten(m) :
+	out = m[:]
+
+	while out != delayer(out) :
+		out = delayer(out)
+
+	return out
+
+
+
+
+
+
+
+
+# Generate all non-empty combinations, of every length, of the items in a list
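+# e.g. combinatorial(["a", "b"]) -> [["a"], ["b"], ["a", "b"]]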
+def combinatorial(l) :
+	out = []
+
+	for numel in range(len(l)) :
+		for i in itertools.combinations(l, numel+1) :
+			out.append(list(i))
+
+	return out
+
+
+
+
+
+
+
+
+
+
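+# Plot each column of df as a point given by its loadings on the first
+# two principal components, drawn inside the unit circle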
+def pcaplot(df) :
+
+	# PCA
+	pca = decomposition.PCA(whiten = True)
+	pca.fit(df)
+	p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2
+	p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2
+
+	# Normalise so that the longest loading vector has unit length
+	norms = np.max(np.sqrt(p1 ** 2 + p2 ** 2))
+	c = plt.Circle( (0, 0), radius = 1, alpha = 0.2)
+	plt.axes(aspect = 1)
+	plt.gca().add_artist(c)
+
+	plt.scatter(p1 / norms, p2 / norms)
+	plt.xlim([-1, 1])
+	plt.ylim([-1, 1])
+
+	for i, text in enumerate(df.columns) :
+		plt.annotate(text, xy = [p1[i], p2[i]])
+
+	plt.tight_layout()
+
+
+
+
+
+
+
+
+
+
+
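+# Regress each dependent variable in y on every combination of the
+# covariates in x : plain OLS when group is None, otherwise a mixed
+# effects model grouped by the given column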
+def test_all_linear(df, y, x, return_significant = True, group = None) :
+
+	# All possible combinations of independent variables
+	independent = combinatorial(x)
+
+	fits = {}
+	pval = {}
+	linmodels = {}
+	qsum = {}
+
+	# For all dependent variables, one at a time
+	for dependent in y :
+
+		print("Fitting for %s." % dependent)
+
+		# For all combinations of independent variables
+		for covariate in independent :
+
+			# Standard mixed model
+			if group is None :
+
+				# Fit a linear model (statsmodels OLS ; the old
+				# pd.stats.api.ols has been removed from pandas)
+				ols = sm.OLS(df[dependent], sm.add_constant(df[covariate]), missing = "drop").fit()
+
+				# Save the results
+				if (return_significant and ols.f_pvalue < 0.05) or (not return_significant) :
+					linmodels.setdefault(dependent, []).append(ols)
+					fits.setdefault(dependent, []).append(ols.rsquared)
+					pval.setdefault(dependent, []).append(ols.f_pvalue)
+
+
+			# Mixed effects model
+			else :
+				subset = delayer([covariate, dependent, group])
+				df2 = df[subset].dropna()
+
+				# Fit a mixed effects model
+				ols = MixedLM(endog = df2[dependent], exog = df2[covariate], groups = df2[group]).fit()
+
+				# Save the model, its AIC (2k - 2 log L), and its p-values
+				linmodels.setdefault(dependent, []).append(ols)
+				fits.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf)
+				pval.setdefault(dependent, []).append(ols.pvalues)
+
+	# With grouped data, pick the best mixed model per dependent variable :
+	# of all models within 2 AIC of the minimum, keep those with the fewest
+	# fixed effects, and of those the one with the lowest AIC
+	if group is not None :
+		for i in y :
+			f = np.array(fits[i])
+			models = np.array(linmodels[i])
+			idx = np.where(f - f.min() <= 2)[0]
+			dof = np.array([m.k_fe for m in models[idx]])
+			candidates = idx[dof == dof.min()]
+			qsum[i] = models[candidates[np.argmin(f[candidates])]]
+
+		return linmodels, fits, pval, qsum
+
+	return linmodels, fits, pval
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
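+# Print the ten strongest models (by R^2) found by test_all_linear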
+def summary(models) :
+
+	# Flatten the models dictionary into parallel arrays
+	r2 = np.array([m.rsquared for dependent in models.keys() for m in models[dependent]])
+	p = np.array([m.f_pvalue for dependent in models.keys() for m in models[dependent]])
+	mod = np.array([m for dependent in models.keys() for m in models[dependent]])
+	dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]])
+
+	# Sort by R^2, descending
+	idx = np.argsort(r2)[::-1]
+
+	# Output string
+	s = "%d significant regressions.\n\n" % len(r2)
+	s += "Ten most correlated :\n\n"
+
+	# Summarise the ten strongest correlations
+	for i in idx[:10] :
+		covariates = [c for c in mod[i].model.exog_names if c != "const"]
+		s += ("%s ~ %s\n" % (dependent[i], " + ".join(covariates)))
+		s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i]))
+
+	print(s)
+	#return s
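+
+# A usage sketch, with the column lists and dataframes built below :
+#
+#	models, fits, pvals = test_all_linear(sheepdata, physcols, imagecols)
+#	summary(models)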
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Read physical data
+physical = pd.read_csv("../physical.csv").drop(["CurrTag", "DeathDate", "Category"], axis = 1)
+
+
+
+# Read improc data
+ent = pd.read_csv("results/entropy.csv").drop(["Unnamed: 0"], axis = 1)
+foci = pd.read_csv("results/foci.csv").drop(["Unnamed: 0"], axis = 1)
+lac = pd.read_csv("results/normalised_lacunarity.csv").drop(["Unnamed: 0"], axis = 1)
+gabor = pd.read_csv("results/gabor_filters.csv").drop(["Unnamed: 0"], axis = 1)
+ts = pd.read_csv("results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], axis = 1)
+
+
+
+# Aggregate the per-image measurements to one row per sheep :
+# the mean and variance of each measure
+sheep = np.unique(ent.Sheep)
+
+Ent = [np.mean(ent.loc[ent.Sheep == i]).Entropy for i in sheep]
+EntVar = [np.var(ent.loc[ent.Sheep == i]).Entropy for i in sheep]
+
+Focicount = [np.mean(foci.loc[foci.Sheep == i]).Count for i in sheep]
+FocicountVar = [np.var(foci.loc[foci.Sheep == i]).Count for i in sheep]
+Focisize = [np.mean(foci.loc[foci.Sheep == i]).meanSize for i in sheep]
+FocisizeVar = [np.var(foci.loc[foci.Sheep == i]).meanSize for i in sheep]
+
+Lac = [np.mean(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]
+LacVar = [np.var(lac.loc[lac.Sheep == i]).Lacunarity for i in sheep]
+
+Scale = [np.mean(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
+ScaleVar = [np.var(gabor.loc[gabor.Sheep == i]).Scale for i in sheep]
+Dir = [np.mean(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]
+DirVar = [np.var(gabor.loc[gabor.Sheep == i]).Directionality for i in sheep]
+
+TS = [np.mean(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]
+TSVar = [np.var(ts.loc[ts.Sheep == i]).TSRatio for i in sheep]
+
+improc = pd.DataFrame({	"Sheep" : sheep,
+						"Entropy" : Ent,
+						"EntropyVar" : EntVar,
+						"FociCount" : Focicount,
+						"FociCountVar" : FocicountVar,
+						"FociSize" : Focisize,
+						"FociSizeVar" : FocisizeVar,
+						"Lacunarity" : Lac,
+						"LacunarityVar" : LacVar,
+						"Scale" : Scale,
+						"ScaleVar" : ScaleVar,
+						"Directionality" : Dir,
+						"DirectionalityVar" : DirVar,
+						"TissueToSinusoid" : TS,
+						"TissueToSinusoidVar" : TSVar
+						})
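+
+# A sketch of a more compact groupby equivalent for one measure (note
+# that np.var above computes the population variance, ddof = 0, while
+# pandas' .var() defaults to ddof = 1) :
+#
+#	means = ent.groupby("Sheep")["Entropy"].mean()
+#	variances = ent.groupby("Sheep")["Entropy"].var(ddof = 0)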
+
+
+
+
+physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg"]
+imagecols = ["Entropy", "Lacunarity", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid"]
+
+
+
+
+# Merges :
+
+# Sheep-centred dataframe
+rawsheepdata = pd.merge(physical, improc, on="Sheep", how="outer")
+sheepdata = normalise(rawsheepdata, skip = ["Sheep"])
+
+# Image-centred dataframe
+rawimagedata = pd.merge(lac, ent,
+	on=["Sheep", "Image"]).merge(foci.drop("stdSize", 1), 
+	on=["Sheep", "Image"]).merge(gabor,
+	on=["Sheep", "Image"]).merge(ts, 
+	on=["Sheep", "Image"]).merge(normalise(physical, skip = ["Sheep"]),
+	on="Sheep")
+rawimagedata.rename(columns = {	"meanSize" : "FociSize", 
+								"TSRatio" : "TissueToSinusoid",
+								"Count" : "FociCount" }, inplace=True)
+imagedata = normalise(rawimagedata, skip = physcols + ["Sheep"])
+
+
+
+
+
+
+# COLUMN ON NUMBER OF IMAGES
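+#
+# A sketch for the TODO above, assuming one row per image in ent :
+#
+#	improc["ImageCount"] = [(ent.Sheep == i).sum() for i in sheep]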
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+"""
+# CV
+
+m = [	[[linear_model.ElasticNet(max_iter=1000000, alpha = i, l1_ratio = j) 
+			for i in np.linspace(0.1, 1.0, 100)]
+			for j in np.linspace(0., 1.0, 100)],
+		[[[[linear_model.BayesianRidge(n_iter = 10000, alpha_1 = a1, alpha_2 = a2, lambda_1 = l1, lambda_2 = l2) 
+			for a1 in np.logspace(-8, -4, 20)] 
+			for a2 in np.logspace(-8, -4, 20)]
+			for l1 in np.logspace(-8, -4, 20)]
+			for l2 in np.logspace(-8, -4, 20)],
+		[ensemble.RandomForestRegressor(n_estimators = i) for i in np.unique(np.logspace(1, 4, 100).astype(int))],
+		[[[[ensemble.GradientBoostingRegressor(n_estimators = e, loss=l, learning_rate = r, max_depth = m) 
+			for l in ["ls", "lad", "huber"]] 
+			for e in np.unique(np.logspace(1, 4, 100).astype(int))]
+			for r in np.logspace(-4, 0, 200)]
+			for m in np.arange(1, 20)],
+		#ensemble.AdaBoostRegressor(n_estimators = 100),
+		#ensemble.AdaBoostRegressor(base_estimator = ensemble.GradientBoostingRegressor(n_estimators = 100), n_estimators = 100),
+		#ensemble.AdaBoostRegressor(base_estimator = ensemble.RandomForestRegressor(n_estimators = 50), n_estimators = 100),
+		[ensemble.BaggingRegressor(n_estimators = e) for e in np.unique(np.logspace(1, 4, 100).astype(int))]
+	]
+
+methods = flatten(m)
+
+
+
+
+
+# For each physical measurement, perform fits using all above methods
+scores = {}
+
+
+
+for col in physcols :
+	for q, method in enumerate(methods[10000:10100]) :
+		if col not in scores :
+			scores[col] = []
+		scores[col].append(
+			model_selection.cross_val_score(
+				method, d[imagecols], d[col],
+				cv = model_selection.ShuffleSplit(n_splits = 100, test_size = 0.1),
+				n_jobs = -1, scoring = "r2").mean())
+		print("Done %d" % q)
+
+"""
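+
+# If the sweep above is ever revived, a single scoring call could look
+# like this ("d" in the block above is assumed to be the complete-case
+# image dataframe) :
+#
+#	d = imagedata.dropna()
+#	score = model_selection.cross_val_score(
+#		ensemble.RandomForestRegressor(n_estimators = 100),
+#		d[imagecols], d["Weight"],
+#		cv = model_selection.ShuffleSplit(n_splits = 100, test_size = 0.1),
+#		n_jobs = -1, scoring = "r2").mean()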
+
+
+