--- a +++ b/4x/Data_analysis.py @@ -0,0 +1,776 @@ +# -*- coding: utf-8 -*- +# <nbformat>3.0</nbformat> + +# <codecell> + +import numpy as np +from matplotlib import rcParams +import matplotlib.pyplot as plt +import seaborn +import pandas as pd +import itertools +import os +from sklearn import linear_model, ensemble, decomposition, cross_validation, preprocessing +from statsmodels.regression.mixed_linear_model import MixedLM +import statsmodels +import statsmodels.api as sm +from statsmodels.regression.linear_model import OLSResults +from statsmodels.tools.tools import add_constant +from sklearn.neighbors import KernelDensity + +from mpl_toolkits.mplot3d import Axes3D, proj3d + +print statsmodels.__version__ + +%matplotlib inline +rcParams["figure.figsize"] = (14, 8) + +rcParams["text.usetex"] = False +rcParams["xtick.labelsize"] = 12 +rcParams["ytick.labelsize"] = 12 +rcParams["font.size"] = 14 +rcParams["axes.titlesize"] = 16 +#rcParams["text.usetex"] = False +rcParams["font.family"] = "Serif" +rcParams["figure.dpi"] = 600 + +# <codecell> + +# RAW DATA + +raw_physical = pd.read_csv("../data/physical.csv") +raw_histo = pd.read_csv("../data/tawfik.csv") +ent = pd.read_csv("results/entropy.csv").drop(["Unnamed: 0"], 1) +foci = pd.read_csv("results/foci.csv").drop(["Unnamed: 0"], 1) +lac = pd.read_csv("results/normalised_lacunarity.csv").drop(["Unnamed: 0"], 1) +gabor = pd.read_csv("results/gabor_filters.csv").drop(["Unnamed: 0"], 1) +ts = pd.read_csv("results/tissue_sinusoid_ratio.csv").drop(["Unnamed: 0"], 1) +blur = pd.read_csv("results/blur.csv").drop(["Unnamed: 0"], 1) +distances = pd.read_csv("results/interfoci_dist.csv").drop(["Unnamed: 0"], 1) + +raw_image = pd.merge(lac, ent, + on=["Sheep", "Image"]).merge(foci, + on=["Sheep", "Image"]).merge(gabor, + on=["Sheep", "Image"]).merge(ts, + on=["Sheep", "Image"]).merge(blur, + on=["Sheep", "Image"]).merge(distances, + on=["Sheep", "Image"]) +raw_image.rename(columns = { "meanSize" : "FociSize", + "TSRatio" : "TissueToSinusoid", + "Count" : "FociCount" }, inplace=True) + +# <codecell> + +def normalise(df, skip = []) : + for i in df.columns : + if i not in skip : + df[i] -= df[i].mean() + df[i] /= df[i].std() + return df + + + + + + +def rescale(df, skip = []) : + for i in df.columns : + if i not in skip : + df[i] -= df[i].min() + df[i] /= df[i].max() + return df + + + +# Remove a layer from a list +def delayer(m) : + out = [] + for i in m : + if isinstance(i, list) : + for j in i : + out.append(j) + else : + out.append(i) + return out + + + + + + + +# Remove all layers from a list +def flatten(m) : + out = m[:] + + while out != delayer(out) : + out = delayer(out) + + return out + + + + + + + + +# Generate all combinations of objects in a list +def combinatorial(l, short = np.inf) : + out = [] + + for numel in range(len(l)) : + for i in itertools.combinations(l, numel+1) : + if len(list(i)) < short : + out.append(list(i)) + + return out + + + + + + + + + + +def pcaplot(df) : + + # PCA + pca = decomposition.PCA(whiten = True) + pca.fit(df) + p1 = pca.components_[0] / np.abs(pca.components_[0]).max() * np.sqrt(2)/2 + p2 = pca.components_[1] / np.abs(pca.components_[1]).max() * np.sqrt(2)/2 + + # Normalise + norms = np.max([np.sqrt((np.array(zip(p1, p2)[i])**2).sum()) for i in range(len(p1))]) + c = plt.Circle( (0, 0), radius = 1, alpha = 0.2) + plt.axes(aspect = 1) + plt.gca().add_artist(c) + + plt.scatter(p1 / norms, p2 / norms) + plt.xlim([-1, 1]) + plt.ylim([-1, 1]) + + for i, text in enumerate(df.columns) : + plt.annotate(text, xy = [p1[i], p2[i]]) + + plt.tight_layout() + + + + + + + + + +def big_ass_matrix(df, y, x, group = None, short = True) : + + independent = combinatorial(x, short) + + models = {} + p = {} + aic = {} + r2 = {} + best = {} + dfs = {} + bestdf = {} + + for dependent in y : + + print "Regressing for %s" % dependent + + for covariate in independent : + + if group is None : + + subset = delayer([covariate, dependent]) + df2 = df[subset].dropna() + df2["Intercept"] = np.ones(len(df2)) + dfs.setdefault(dependent, []).append(df2) + + ols = sm.GLS(endog=df2[dependent], exog=df2[delayer([covariate, "Intercept"])]).fit() + + models.setdefault(dependent, []).append(ols) + p.setdefault(dependent, []).append(ols.pvalues[:-1].values) + aic.setdefault(dependent, []).append(ols.aic) + r2.setdefault(dependent, []).append(ols.rsquared) + + else : + + subset = delayer([covariate, dependent, group]) + df2 = df[subset].dropna() + dfs.setdefault(dependent, []).append(df2) + + ols = MixedLM.from_formula(rstr(y=dependent, x=covariate), data=df2, groups=df2[group]).fit() + + models.setdefault(dependent, []).append(ols) + aic.setdefault(dependent, []).append(2 * (ols.k_fe + 1) - 2 * ols.llf) + p.setdefault(dependent, []).append(ols.pvalues[1:-1].values) + r2.setdefault(dependent, []).append(mmR2(df2, ols)) + + + + bestAIC = np.min(aic[dependent]) + + for i, val in enumerate(models[dependent]) : + + if aic[dependent][i] < 2 + bestAIC : + + if np.sum(p[dependent][i] > 0.05) == 0 : + + if group is None : + + best.setdefault(dependent, []).append(val) + bestdf.setdefault(dependent, []).append(dfs[dependent][i]) + + else : + + if val.random_effects.abs().mean()[0] > 0.01 : + + best.setdefault(dependent, []).append(val) + bestdf.setdefault(dependent, []).append(dfs[dependent][i]) + + + if best.has_key(dependent) : + for i, model in enumerate(best[dependent]) : + + if not os.path.exists("regressions/%s" % dependent) : + os.mkdir("regressions/%s" % dependent) + + if not os.path.exists("../talk/figures/regressions/%s" % dependent) : + os.mkdir("../talk/figures/regressions/%s" % dependent) + + if group is None : + + dfx = bestdf[dependent][i] + plt.scatter(model.fittedvalues.values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[0]) + plt.plot(dfx[model.model.endog_names].values, dfx[model.model.endog_names].values, c=seaborn.color_palette("deep", 8)[2]) + plt.ylabel(model.model.endog_names) + yl = model.model.exog_names[:] + yl.remove("Intercept") + plt.xlabel("Estimate using " + ", ".join(yl)) + plt.title(rstr(dependent, model.model.exog_names).replace(" + Intercept", "")) + #plt.title(r"$R^2$ = %.02f" % model.rsquared) + st = ("$R^2$ = %.03f\n\n"% model.rsquared) + for coefnum, coef in enumerate(yl) : + st += ("%s" % coef) + st += (" : %.03f\n" % model.params[coef]) + st += ("$p$ = %.01e\n\n" % model.pvalues[coefnum]) + #plt.suptitle(st) + plt.text(0.01, .99, st, va="top", ha="left") + plt.xlim([-0.05, 1.05]) + plt.ylim([-0.05, 1.05]) + plt.savefig("regressions/%s/lm-%d.pdf" % (dependent, i)) + plt.savefig("../talk/figures/regressions/%s/lm-%d.png" % (dependent, i), dpi=300, jpeg_quality=90) + plt.close() + + else : + dfx = bestdf[dependent][i] + y, yhat = mmPredict(model.model.data.frame, model) + plt.scatter(yhat, y, c=seaborn.color_palette("deep", 8)[0]) + plt.plot(y, y, c=seaborn.color_palette("deep", 8)[2]) + plt.ylabel(model.model.endog_names) + yl = model.model.exog_names[:] + yl.remove("Intercept") + plt.xlabel("Estimate using " + ", ".join(yl)) + plt.title(rstr(dependent, model.model.exog_names).replace("Intercept + ", "")) + + #plt.title(r"$R^2$ = %.02f" % mmR2(dfx, model)) + st = ("$R^2$ = %.03f\n\n" % mmR2(dfx, model)) + for coefnum, coef in enumerate(yl) : + st += coef + st += " : %.03f\n" % model.fe_params[1+coefnum] + st += "$p$ = %.01e\n\n" % model.pvalues[coef] + st += ("Avg. abs. RE coef. : %.03f" % model.random_effects.abs().mean()) + plt.text(0.01, .99, st, va="top", ha="left") + + plt.xlim([-0.05, 1.05]) + plt.ylim([-0.05, 1.05]) + plt.savefig("regressions/%s/mm_%d.pdf" % (dependent, i)) + plt.savefig("../talk/figures/regressions/%s/mm_%d.png" % (dependent, i), dpi=300, jpeg_quality=90) + plt.close() + + return best, (models, p, r2, aic) + + + + + + + + + + + + + + + + + + +def summary(models) : + + # Generate list of everything + r2 = np.array([m.r2 for dependent in models.keys() for m in models[dependent]]) + p = np.array([m.f_stat["p-value"] for dependent in models.keys() for m in models[dependent]]) + mod = np.array([m for dependent in models.keys() for m in models[dependent]]) + dependent = np.array([dependent for dependent in models.keys() for m in models[dependent]]) + + # Sort by R2 + idx = np.argsort(r2)[::-1] + + # Output string + s = "%d significant regressions.\n\n" % len(r2) + s += "Ten most correlated :\n\n" + + # Print a summary of the top ten correlations + for i in idx[:10] : + s += ("%s ~ %s\n" % (dependent[i], " + ".join(mod[i].x.columns[:-1]))) + s += ("R^2 = %f\tp = %f\n\n" % (r2[i], p[i])) + + print s + + + + + + + +def rstr(y, x) : + formatstr = "%s ~ " % y + for i in x[:-1] : + formatstr += str(i) + formatstr += " + " + formatstr += str(x[-1]) + return formatstr + + + + + + + + + +def mmR2(df, ols) : + + y = df[ols.model.endog_names] + sigma_a = ols.random_effects.values.var() + + yhat = np.zeros_like(y) + for i, coef in enumerate(ols.fe_params) : + yhat += ols.model.data.exog[:, i] * coef + + sigma_f = yhat.var() + + idxx = [] + + grouplabel = list(df.columns) + grouplabel.remove(ols.model.endog_names) + remove_these = ols.model.exog_names[:] + remove_these.remove("Intercept") + for i in remove_these : + grouplabel.remove(i) + + for j, u in enumerate(ols.model.group_labels) : + idx = np.where(df[grouplabel] == u)[0] + yhat[idx] += ols.random_effects.values[j] + + sigma_e = np.var(yhat - y) + + return (sigma_f + sigma_a) / (sigma_f + sigma_a + sigma_e) + + + + + +def mmPredict(df, ols) : + + y = df[ols.model.endog_names] + sigma_a = ols.random_effects.values.var() + + yhat = np.zeros_like(y) + for i, coef in enumerate(ols.fe_params) : + yhat += ols.model.data.exog[:, i] * coef + + sigma_f = yhat.var() + + idxx = [] + + grouplabel = list(df.columns) + grouplabel.remove(ols.model.endog_names) + remove_these = ols.model.exog_names[:] + remove_these.remove("Intercept") + for i in remove_these : + grouplabel.remove(i) + + for j, u in enumerate(ols.model.group_labels) : + idx = np.where(df[grouplabel] == u)[0] + yhat[idx] += ols.random_effects.values[j] + + return y, yhat + + + + +#best, stuff = big_ass_matrix(df=sheep, y=["Weight"], x=imagecols, group="AgeAtDeath", short=True) + +# <codecell> + +raw_image.columns + +# <codecell> + +# CLEAN DATA + +physcols = ["Weight", "Sex", "AgeAtDeath", "Foreleg", "Hindleg", "E", "CES", "RES"] +imagecols = ["Entropy", "Lacunarity", "Inflammation", "Scale", "Directionality", "FociCount", "FociSize", "TissueToSinusoid", "Blur", "MinDist", "IFDist"] +histcols = ["LobularCollapse", "InterfaceHepatitis", "ConfluentNecrosis", "LnApRi", "PortalInflammation", "BDHyperplasia", "Fibrosis", "TawfikTotal", "MeanHepSize", "MinHepSize", "MaxHepSize"] + + + + + +# IMAGE + +# Set FociSize to zero if FociCount is zero +# Drop stdSize +image = raw_image +image = image.drop("stdSize", 1) +image.FociSize[raw_image.FociCount == 0] = 0 + + + +# HISTO + +histo = raw_histo +histo = histo.drop(["Vessels", "Vacuol", "Pigment", "Std_hep_size"], 1) + + + +# PHYSICAL +physical = raw_physical +physical = physical.drop(["CurrTag", "DeathDate", "Category"], 1) +exposure = pd.read_csv("../data/exposure.csv") +exposure.rename(columns={"BirthYear" : "BirthYear", "AvgOfLambWS" : "E"}, inplace=True) +exposure["AgeAtDeath"] = 2011 - exposure.BirthYear +exposure.E = np.round(1. - exposure.E, 6) +exposure["CES"] = np.flipud(np.flipud(exposure.E).cumsum()) +exposure["RES"] = np.flipud(np.flipud(exposure.E - exposure.CES / (exposure.AgeAtDeath + 1)).cumsum()) +physical = pd.merge(physical, exposure, on="AgeAtDeath", how="inner") + + + + +# COMPLETE DATASET + +raw_data = pd.merge(pd.merge(image, histo, on="Sheep", how="outer"), physical, on="Sheep", how="outer") +raw_data.to_csv("../data/tentative_complete.csv") + + + + +# AVERAGED BY SHEEP +data = raw_data +data["Inflammation"] = data.FociCount * data.FociSize + +sheep = np.round(rescale(data.groupby("Sheep").mean()), 9) + + + +sheep.rename(columns = {"Lobular_collapse" : "LobularCollapse", + "Interface_hepatitis" : "InterfaceHepatitis", + "Confluent_necrosis" : "ConfluentNecrosis", + "Ln_ap_ri" : "LnApRi", + "Portal_inflammation" : "PortalInflammation", + "BD_hyperplasia" : "BDHyperplasia", + "Mean_hep_size" : "MeanHepSize", + "Min_hep_size" : "MinHepSize", + "Max_hep_size" : "MaxHepSize"}, inplace=True) + + + +data.rename(columns = {"Lobular_collapse" : "LobularCollapse", + "Interface_hepatitis" : "InterfaceHepatitis", + "Confluent_necrosis" : "ConfluentNecrosis", + "Ln_ap_ri" : "LnApRi", + "Portal_inflammation" : "PortalInflammation", + "BD_hyperplasia" : "BDHyperplasia", + "Mean_hep_size" : "MeanHepSize", + "Min_hep_size" : "MinHepSize", + "Max_hep_size" : "MaxHepSize"}, inplace=True) + +#age = rescale(data.groupby("Sheep").mean().dropna(subset=delayer([imagecols, histcols, "AgeAtDeath"])).groupby("AgeAtDeath").mean()) + +# <codecell> + +pcols = physcols[:] +pcols.remove("AgeAtDeath") + +print " -------- 1" +best_lm_phys, stuff_lm_phys = big_ass_matrix(df=sheep, y=physcols, x=imagecols, group=None, short=5) + +print " -------- 2" +best_lm_hist, stuff_lm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group=None, short=5) + +print " -------- 3" +best_mm_phys, stuff_mm_phys = big_ass_matrix(df=sheep, y=pcols, x=imagecols, group="AgeAtDeath", short=5) + +print " -------- 4" +best_mm_hist, stuff_mm_hist = big_ass_matrix(df=sheep, y=histcols, x=imagecols, group="AgeAtDeath", short=5) + +# <codecell> + +y = "BDHyperplasia" +x = ["Inflammation", "Scale", "Directionality"] + +dfx = sheep[delayer([x, y, "AgeAtDeath"])].dropna() +model = MixedLM.from_formula(rstr(y, x), data=dfx, groups="AgeAtDeath").fit() +#model = sm.GLS(endog=dfx.Portal_inflammation, exog=dfx[["FociSize", "AgeAtDeath"]]).fit() + +dfx = sheep[["BDHyperplasia", "Inflammation", "AgeAtDeath"]].dropna() +model2 = MixedLM.from_formula(rstr(y, ["Inflammation"]), data=dfx, groups="AgeAtDeath").fit() + +dfx = sheep[["BDHyperplasia", "FociSize", "AgeAtDeath"]].dropna() +model3 = MixedLM.from_formula(rstr(y, ["FociSize"]), data=dfx, groups="AgeAtDeath").fit() + +# <codecell> + +ss = "E" +s = np.array([sheep[sheep.AgeAtDeath == model.random_effects.index.values[i]][ss].iloc[0] for i in range(len(model.random_effects.index.values))]) +s -= s.min() +s /= s.max() + +idx = np.round(s * 2000).astype(int) +C = seaborn.color_palette("coolwarm", 2001) +CC = [] +for i in idx : + CC.append(C[i]) + + +fig = plt.figure() +ax = fig.add_subplot(111, projection='3d') +plt.scatter(model.random_effects, model2.random_effects, zs=model3.random_effects, s=450, c=CC, alpha=1)#, c=np.array(C)[idx.astype(int)], alpha=1) +plt.title("Biliary Hyperplasia : Coefficients of Birth Year Random Effect\nFirst component of PCA explains 98.3%% of variance.\nWarmer colours - higher %s" % ss, y=0.9) + +for i, sheep_age in enumerate(model.random_effects.index) : + sheep_e = sheep[sheep.AgeAtDeath == sheep_age].E.iloc[0] + x2, y2, _ = proj3d.proj_transform(model.random_effects.values[i], model2.random_effects.values[i], model3.random_effects.values[i], ax.get_proj()) + plt.annotate(int(np.round(2011 - sheep_age * 11)), xy=(x2, y2), xytext=(x2-0.0105, y2-0.002)) + +ax.set_axis_bgcolor("white") +plt.savefig("../talk/figures/regressions/BDHyperplasia/mm_coefs_color_%s.png" % ss, dpi=600, jpeg_quality=100) + +# <codecell> + +plt.scatter(exposure.BirthYear, exposure.CES) + +# <codecell> + +plt.scatter(model.random_effects.index.values, idx) + +# <codecell> + + +#plt.plot([sheep[np.round(sheep.AgeAtDeath * 11) == i].E.iloc[0] for i in range(11)]) +#plt.plot(exposure.AgeAtDeath, exposure.E) +C2 = [] +for i in (np.unique(sheep[["AgeAtDeath", "BD_hyperplasia"]].dropna().AgeAtDeath) * 11).astype(int) : + C2.append(sheep[np.round(sheep.AgeAtDeath * 11) == i].E.iloc[0]) +#sheep[["AgeAtDeath", "E"]].dropna() + +# <codecell> + +plt.plot(exposure.BirthYear, exposure.E) + +# <codecell> + +exposure + +# <codecell> + +plt.text() + +# <codecell> + +df = sheep[["Interface_hepatitis", "AgeAtDeath", "Entropy"]].dropna() +ols = MixedLM.from_formula(rstr("Interface_hepatitis", ["Entropy"]), data=df, groups=df.AgeAtDeath).fit() +ols.summary() + +# <codecell> + +print mmR2(df, ols) +print ols.random_effects.abs().mean() + +# <codecell> + +print ols.summary() + +print ols.bse +print ols.bse_re +print ols.bse_fe + +# <codecell> + +histo.columns + +# <codecell> + +y1, y2 = mmPredict(df, ols) +plt.scatter(y1, y2) +plt.plot(y1,y1) + +# <codecell> + + +# <codecell> + +ols = pd.stats.api.ols(y = avesc.Portal_inflammation, x = avesc.Inflammation) +print ols.summary + +# <codecell> + +plt.scatter(avesc.FociSize, avesc.Portal_inflammation) +plt.plot(avesc.FociSize, avesc.FociSize * ols.beta[0] + ols.beta[1]) +plt.title("p = %.02e" % ols.p_value[0]) + +# <codecell> + +ols2 = pd.stats.api.ols(y = avesc.BD_hyperplasia, x = avesc[["FociSize", "Directionality", "Scale"]]) +print ols2.summary + +# <codecell> + +X = ["FociSize", "Directionality", "Scale"] +x = (ols2.beta[-1] + + ols2.beta[0] * avesc[X[0]] + + ols2.beta[1] * avesc[X[1]] + + ols2.beta[2] * avesc[X[2]]) +y = avesc.BD_hyperplasia + +plt.scatter(x, y) +#plt.plot([0, 1], [- 0.0504, 1- 0.0504] ) +#plt.plot(y, x) + +#plt.scatter(avesc.FociSize, avesc.Portal_inflammation) +#plt.plot(avesc.FociSize, avesc.FociSize * ols.beta[0] + ols.beta[1]) +#plt.title("p = %.02e" % ols.p_value[0]) + +# <codecell> + +ols3 = pd.stats.api.ols(y = avesc.Portal_inflammation, x = avesc.Inflammation) +plt.scatter(avesc.Inflammation, avesc.Portal_inflammation) +plt.plot(avesc.Inflammation, avesc.Inflammation * ols3.beta[0] + ols3.beta[1]) +print ols3.summary + +# <codecell> + +ols4 = pd.stats.api.ols(y = avesc.QTotal, x = avesc.TawfikTotal) +plt.scatter(avesc.TawfikTotal, avesc.QTotal) +plt.plot(avesc.TawfikTotal, avesc.TawfikTotal * ols4.beta[0] + ols4.beta[1]) +print ols4.summary + +# <codecell> + +df2 = sheep[["TawfikTotal", "Entropy", "AgeAtDeath"]] +df2.dropna(inplace=True) +ols = MixedLM.from_formula(rstr("TawfikTotal", ["Entropy"]), data=df2, groups="AgeAtDeath").fit() + +ols.pvalues[1:-1].values + +# <codecell> + +df = sheep[["Inflammation", "Mean_hep_size", "FociSize"]].dropna() +df["Intercept"] = np.ones(len(df)) +ols = sm.GLS(endog=df.Mean_hep_size, exog=df[["Inflammation", "FociSize"]]).fit() +ols.rsquared + +# <codecell> + +mmR2(df, ols) + + + + + + +print "YO" + +# <codecell> + +print "YO" + +# <codecell> + +y = pca.fit_transform(s[histcols]) + +# <codecell> + +mm, mmfits, mmpvals, mmqsum = test_all_linear(sheep, ["TawfikTotal"], imagecols, group="AgeAtDeath") + +# <codecell> + +df = sheep[delayer(["TawfikTotal", "Inflammation", "ResidualES"])].dropna() +df["Intercept"] = np.ones(len(df)) +tt = MixedLM(endog = df.TawfikTotal, exog = df[["Inflammation"]], groups=df.ResidualES).fit() +tt.summary() +#del fibrosis.tables[2] + + +# <codecell> + +models, fits, pvals, blah = test_all_linear(sheep, ["Ln_ap_ri"], imagecols, group="AgeAtDeath") + +# <codecell> + +np.where(np.array(fits["Ln_ap_ri"]) < (2 + np.min(fits["Ln_ap_ri"]))) + +# <codecell> + +idx = 0 +print models["Ln_ap_ri"][idx].summary() +print fits["Ln_ap_ri"][idx] + +# <codecell> + +a.model.endog_names + +plt.scatter(y, yhat, c=seaborn.color_palette("deep", 8)[0]) +plt.plot(y, y, c=seaborn.color_palette("deep", 8)[2]) +plt.xlabel(a.model.endog_names) +yl = a.model.exog_names +#yl.remove("Intercept") +plt.ylabel(", ".join(yl)) +plt.title("R2 = %.02f") +st = "%s : %.03f, p = %.03e.\n" * len(yl) +stl = [] +for i in range(len(yl)) : + stl.append(yl[i]) + stl.append(p[dependent][i]) +plt.suptitle(st % tuple(delayer([yl[i], p[dependent][i] for i in range(len(yl)) + +# <codecell> + +a.model.exog_names + +# <codecell> + + +# <codecell> + +raw_data[raw_data.TissueToSinusoid == raw_data.TissueToSinusoid.max()] + +# <codecell> + +df = sheep[["MeanHepSize", "Directionality"]].dropna() +df["Intercept"] = np.ones(len(df)) +ols = sm.GLS(endog=df.MeanHepSize, exog=df[["Directionality", "Intercept"]]).fit() +ols.summary() + +# <codecell> + +