Switch to side-by-side view

--- a
+++ b/cluster/make_clustering.py
@@ -0,0 +1,123 @@
+import argparse
+import collections
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import re
+import scipy.cluster
+import scipy.spatial.distance
+import sklearn.cluster
+import sklearn.feature_extraction
+import sklearn.manifold
+import sklearn.metrics.pairwise
+
+if True:
+  p = argparse.ArgumentParser()
+  p.add_argument("--tag", required=True)
+  args = p.parse_args()
+  tag = args.tag
+else:
+  tag = "Hemoglobin_CTEP Trials_072018"
+  #tag = "Platelets_CTEP Trials_072018"
+  #tag = "WBC_CTEP Trials_072018"
+  #tag = "HIV_CTEPTrials_072018"
+
+input_tsv = "../nci_data/dataset1-trials/" + tag + ".tsv"
+output_pdf = "./" + tag + ".clustering.pdf"
+features_csv = "./" + tag + ".features.csv"
+linkage_matrix_csv = "./" + tag + ".linkage_matrix.csv"
+
+# Load data.
+tb = pd.read_table(input_tsv)
+num_rows_excluded = sum(pd.isnull(tb["Boolean"]))
+num_rows_orig = tb.shape[0]
+tb = tb.loc[~pd.isnull(tb["Boolean"]),:]
+tb = tb.reset_index(drop=True)
+num_rows = tb.shape[0]
+print("Excluding %d of %d rows" % (num_rows_excluded, num_rows_orig))
+print("After exclusion, %d rows remain" % num_rows)
+
+# Parse boolean.
+def f(b):
+  b = re.sub(r"[()]", "", b)
+  operators = [w for w in b.split() if w in ("OR", "AND")]
+  as_ops = b.replace("OR", "OP").replace("AND", "OP")
+  triples = [tuple(re.split(r'(>=|<=|>|<|==|=)', t.strip(), maxsplit=1)) for t in as_ops.split("OP")]
+  triples = [tuple(ti.strip() for ti in t) for t in triples]
+  for i, t in enumerate(triples):
+    print(t)
+    if len(t) == 2:
+      new_triple = (t[0], t[1], "?")
+      print("Warning: {} is not of length 3, replacing with {}".format(t, new_triple))
+      triples[i] = new_triple
+    if len(t) == 1:
+      new_triple = (t[0], "?", "?")
+      print("Warning: {} is not of length 3, replacing with {}".format(t, new_triple))
+      triples[i] = new_triple
+  return {"triples": triples, "operators": operators}
+def g(b):
+  if pd.isnull(b):
+    return b
+  else:
+    return f(b)
+tb["parsed"] = [g(b) for b in tb["Boolean"]]
+
+triples = [x["triples"] for x in tb["parsed"] if x]
+operators = [x["operators"] for x in tb["parsed"]]
+
+# Make features.
+feat = [collections.defaultdict(float) for i in range(tb.shape[0])]
+for i, triple_list in enumerate(triples):
+  for l, c, r in triple_list:
+    # Add count of each element alone within each triple.
+    feat[i]["l_count_%s" % l] += 1
+    feat[i]["c_count_%s" % c] += 1
+    feat[i]["r_count_%s" % r] += 1
+    # Add count of each pair of elements within each triple.
+    feat[i]["lc_count_(%s, %s)" % (l, c)] += 1
+    feat[i]["lr_count_(%s, %s)" % (l, r)] += 1
+    feat[i]["cr_count_(%s, %s)" % (c, r)] += 1
+    # Add count of each triple.
+    t1 = (l, c, r)
+    feat[i]["triple_count_%s" % str(t1)] += 1
+    # Add count of each pair of triples.
+    for t2 in triple_list:
+      feat[i]["triple_pair_count_%s_%s" % (str(t1), str(t2))] += 1
+for i, operator_list in enumerate(operators):
+  for o1 in operator_list:
+    # Add count for each operator.
+    feat[i]["operator_count_%s" % o1] += 1
+    # Add count for each pair of operators.
+    for o2 in operator_list:
+      feat[i]["operator_pair_count_%s_%s" % (o1, o2)] += 1
+
+# Make feature matrix.
+feature_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=False)
+X = feature_vectorizer.fit_transform(feat)
+
+# Carry out hierarchical clustering.
+#hc_linkage = scipy.cluster.hierarchy.linkage(X, method="ward", metric="euclidean")
+hc_linkage = scipy.cluster.hierarchy.linkage(X, method="complete", metric="cosine")
+#hc_linkage = scipy.cluster.hierarchy.linkage(X, method="average", metric="cosine")
+
+# Plot clustering.
+h = 25.0 * tb.shape[0] / 174
+fig = plt.figure(figsize=(25, h))
+leaf_labels = [x for x in tb["Boolean"]]
+dn = scipy.cluster.hierarchy.dendrogram(hc_linkage, labels=leaf_labels, orientation="left")
+plt.title("Hierarchical clustering of %s " % tag)
+plt.axis('tight')
+plt.subplots_adjust(right=0.45)
+plt.savefig(output_pdf)
+plt.close(fig)
+
+# Save features used for clustering.
+feature_colnames = ["feature_%s" % x for x in feature_vectorizer.get_feature_names()]
+feature_tb = pd.DataFrame(X, index=tb.index, columns=feature_colnames)
+feature_with_orig_tb = pd.concat((tb, feature_tb), axis=1)
+feature_with_orig_tb.to_csv(features_csv)
+assert feature_tb.shape[0] == feature_with_orig_tb.shape[0]
+
+# Save clustering output.
+linkage_matrix_tb = pd.DataFrame(hc_linkage, columns=["hc_1", "hc_2", "hc_3", "hc_4"])
+linkage_matrix_tb.to_csv(linkage_matrix_csv)