CTEligible / Git / [cbecd8] /cluster/make

Models:
joseph-gordon/
CTEligible
Downloads: 1
[cbecd8]: / cluster / make_clustering.py
History
Download this file
124 lines (112 with data), 4.4 kB

import argparse
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import scipy.cluster
import scipy.spatial.distance
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.manifold
import sklearn.metrics.pairwise

if True:
  p = argparse.ArgumentParser()
  p.add_argument("--tag", required=True)
  args = p.parse_args()
  tag = args.tag
else:
  tag = "Hemoglobin_CTEP Trials_072018"
  #tag = "Platelets_CTEP Trials_072018"
  #tag = "WBC_CTEP Trials_072018"
  #tag = "HIV_CTEPTrials_072018"

input_tsv = "../nci_data/dataset1-trials/" + tag + ".tsv"
output_pdf = "./" + tag + ".clustering.pdf"
features_csv = "./" + tag + ".features.csv"
linkage_matrix_csv = "./" + tag + ".linkage_matrix.csv"

# Load data.
tb = pd.read_table(input_tsv)
num_rows_excluded = sum(pd.isnull(tb["Boolean"]))
num_rows_orig = tb.shape[0]
tb = tb.loc[~pd.isnull(tb["Boolean"]),:]
tb = tb.reset_index(drop=True)
num_rows = tb.shape[0]
print("Excluding %d of %d rows" % (num_rows_excluded, num_rows_orig))
print("After exclusion, %d rows remain" % num_rows)

# Parse boolean.
def f(b):
  b = re.sub(r"[()]", "", b)
  operators = [w for w in b.split() if w in ("OR", "AND")]
  as_ops = b.replace("OR", "OP").replace("AND", "OP")
  triples = [tuple(re.split(r'(>=|<=|>|<|==|=)', t.strip(), maxsplit=1)) for t in as_ops.split("OP")]
  triples = [tuple(ti.strip() for ti in t) for t in triples]
  for i, t in enumerate(triples):
    print(t)
    if len(t) == 2:
      new_triple = (t[0], t[1], "?")
      print("Warning: {} is not of length 3, replacing with {}".format(t, new_triple))
      triples[i] = new_triple
    if len(t) == 1:
      new_triple = (t[0], "?", "?")
      print("Warning: {} is not of length 3, replacing with {}".format(t, new_triple))
      triples[i] = new_triple
  return {"triples": triples, "operators": operators}
def g(b):
  if pd.isnull(b):
    return b
  else:
    return f(b)
tb["parsed"] = [g(b) for b in tb["Boolean"]]

triples = [x["triples"] for x in tb["parsed"] if x]
operators = [x["operators"] for x in tb["parsed"]]

# Make features.
feat = [collections.defaultdict(float) for i in range(tb.shape[0])]
for i, triple_list in enumerate(triples):
  for l, c, r in triple_list:
    # Add count of each element alone within each triple.
    feat[i]["l_count_%s" % l] += 1
    feat[i]["c_count_%s" % c] += 1
    feat[i]["r_count_%s" % r] += 1
    # Add count of each pair of elements within each triple.
    feat[i]["lc_count_(%s, %s)" % (l, c)] += 1
    feat[i]["lr_count_(%s, %s)" % (l, r)] += 1
    feat[i]["cr_count_(%s, %s)" % (c, r)] += 1
    # Add count of each triple.
    t1 = (l, c, r)
    feat[i]["triple_count_%s" % str(t1)] += 1
    # Add count of each pair of triples.
    for t2 in triple_list:
      feat[i]["triple_pair_count_%s_%s" % (str(t1), str(t2))] += 1
for i, operator_list in enumerate(operators):
  for o1 in operator_list:
    # Add count for each operator.
    feat[i]["operator_count_%s" % o1] += 1
    # Add count for each pair of operators.
    for o2 in operator_list:
      feat[i]["operator_pair_count_%s_%s" % (o1, o2)] += 1

# Make feature matrix.
feature_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=False)
X = feature_vectorizer.fit_transform(feat)

# Carry out hierarchical clustering.
#hc_linkage = scipy.cluster.hierarchy.linkage(X, method="ward", metric="euclidean")
hc_linkage = scipy.cluster.hierarchy.linkage(X, method="complete", metric="cosine")
#hc_linkage = scipy.cluster.hierarchy.linkage(X, method="average", metric="cosine")

# Plot clustering.
h = 25.0 * tb.shape[0] / 174
fig = plt.figure(figsize=(25, h))
leaf_labels = [x for x in tb["Boolean"]]
dn = scipy.cluster.hierarchy.dendrogram(hc_linkage, labels=leaf_labels, orientation="left")
plt.title("Hierarchical clustering of %s " % tag)
plt.axis('tight')
plt.subplots_adjust(right=0.45)
plt.savefig(output_pdf)
plt.close(fig)

# Save features used for clustering.
feature_colnames = ["feature_%s" % x for x in feature_vectorizer.get_feature_names()]
feature_tb = pd.DataFrame(X, index=tb.index, columns=feature_colnames)
feature_with_orig_tb = pd.concat((tb, feature_tb), axis=1)
feature_with_orig_tb.to_csv(features_csv)
assert feature_tb.shape[0] == feature_with_orig_tb.shape[0]

# Save clustering output.
linkage_matrix_tb = pd.DataFrame(hc_linkage, columns=["hc_1", "hc_2", "hc_3", "hc_4"])
linkage_matrix_tb.to_csv(linkage_matrix_csv)