"""Hierarchical clustering of trial "Boolean" expressions.

File: cluster/make_clustering.py

This opening section reads the dataset tag, derives the input and output
paths from it, loads the tab-separated input table and drops every row
whose "Boolean" column is null.
"""
import argparse
import collections
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster
import scipy.spatial.distance
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.manifold
import sklearn.metrics.pairwise

# The dataset tag normally comes from the command line.  Flip the condition
# to hard-code one of the tags below instead.
if True:
    p = argparse.ArgumentParser()
    p.add_argument("--tag", required=True)
    args = p.parse_args()
    tag = args.tag
else:
    tag = "Hemoglobin_CTEP Trials_072018"
    # tag = "Platelets_CTEP Trials_072018"
    # tag = "WBC_CTEP Trials_072018"
    # tag = "HIV_CTEPTrials_072018"

# Every path is derived from the tag.
input_tsv = "../nci_data/dataset1-trials/{}.tsv".format(tag)
output_pdf = "./{}.clustering.pdf".format(tag)
features_csv = "./{}.features.csv".format(tag)
linkage_matrix_csv = "./{}.linkage_matrix.csv".format(tag)

# Load data.
tb = pd.read_table(input_tsv)
num_rows_orig = tb.shape[0]

# Rows without a "Boolean" value cannot be parsed, so they are dropped and
# the remaining rows are renumbered from zero.
boolean_is_missing = tb["Boolean"].isnull()
num_rows_excluded = boolean_is_missing.sum()
tb = tb[~boolean_is_missing].reset_index(drop=True)
num_rows = tb.shape[0]

print("Excluding %d of %d rows" % (num_rows_excluded, num_rows_orig))
print("After exclusion, %d rows remain" % num_rows)

# Parse boolean.
# Comparison operators that may separate the two sides of a clause.  Longer
# spellings come first so that ">=" is never read as ">" followed by "=",
# and "!=" is never read as a left-hand side ending in "!" followed by "=".
_COMPARATOR_RE = re.compile(r"(!=|>=|<=|==|>|<|=)")

# A logical connective is a whole whitespace-delimited "OR" / "AND" token.
# Matching whole tokens (instead of replacing the substrings "OR" / "AND")
# keeps words such as "PRIOR" or "ANDROGEN" intact and guarantees that the
# number of clauses is always the number of operators plus one.
_CONNECTIVE_RE = re.compile(r"(?<!\S)(?:OR|AND)(?!\S)")


def f(b):
    """Parse one Boolean expression string into clauses and connectives.

    Parentheses are discarded, the expression is cut at every stand-alone
    "OR" / "AND" token, and each resulting clause is split once at its first
    comparison operator into a ``(left, comparator, right)`` triple.  A
    clause with no comparison operator is padded with "?" placeholders and a
    warning is printed.

    Args:
        b: The expression text (a ``str``).

    Returns:
        A dict with two keys: ``"triples"``, the list of 3-tuples of stripped
        strings in clause order, and ``"operators"``, the list of "OR" /
        "AND" tokens in the order they appear.
    """
    b = re.sub(r"[()]", "", b)
    operators = [w for w in b.split() if w in ("OR", "AND")]

    triples = []
    for clause in _CONNECTIVE_RE.split(b):
        parts = _COMPARATOR_RE.split(clause.strip(), maxsplit=1)
        t = tuple(part.strip() for part in parts)
        print(t)
        if len(t) != 3:
            # Pad on the right so the left-hand side is kept as parsed.
            new_triple = (t + ("?", "?"))[:3]
            print("Warning: {} is not of length 3, replacing with {}".format(t, new_triple))
            t = new_triple
        triples.append(t)
    return {"triples": triples, "operators": operators}


def g(b):
    """Parse ``b`` with ``f``, passing null values through unchanged."""
    if pd.isnull(b):
        return b
    return f(b)


tb["parsed"] = [g(b) for b in tb["Boolean"]]

# Keep one entry per row, in row order, so that position i in `triples` and
# `operators` always refers to row i of `tb` (and to feat[i] below).  A row
# whose parse result is null contributes empty lists instead of being
# skipped or raising.
_EMPTY_PARSE = {"triples": [], "operators": []}
_parsed_rows = [x if isinstance(x, dict) else _EMPTY_PARSE for x in tb["parsed"]]
triples = [x["triples"] for x in _parsed_rows]
operators = [x["operators"] for x in _parsed_rows]

# Make features.
feat = [collections.defaultdict(float) for _ in range(tb.shape[0])]
for row_feat, triple_list in zip(feat, triples):
    for left, comparator, right in triple_list:
        # Add count of each element alone within each triple.
        row_feat["l_count_%s" % left] += 1
        row_feat["c_count_%s" % comparator] += 1
        row_feat["r_count_%s" % right] += 1
        # Add count of each pair of elements within each triple.
        row_feat["lc_count_(%s, %s)" % (left, comparator)] += 1
        row_feat["lr_count_(%s, %s)" % (left, right)] += 1
        row_feat["cr_count_(%s, %s)" % (comparator, right)] += 1
        # Add count of each triple.
        t1 = (left, comparator, right)
        row_feat["triple_count_%s" % str(t1)] += 1
        # Add count of each pair of triples (ordered, including t1 with itself).
        for t2 in triple_list:
            row_feat["triple_pair_count_%s_%s" % (str(t1), str(t2))] += 1
for row_feat, operator_list in zip(feat, operators):
    for o1 in operator_list:
        # Add count for each operator.
        row_feat["operator_count_%s" % o1] += 1
        # Add count for each pair of operators (ordered, including o1 with itself).
        for o2 in operator_list:
            row_feat["operator_pair_count_%s_%s" % (o1, o2)] += 1

# Make feature matrix.
# Turn the per-row feature dicts into a dense (rows x features) matrix.
feature_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=False)
X = feature_vectorizer.fit_transform(feat)

# Carry out hierarchical clustering.
# Complete linkage on cosine distance is the configuration in use; the
# alternatives previously tried here were method="ward" with
# metric="euclidean" and method="average" with metric="cosine".
hc_linkage = scipy.cluster.hierarchy.linkage(X, method="complete", metric="cosine")

# Plot clustering.
# Figure height grows linearly with the number of rows.
# NOTE(review): 174 looks like the row count of a reference dataset for
# which a 25-inch-tall figure was readable - confirm.
h = 25.0 * tb.shape[0] / 174
fig = plt.figure(figsize=(25, h))
leaf_labels = list(tb["Boolean"])
dn = scipy.cluster.hierarchy.dendrogram(hc_linkage, labels=leaf_labels, orientation="left")
plt.title("Hierarchical clustering of %s " % tag)
plt.axis('tight')
# Leave the right-hand part of the figure free for the leaf labels.
plt.subplots_adjust(right=0.45)
plt.savefig(output_pdf)
plt.close(fig)

# Save features used for clustering.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to
# the old one on versions that do not have it yet.
if hasattr(feature_vectorizer, "get_feature_names_out"):
    feature_names = feature_vectorizer.get_feature_names_out()
else:
    feature_names = feature_vectorizer.get_feature_names()
feature_colnames = ["feature_%s" % x for x in feature_names]
feature_tb = pd.DataFrame(X, index=tb.index, columns=feature_colnames)
feature_with_orig_tb = pd.concat((tb, feature_tb), axis=1)
# Check row alignment before writing, so a misaligned table is never saved.
assert feature_tb.shape[0] == feature_with_orig_tb.shape[0]
feature_with_orig_tb.to_csv(features_csv)

# Save clustering output.
linkage_matrix_tb = pd.DataFrame(hc_linkage, columns=["hc_1", "hc_2", "hc_3", "hc_4"])
linkage_matrix_tb.to_csv(linkage_matrix_csv)