[cbecd8]: / cluster / make_clustering.py

Download this file

124 lines (112 with data), 4.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import argparse
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import scipy.cluster
import scipy.spatial.distance
import sklearn.cluster
import sklearn.feature_extraction
import sklearn.manifold
import sklearn.metrics.pairwise
# Resolve the dataset tag. The constant guard is a development toggle: switch
# it to False to bypass command-line parsing and use a hard-coded tag instead.
if True:
    parser = argparse.ArgumentParser()
    parser.add_argument("--tag", required=True)
    args = parser.parse_args()
    tag = args.tag
else:
    tag = "Hemoglobin_CTEP Trials_072018"
    #tag = "Platelets_CTEP Trials_072018"
    #tag = "WBC_CTEP Trials_072018"
    #tag = "HIV_CTEPTrials_072018"

# The input table and all three outputs are named after the tag; outputs are
# written to the current working directory.
input_tsv = "../nci_data/dataset1-trials/" + tag + ".tsv"
_output_stem = "./" + tag
output_pdf = _output_stem + ".clustering.pdf"
features_csv = _output_stem + ".features.csv"
linkage_matrix_csv = _output_stem + ".linkage_matrix.csv"
# Load data.
# Read the tab-separated input and keep only the rows that have a value in
# the "Boolean" column; the index is renumbered from 0 after filtering.
tb = pd.read_table(input_tsv)
num_rows_orig = tb.shape[0]

boolean_is_missing = pd.isnull(tb["Boolean"])
num_rows_excluded = sum(boolean_is_missing)

tb = tb.loc[~boolean_is_missing, :].reset_index(drop=True)
num_rows = tb.shape[0]

print("Excluding %d of %d rows" % (num_rows_excluded, num_rows_orig))
print("After exclusion, %d rows remain" % num_rows)
# Parse boolean.
def f(b):
    """Parse a boolean expression into comparison triples and logical operators.

    Parentheses are discarded, the expression is cut into terms at every
    stand-alone ``OR`` / ``AND`` token, and each term is split at its first
    comparison operator into ``(left, comparator, right)``.

    Args:
        b: Expression string, e.g. ``"(Hgb >= 9) AND (PLT > 100)"``.

    Returns:
        A dict with two keys:

        * ``"triples"``: list of 3-tuples of whitespace-stripped strings, one
          per term. A term without a comparison operator is padded with
          ``"?"`` (and a warning is printed).
        * ``"operators"``: the ``"OR"`` / ``"AND"`` tokens in order of
          appearance.
    """
    b = re.sub(r"[()]", "", b)
    operators = [w for w in b.split() if w in ("OR", "AND")]

    # Split only at whitespace-delimited OR/AND tokens, i.e. exactly the tokens
    # counted in `operators` above. Replacing the substrings "OR"/"AND" (and
    # then splitting on the placeholder) would also cut operands such as
    # "ANDROGEN", "PRIOR" or "BIOPSY" into pieces.
    terms = re.split(r"(?<!\S)(?:OR|AND)(?!\S)", b)

    triples = []
    for term in terms:
        # maxsplit=1 with a capturing group yields either [term] or
        # [left, comparator, right].
        parts = tuple(
            piece.strip()
            for piece in re.split(r"(>=|<=|>|<|==|=)", term.strip(), maxsplit=1)
        )
        print(parts)
        if len(parts) != 3:
            # Pad missing components so every entry can be unpacked as a triple.
            padded = parts + ("?",) * (3 - len(parts))
            print("Warning: {} is not of length 3, replacing with {}".format(parts, padded))
            parts = padded
        triples.append(parts)
    return {"triples": triples, "operators": operators}
def g(b):
    """Parse ``b`` with ``f``, passing missing values through unchanged.

    Args:
        b: A boolean-expression string, or a missing value as recognised by
            ``pd.isnull``.

    Returns:
        ``b`` itself when it is missing, otherwise the result of ``f(b)``.
    """
    return b if pd.isnull(b) else f(b)
# Parse each row's "Boolean" expression and keep the result on the table.
parsed_rows = [g(expression) for expression in tb["Boolean"]]
tb["parsed"] = parsed_rows
# Per-row lists of comparison triples and of logical operators. Falsy parse
# results are skipped for the triples only.
triples = [parsed["triples"] for parsed in tb["parsed"] if parsed]
operators = [parsed["operators"] for parsed in tb["parsed"]]
# Make features.
# One count dictionary per table row; features that never occur stay absent.
feat = [collections.defaultdict(float) for _ in range(tb.shape[0])]

for row_feat, row_triples in zip(feat, triples):
    for left, comparator, right in row_triples:
        # Add count of each element alone within each triple.
        row_feat["l_count_%s" % left] += 1
        row_feat["c_count_%s" % comparator] += 1
        row_feat["r_count_%s" % right] += 1
        # Add count of each pair of elements within each triple.
        row_feat["lc_count_(%s, %s)" % (left, comparator)] += 1
        row_feat["lr_count_(%s, %s)" % (left, right)] += 1
        row_feat["cr_count_(%s, %s)" % (comparator, right)] += 1
        # Add count of each triple.
        this_triple = (left, comparator, right)
        row_feat["triple_count_%s" % str(this_triple)] += 1
        # Add count of each pair of triples (ordered, including the triple
        # paired with itself).
        for other_triple in row_triples:
            pair_key = "triple_pair_count_%s_%s" % (str(this_triple), str(other_triple))
            row_feat[pair_key] += 1

for row_feat, row_operators in zip(feat, operators):
    for first_op in row_operators:
        # Add count for each operator.
        row_feat["operator_count_%s" % first_op] += 1
        # Add count for each pair of operators (ordered, including the
        # operator paired with itself).
        for second_op in row_operators:
            row_feat["operator_pair_count_%s_%s" % (first_op, second_op)] += 1
# Make feature matrix.
# Turn the per-row count dictionaries into a dense matrix, one column per
# distinct feature name.
feature_vectorizer = sklearn.feature_extraction.DictVectorizer(sparse=False)
X = feature_vectorizer.fit_transform(feat)

# Carry out hierarchical clustering.
# Complete linkage on cosine distance is the active configuration; the
# alternatives that were tried are kept below for reference.
#   method="ward",    metric="euclidean"
#   method="average", metric="cosine"
linkage_options = {"method": "complete", "metric": "cosine"}
hc_linkage = scipy.cluster.hierarchy.linkage(X, **linkage_options)
# Plot clustering.
# The figure height grows linearly with the number of rows: it is 25 when the
# table has 174 rows.
h = 25.0 * tb.shape[0] / 174
fig = plt.figure(figsize=(25, h))

# Each leaf is labelled with the original boolean expression of its row.
leaf_labels = list(tb["Boolean"])
dn = scipy.cluster.hierarchy.dendrogram(
    hc_linkage,
    labels=leaf_labels,
    orientation="left",
)

plt.title("Hierarchical clustering of %s " % tag)
plt.axis('tight')
# Leave room on the right-hand side for the leaf labels.
plt.subplots_adjust(right=0.45)
plt.savefig(output_pdf)
plt.close(fig)
# Save features used for clustering.
# DictVectorizer.get_feature_names() no longer exists in recent scikit-learn
# releases, where get_feature_names_out() replaces it; use whichever method
# the installed version provides.
_get_feature_names = getattr(feature_vectorizer, "get_feature_names_out", None)
if _get_feature_names is None:
    _get_feature_names = feature_vectorizer.get_feature_names
feature_colnames = ["feature_%s" % x for x in _get_feature_names()]

# Feature columns are appended to the original table, aligned on its index.
feature_tb = pd.DataFrame(X, index=tb.index, columns=feature_colnames)
feature_with_orig_tb = pd.concat((tb, feature_tb), axis=1)
# Sanity check before writing: the concat must not have added or dropped rows.
assert feature_tb.shape[0] == feature_with_orig_tb.shape[0]
feature_with_orig_tb.to_csv(features_csv)
# Save clustering output.
# The linkage matrix is written as-is, with its four columns given generic
# names.
linkage_columns = ["hc_1", "hc_2", "hc_3", "hc_4"]
linkage_matrix_tb = pd.DataFrame(hc_linkage, columns=linkage_columns)
linkage_matrix_tb.to_csv(linkage_matrix_csv)