[c09aa8]: / clusters / scripts / clustering.py

Download this file

123 lines (93 with data), 4.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import mdtraj as md
from sparsehc_dm import sparsehc_dm
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
import sys
from time import time
import numpy as np
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s'
# Uncomment the following to do the analysis on all the categories
# categories = None
dataset = ####GET DATASET HERE
labels = dataset.target
true_k = np.unique(labels).shape[0]
t0 = time()
if opts.use_hashing:
if opts.use_idf:
# Perform an IDF normalization on the output of HashingVectorizer
hasher = HashingVectorizer(n_features=opts.n_features,
stop_words='english', alternate_sign=False,
norm=None, binary=False)
vectorizer = make_pipeline(hasher, TfidfTransformer())
else:
vectorizer = HashingVectorizer(n_features=opts.n_features,
stop_words='english',
alternate_sign=False, norm='l2',
binary=False)
else:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
min_df=2, stop_words='english',
use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()
if opts.n_components:
print("Performing dimensionality reduction using LSA")
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(opts.n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
print("done in %fs" % (time() - t0))
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
int(explained_variance * 100)))
print()
# #############################################################################
# Do the actual clustering
if opts.minibatch:
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
verbose=opts.verbose)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
% metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, km.labels_, sample_size=1000))
print()
if not opts.use_hashing:
print("Top terms per cluster:")
if opts.n_components:
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
print("Cluster %d:" % i, end='')
for ind in order_centroids[i, :10]:
print(' %s' % terms[ind], end='')
print()