--- a
+++ b/clusters/scripts/clustering.py
@@ -0,0 +1,122 @@
+from __future__ import print_function
+
+import argparse
+import logging
+import sys
+from time import time
+
+import numpy as np
+import mdtraj as md
+from sparsehc_dm import sparsehc_dm
+
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import Normalizer
+from sklearn import metrics
+from sklearn.cluster import KMeans, MiniBatchKMeans
+
+
+# Display progress logs on stdout
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+# Command-line options referenced throughout the script (reconstructed; the
+# upstream scikit-learn text-clustering example defines an equivalent set).
+parser = argparse.ArgumentParser(description="Cluster documents with KMeans.")
+parser.add_argument("--n-components", type=int, default=0,
+                    help="dimensionality for LSA (0 disables the LSA step)")
+parser.add_argument("--n-features", type=int, default=10000,
+                    help="maximum number of features to extract")
+parser.add_argument("--use-hashing", action="store_true",
+                    help="use a HashingVectorizer instead of TfidfVectorizer")
+parser.add_argument("--no-idf", dest="use_idf", action="store_false",
+                    help="disable inverse-document-frequency weighting")
+parser.add_argument("--minibatch", action="store_true",
+                    help="use MiniBatchKMeans instead of KMeans")
+parser.add_argument("--verbose", action="store_true",
+                    help="print progress reports while clustering")
+opts = parser.parse_args()
+
+
+# Uncomment the following to do the analysis on all the categories
+# categories = None
+
+dataset = None  # TODO: get dataset here; it must expose .data and .target
+
+labels = dataset.target
+true_k = np.unique(labels).shape[0]
+
+
+t0 = time()
+if opts.use_hashing:
+    if opts.use_idf:
+        # Perform an IDF normalization on the output of HashingVectorizer
+        hasher = HashingVectorizer(n_features=opts.n_features,
+                                   stop_words='english', alternate_sign=False,
+                                   norm=None, binary=False)
+        vectorizer = make_pipeline(hasher, TfidfTransformer())
+    else:
+        vectorizer = HashingVectorizer(n_features=opts.n_features,
+                                       stop_words='english',
+                                       alternate_sign=False, norm='l2',
+                                       binary=False)
+else:
+    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
+                                 min_df=2, stop_words='english',
+                                 use_idf=opts.use_idf)
+X = vectorizer.fit_transform(dataset.data)
+
+print("done in %fs" % (time() - t0))
+print("n_samples: %d, n_features: %d" % X.shape)
+print()
+
+if opts.n_components:
+    print("Performing dimensionality reduction using LSA")
+    t0 = time()
+    # Vectorizer results are normalized, which makes KMeans behave as
+    # spherical k-means for better results. Since LSA/SVD results are
+    # not normalized, we have to redo the normalization.
+    svd = TruncatedSVD(opts.n_components)
+    normalizer = Normalizer(copy=False)
+    lsa = make_pipeline(svd, normalizer)
+
+    X = lsa.fit_transform(X)
+
+    print("done in %fs" % (time() - t0))
+
+    explained_variance = svd.explained_variance_ratio_.sum()
+    print("Explained variance of the SVD step: {}%".format(
+        int(explained_variance * 100)))
+
+    print()
+
+
+# #############################################################################
+# Do the actual clustering
+
+if opts.minibatch:
+    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
+                         init_size=1000, batch_size=1000,
+                         verbose=opts.verbose)
+else:
+    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
+                verbose=opts.verbose)
+
+print("Clustering sparse data with %s" % km)
+t0 = time()
+km.fit(X)
+print("done in %0.3fs" % (time() - t0))
+print()
+
+print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
+print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
+print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
+print("Adjusted Rand-Index: %.3f"
+      % metrics.adjusted_rand_score(labels, km.labels_))
+print("Silhouette Coefficient: %0.3f"
+      % metrics.silhouette_score(X, km.labels_, sample_size=1000))
+
+print()
+
+
+if not opts.use_hashing:
+    print("Top terms per cluster:")
+
+    if opts.n_components:
+        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
+        order_centroids = original_space_centroids.argsort()[:, ::-1]
+    else:
+        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
+
+    terms = vectorizer.get_feature_names()
+    for i in range(true_k):
+        print("Cluster %d:" % i, end='')
+        for ind in order_centroids[i, :10]:
+            print(' %s' % terms[ind], end='')
+        print()
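Note on the dataset placeholder: the script only needs an object that exposes dataset.data (a sequence of text documents) and dataset.target (integer labels); the vectorizers, the clustering, and the evaluation metrics all work off those two attributes. As a minimal sketch, assuming the 20-newsgroups corpus that the upstream scikit-learn clustering example uses, the loader below would satisfy that interface; the category subset is purely illustrative and not part of this patch.

    # Illustrative only: any object providing .data and .target works here.
    from sklearn.datasets import fetch_20newsgroups

    categories = ['alt.atheism', 'talk.religion.misc',
                  'comp.graphics', 'sci.space']  # hypothetical subset
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                                 shuffle=True, random_state=42)
    print("%d documents, %d categories" % (len(dataset.data),
                                           len(dataset.target_names)))

With the argparse options sketched in the patch (names are a reconstruction, not confirmed by the original), a typical invocation would look like: python clusters/scripts/clustering.py --minibatch --n-components 100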