--- /dev/null
+++ b/clusters/scripts/clustering.py
@@ -0,0 +1,122 @@
+# `from __future__` imports must appear before any other statement, so the
+# print_function import is moved to the top of the module.
+from __future__ import print_function
+
+import logging
+import sys
+from time import time
+
+import numpy as np
+
+import mdtraj as md                   # currently unused in this script
+from sparsehc_dm import sparsehc_dm   # currently unused in this script
+
+from sklearn import metrics
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import Normalizer
+
+
+# Display progress logs on stdout
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+
+
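+# The option flags referenced below (use_hashing, use_idf, n_features,
+# n_components, minibatch, verbose) were never defined in this script, so it
+# would fail with a NameError.  A minimal argparse sketch is added here; the
+# flag names mirror the attributes used below, and the defaults are
+# assumptions rather than values taken from the original script.
+import argparse
+
+parser = argparse.ArgumentParser(description="K-means clustering of documents")
+parser.add_argument("--use-hashing", dest="use_hashing", action="store_true",
+                    help="use a HashingVectorizer instead of a TfidfVectorizer")
+parser.add_argument("--no-idf", dest="use_idf", action="store_false",
+                    help="disable inverse-document-frequency weighting")
+parser.add_argument("--n-features", dest="n_features", type=int, default=10000,
+                    help="maximum number of features to extract")
+parser.add_argument("--lsa", dest="n_components", type=int, default=None,
+                    help="reduce to this many components with LSA (TruncatedSVD)")
+parser.add_argument("--no-minibatch", dest="minibatch", action="store_false",
+                    help="use ordinary KMeans instead of MiniBatchKMeans")
+parser.add_argument("--verbose", dest="verbose", action="store_true",
+                    help="print progress reports from the k-means estimator")
+opts = parser.parse_args()
+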
+# Uncomment the following to do the analysis on all the categories
+# categories = None
+
+# The dataset is left as a placeholder: it must provide `.data` (the raw
+# documents) and `.target` (ground-truth labels), as both are used below.
+dataset = None  # ### GET DATASET HERE
+
+labels = dataset.target
+true_k = np.unique(labels).shape[0]
+
+
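+# Feature extraction: either a HashingVectorizer (optionally re-weighted with
+# IDF) or a plain TfidfVectorizer, depending on the command-line options.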
+t0 = time()
+if opts.use_hashing:
+    if opts.use_idf:
+        # Perform an IDF normalization on the output of HashingVectorizer
+        hasher = HashingVectorizer(n_features=opts.n_features,
+                                   stop_words='english', alternate_sign=False,
+                                   norm=None, binary=False)
+        vectorizer = make_pipeline(hasher, TfidfTransformer())
+    else:
+        vectorizer = HashingVectorizer(n_features=opts.n_features,
+                                       stop_words='english',
+                                       alternate_sign=False, norm='l2',
+                                       binary=False)
+else:
+    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
+                                 min_df=2, stop_words='english',
+                                 use_idf=opts.use_idf)
+X = vectorizer.fit_transform(dataset.data)
+
+print("done in %fs" % (time() - t0))
+print("n_samples: %d, n_features: %d" % X.shape)
+print()
+
+if opts.n_components:
+    print("Performing dimensionality reduction using LSA")
+    t0 = time()
+    # Vectorizer results are normalized, which makes KMeans behave as
+    # spherical k-means for better results. Since LSA/SVD results are
+    # not normalized, we have to redo the normalization.
+    svd = TruncatedSVD(opts.n_components)
+    normalizer = Normalizer(copy=False)
+    lsa = make_pipeline(svd, normalizer)
+
+    X = lsa.fit_transform(X)
+
+    print("done in %fs" % (time() - t0))
+
+    explained_variance = svd.explained_variance_ratio_.sum()
+    print("Explained variance of the SVD step: {}%".format(
+        int(explained_variance * 100)))
+
+    print()
+
+
+# #############################################################################
+# Do the actual clustering
+
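+# The number of clusters is taken from the number of distinct ground-truth
+# labels (true_k).  MiniBatchKMeans trades a little accuracy for much lower
+# runtime on large corpora; plain KMeans is used when minibatch is disabled.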
+if opts.minibatch:
+    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
+                         init_size=1000, batch_size=1000, verbose=opts.verbose)
+else:
+    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
+                verbose=opts.verbose)
+
+print("Clustering sparse data with %s" % km)
+t0 = time()
+km.fit(X)
+print("done in %0.3fs" % (time() - t0))
+print()
+
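+# Evaluation: homogeneity, completeness, V-measure and the adjusted Rand index
+# compare the cluster assignments against the ground-truth labels; the
+# silhouette coefficient is computed from the data alone on a 1000-sample
+# subset.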
+print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
+print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
+print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
+print("Adjusted Rand-Index: %.3f"
+      % metrics.adjusted_rand_score(labels, km.labels_))
+print("Silhouette Coefficient: %0.3f"
+      % metrics.silhouette_score(X, km.labels_, sample_size=1000))
+
+print()
+
+
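+# Report the top-weighted terms per cluster.  This is only possible with the
+# TF-IDF vectorizer, since a HashingVectorizer cannot map feature indices
+# back to terms; if LSA was applied, the centroids are first projected back
+# into the original term space with the inverse SVD transform.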
+if not opts.use_hashing:
+    print("Top terms per cluster:")
+
+    if opts.n_components:
+        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
+        order_centroids = original_space_centroids.argsort()[:, ::-1]
+    else:
+        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
+
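+    # NOTE: newer scikit-learn versions renamed this to get_feature_names_out().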
+    terms = vectorizer.get_feature_names()
+    for i in range(true_k):
+        print("Cluster %d:" % i, end='')
+        for ind in order_centroids[i, :10]:
+            print(' %s' % terms[ind], end='')
+        print()
+