Switch to unified view

a b/pipeline/main300/final340clusters.py
1
from __future__ import print_function
2
from sklearn.decomposition import TruncatedSVD
3
from sklearn.feature_extraction.text import TfidfVectorizer
4
from sklearn.feature_extraction.text import HashingVectorizer
5
from sklearn.feature_extraction.text import TfidfTransformer
6
from sklearn.preprocessing import Normalizer
7
from sklearn import metrics
8
9
from sklearn.cluster import KMeans, MiniBatchKMeans
10
11
import logging
12
import sys
13
from time import time
14
15
import numpy as np
16
import pandas as pd
17
18
# Load the cleaned sample CSV; quoting=1 is csv.QUOTE_ALL (every field quoted).
data = pd.read_csv("cleanedsmplinds.csv", sep=",", quoting=1, quotechar='"')
data = np.array(data)

# Collect the text to cluster: column 2 of every row.
# The original walked each row with a manual column counter and appended the
# column-2 value once it reached column 3, in both branches of an `if` on the
# column-3 flag, so the flag never influenced the result.  The `len(row) > 3`
# guard keeps the "row must have a column 3" condition, and `+ ""` is kept so
# a non-string cell (for instance a NaN) raises TypeError here rather than
# later inside the vectorizer.
dataset = [row[2] + "" for row in data if len(row) > 3]
35
36
# Number of clusters requested from k-means; also drives the output loops.
nb_clust = 300

# TF-IDF features over the collected texts, English stop words removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
    min_df=2,
    max_df=0.5,
    max_features=10000,
)
X = vectorizer.fit_transform(dataset)
print(X.shape)

# Single k-means++ initialisation with a fixed seed; fit() returns the
# estimator itself, so `km` is the fitted model.
km = KMeans(
    n_clusters=nb_clust,
    init='k-means++',
    n_init=1,
    max_iter=100,
    random_state=1337,
    verbose=True,
).fit(X)
44
45
# Re-read the CSV as a plain array so rows can be looked up by position when
# the cluster files are written.
# NOTE(review): this reads "cleansmplinds.csv" while the texts that were
# clustered came from "cleanedsmplinds.csv" -- confirm both files exist and
# have the same row order, otherwise labels and rows will not line up.
data2 = np.array(
    pd.read_csv("cleansmplinds.csv", sep=",", quoting=1, quotechar='"')
)

# One row per clustered text; 'cluster' is the label k-means assigned to it.
cluster_map = pd.DataFrame({'cluster': km.labels_})
51
52
# Write the text (column 2 of data2) of every row assigned to cluster x into
# that cluster's own file, one text per line.  Files are opened in append
# mode, so re-running the script adds to existing output.
for x in range(nb_clust):
    # Positions (row indices) of the texts labelled with cluster x.
    y = cluster_map[cluster_map.cluster == x]['cluster'].index
    # `with` closes (and flushes) each file; the original left all
    # nb_clust handles open until interpreter exit.
    with open('./final340numclusters2/clust_' + str(x) + '.txt', 'a') as f1:
        for n in y:
            f1.write(data2[n][2])
            f1.write("\n")
58
59
# Write the ten highest-weighted terms of each cluster centre, one cluster
# per line, appended to centers.txt.

# Feature indices of every centre, sorted by descending weight.
centers = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# `with` guarantees the file is flushed and closed; the original never
# closed this handle.
with open('./final340numclusters2/centers.txt', 'a') as f2:
    for i in range(nb_clust):
        f2.write("Cluster %d:" % i)
        for ind in centers[i, :10]:
            f2.write(' %s' % terms[ind])
        f2.write("\n")
67