[748a59]: / ct-eligible-flask / app / suggestions / tfidf_converter.py

Download this file

50 lines (36 with data), 1.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import math
class TfidfConverter:
    """Turn per-cluster raw term counts into TF-IDF weighted vectors.

    ``freq_vectors`` is a mapping of cluster key -> {term: raw count}. Each
    cluster is treated as one document when counting document frequencies.

    Attributes filled in by ``generate_tfidf_vectors``:
        term_freq:     cluster -> {term: count / total count in that cluster}
        doc_freq:      term -> number of clusters whose vector contains the
                       term as a key (stored as a float)
        idf:           term -> log(doc_size / doc_freq[term])
        tfidf_vectors: cluster -> {term: term_freq * idf}
    """

    def __init__(self, freq_vectors):
        """Store the raw count vectors and create empty result tables.

        ``doc_size`` is the number of clusters at construction time; it is
        not refreshed if ``freq_vectors`` is mutated afterwards.
        """
        self.freq_vectors = freq_vectors
        self.doc_size = len(freq_vectors)
        self.term_freq = {}
        self.doc_freq = {}
        self.idf = {}
        self.tfidf_vectors = {}

    def generate_tfidf_vectors(self):
        """Compute TF, DF, IDF and store the result in ``self.tfidf_vectors``.

        Every derived table is rebuilt from scratch, so calling this method
        repeatedly yields the same result instead of accumulating counts.
        Returns ``None``; read the result from ``self.tfidf_vectors``.
        """
        # Calculate TF and DF
        self.populate_term_doc_freq()
        # Calculate IDF
        self.calculate_idf()
        # Calculate TF-IDF
        idf = self.idf
        self.tfidf_vectors = {
            cluster: {term: freq * idf[term] for term, freq in tf.items()}
            for cluster, tf in self.term_freq.items()
        }

    def calculate_idf(self):
        """Rebuild ``self.idf`` as log(doc_size / doc_freq) for every term.

        Uses the natural logarithm with no smoothing, so a term present in
        every cluster gets an IDF of exactly 0.
        """
        # float() keeps this a true division even under Python 2 semantics.
        doc_size = float(self.doc_size)
        self.idf = {
            term: math.log(doc_size / doc_freq)
            for term, doc_freq in self.doc_freq.items()
        }

    def populate_term_doc_freq(self):
        """Rebuild ``self.term_freq`` and ``self.doc_freq`` from the counts.

        Term frequency is the raw count divided by the sum of all counts in
        the same cluster. A cluster whose counts sum to zero gets a term
        frequency of 0.0 for each of its terms rather than raising
        ``ZeroDivisionError``. Document frequency counts key presence: a term
        listed with a count of 0 still counts towards ``doc_freq``.
        """
        # Build into fresh dicts so counts from an earlier run are discarded;
        # incrementing the old ``doc_freq`` would inflate it on every call.
        term_freq = {}
        doc_freq = {}
        for cluster, freq_vector in self.freq_vectors.items():
            total = float(sum(freq_vector.values()))
            freq_dict = {}
            for term, count in freq_vector.items():
                freq_dict[term] = count / total if total else 0.0
                doc_freq[term] = doc_freq.get(term, 0.0) + 1.0
            term_freq[cluster] = freq_dict
        self.term_freq = term_freq
        self.doc_freq = doc_freq