|
a |
|
b/extractiveSummarization/summarizers/lexrank.py |
|
|
1 |
''' |
|
|
2 |
Derived from https://github.com/crabcamp/lexrank/blob/dev/lexrank/lexrank.py |
|
|
3 |
|
|
|
4 |
MIT License |
|
|
5 |
|
|
|
6 |
Copyright (c) 2018 Ocean S.A. |
|
|
7 |
|
|
|
8 |
Permission is hereby granted, free of charge, to any person obtaining a copy |
|
|
9 |
of this software and associated documentation files (the "Software"), to deal |
|
|
10 |
in the Software without restriction, including without limitation the rights |
|
|
11 |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|
|
12 |
copies of the Software, and to permit persons to whom the Software is |
|
|
13 |
furnished to do so, subject to the following conditions: |
|
|
14 |
|
|
|
15 |
The above copyright notice and this permission notice shall be included in all |
|
|
16 |
copies or substantial portions of the Software. |
|
|
17 |
|
|
|
18 |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
|
19 |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
|
20 |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|
|
21 |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
|
22 |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|
|
23 |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|
|
24 |
SOFTWARE. |
|
|
25 |
''' |
|
|
26 |
|
|
|
27 |
import math |
|
|
28 |
import numpy as np |
|
|
29 |
import pandas as pd |
|
|
30 |
import re |
|
|
31 |
|
|
|
32 |
from collections import Counter, defaultdict |
|
|
33 |
from scipy.sparse.csgraph import connected_components |
|
|
34 |
from nltk.tokenize import word_tokenize, sent_tokenize |
|
|
35 |
from nltk.corpus import stopwords |
|
|
36 |
|
|
|
37 |
PUNCTUATION_SIGNS = set('.,;:¡!¿?…⋯&‹›«»\"“”[]()⟨⟩}{/|\\') |
|
|
38 |
|
|
|
39 |
|
|
|
40 |
class Lexrank: |
|
|
41 |
def __init__(self, documents, stop_words=None, threshold=.03, include_new_words=True): |
|
|
42 |
if not stop_words: |
|
|
43 |
self.stopwords = set(stopwords.words('english')) |
|
|
44 |
else: |
|
|
45 |
self.stopwords = stop_words |
|
|
46 |
self.threshold = threshold |
|
|
47 |
self.include_new_words = include_new_words |
|
|
48 |
self.idf_score = self._calc_idf(documents) |
|
|
49 |
|
|
|
50 |
def get_summary(self, sentences, summary_size=1): |
|
|
51 |
if not isinstance(summary_size, int) or summary_size < 1: |
|
|
52 |
raise ValueError('summary_size should be a positive integer') |
|
|
53 |
|
|
|
54 |
lex_scores = self.rank_sentences(sentences) |
|
|
55 |
sorted_ix = np.argsort(lex_scores)[::-1] |
|
|
56 |
summary = [sentences[i] for i in sorted_ix[:summary_size]] |
|
|
57 |
|
|
|
58 |
return summary |
|
|
59 |
|
|
|
60 |
def rank_sentences(self, sentences): |
|
|
61 |
tf_scores = [ |
|
|
62 |
Counter(self.tokenize_into_words(sentence)) for sentence in sentences |
|
|
63 |
] |
|
|
64 |
|
|
|
65 |
sim_matrix = self._calc_sim_matrix(tf_scores) |
|
|
66 |
|
|
|
67 |
scores = degree_centrality_scores(sim_matrix, threshold=self.threshold) |
|
|
68 |
return scores |
|
|
69 |
|
|
|
70 |
def tokenize_into_words(self, sentence): |
|
|
71 |
tokens = word_tokenize(str(sentence).lower()) |
|
|
72 |
tokens = [w for w in tokens if not w in self.stopwords] |
|
|
73 |
tokens = [w for w in tokens if not w in PUNCTUATION_SIGNS] |
|
|
74 |
return tokens |
|
|
75 |
|
|
|
76 |
def _calc_idf(self, documents): |
|
|
77 |
#print("calculating idf") |
|
|
78 |
bags_of_words = [] |
|
|
79 |
|
|
|
80 |
for i, doc in enumerate(documents): |
|
|
81 |
doc_words = set() |
|
|
82 |
|
|
|
83 |
for sentence in doc: |
|
|
84 |
words = self.tokenize_into_words(sentence) |
|
|
85 |
doc_words.update(words) |
|
|
86 |
|
|
|
87 |
if doc_words: |
|
|
88 |
bags_of_words.append(doc_words) |
|
|
89 |
|
|
|
90 |
if not bags_of_words: |
|
|
91 |
raise ValueError('bag of words is empty') |
|
|
92 |
|
|
|
93 |
doc_number_total = len(bags_of_words) |
|
|
94 |
print("total docs processed %d" %doc_number_total) |
|
|
95 |
|
|
|
96 |
if self.include_new_words: |
|
|
97 |
default_value = 1 |
|
|
98 |
|
|
|
99 |
else: |
|
|
100 |
default_value = 0 |
|
|
101 |
|
|
|
102 |
idf_score = defaultdict(lambda: default_value) |
|
|
103 |
|
|
|
104 |
for word in set.union(*bags_of_words): |
|
|
105 |
doc_number_word = sum(1 for bag in bags_of_words if word in bag) |
|
|
106 |
idf_score[word] = math.log(doc_number_total / doc_number_word) |
|
|
107 |
#print("idf scores done") |
|
|
108 |
return idf_score |
|
|
109 |
|
|
|
110 |
def _calc_sim_matrix(self, tf_scores): |
|
|
111 |
length = len(tf_scores) |
|
|
112 |
|
|
|
113 |
matrix = np.zeros([length] * 2) |
|
|
114 |
|
|
|
115 |
for i in range(length): |
|
|
116 |
for j in range(i, length): |
|
|
117 |
similarity = self._idf_modified_cosine(tf_scores, i, j) |
|
|
118 |
|
|
|
119 |
if similarity: |
|
|
120 |
matrix[i, j] = similarity |
|
|
121 |
matrix[j, i] = similarity |
|
|
122 |
|
|
|
123 |
return matrix |
|
|
124 |
|
|
|
125 |
def _idf_modified_cosine(self, tf_scores, i, j): |
|
|
126 |
if i == j: |
|
|
127 |
return 1 |
|
|
128 |
|
|
|
129 |
tf_i, tf_j = tf_scores[i], tf_scores[j] |
|
|
130 |
words_i, words_j = set(tf_i.keys()), set(tf_j.keys()) |
|
|
131 |
|
|
|
132 |
nominator = 0 |
|
|
133 |
|
|
|
134 |
for word in words_i & words_j: |
|
|
135 |
idf = self.idf_score[word] |
|
|
136 |
nominator += tf_i[word] * tf_j[word] * idf ** 2 |
|
|
137 |
|
|
|
138 |
if math.isclose(nominator, 0): |
|
|
139 |
return 0 |
|
|
140 |
|
|
|
141 |
denominator_i, denominator_j = 0, 0 |
|
|
142 |
|
|
|
143 |
for word in words_i: |
|
|
144 |
tfidf = tf_i[word] * self.idf_score[word] |
|
|
145 |
denominator_i += tfidf ** 2 |
|
|
146 |
|
|
|
147 |
for word in words_j: |
|
|
148 |
tfidf = tf_j[word] * self.idf_score[word] |
|
|
149 |
denominator_j += tfidf ** 2 |
|
|
150 |
|
|
|
151 |
similarity = nominator / math.sqrt(denominator_i * denominator_j) |
|
|
152 |
|
|
|
153 |
return similarity |
|
|
154 |
|
|
|
155 |
def create_markov_matrix(weights_matrix): |
|
|
156 |
n_1, n_2 = weights_matrix.shape |
|
|
157 |
if n_1 != n_2: |
|
|
158 |
raise ValueError('weights_matrix should be square') |
|
|
159 |
|
|
|
160 |
row_sum = weights_matrix.sum(axis=1, keepdims=True) |
|
|
161 |
|
|
|
162 |
return weights_matrix / row_sum |
|
|
163 |
|
|
|
164 |
|
|
|
165 |
def create_markov_matrix_discrete(weights_matrix, threshold): |
|
|
166 |
discrete_weights_matrix = np.zeros(weights_matrix.shape) |
|
|
167 |
ixs = np.where(weights_matrix >= threshold) |
|
|
168 |
discrete_weights_matrix[ixs] = 1 |
|
|
169 |
|
|
|
170 |
return create_markov_matrix(discrete_weights_matrix) |
|
|
171 |
|
|
|
172 |
def _power_method(transition_matrix): |
|
|
173 |
eigenvector = np.ones(len(transition_matrix)) |
|
|
174 |
|
|
|
175 |
if len(eigenvector) == 1: |
|
|
176 |
return eigenvector |
|
|
177 |
|
|
|
178 |
transition = transition_matrix.transpose() |
|
|
179 |
|
|
|
180 |
while True: |
|
|
181 |
eigenvector_next = np.dot(transition, eigenvector) |
|
|
182 |
|
|
|
183 |
if np.allclose(eigenvector_next, eigenvector): |
|
|
184 |
return eigenvector_next |
|
|
185 |
|
|
|
186 |
eigenvector = eigenvector_next |
|
|
187 |
#increases speed but also increases space taken |
|
|
188 |
transition = np.dot(transition, transition) |
|
|
189 |
|
|
|
190 |
def degree_centrality_scores(sim_matrix, threshold=None): |
|
|
191 |
if not (threshold is None or isinstance(threshold, float) and 0 <= threshold < 1): |
|
|
192 |
raise ValueError('threshold should be a floating-point number ''from the interval [0, 1) or None') |
|
|
193 |
|
|
|
194 |
if threshold is None: |
|
|
195 |
markov_matrix = create_markov_matrix(sim_matrix) |
|
|
196 |
else: |
|
|
197 |
markov_matrix = create_markov_matrix_discrete(sim_matrix, threshold) |
|
|
198 |
scores = stationary_distribution(markov_matrix, normalized=False) |
|
|
199 |
return scores |
|
|
200 |
|
|
|
201 |
def stationary_distribution(transition_matrix, normalized=True): |
|
|
202 |
n_1, n_2 = transition_matrix.shape |
|
|
203 |
if n_1 != n_2: |
|
|
204 |
raise ValueError('transition_matrix should be square') |
|
|
205 |
|
|
|
206 |
distribution = np.zeros(n_1) |
|
|
207 |
grouped_indices = connected_nodes(transition_matrix) |
|
|
208 |
|
|
|
209 |
for group in grouped_indices: |
|
|
210 |
t_matrix = transition_matrix[np.ix_(group, group)] |
|
|
211 |
eigenvector = _power_method(t_matrix) |
|
|
212 |
distribution[group] = eigenvector |
|
|
213 |
|
|
|
214 |
if normalized: |
|
|
215 |
distribution /= n_1 |
|
|
216 |
|
|
|
217 |
return distribution |
|
|
218 |
|
|
|
219 |
def connected_nodes(matrix): |
|
|
220 |
_, labels = connected_components(matrix) |
|
|
221 |
groups = [] |
|
|
222 |
for tag in np.unique(labels): |
|
|
223 |
group = np.where(labels == tag)[0] |
|
|
224 |
groups.append(group) |
|
|
225 |
|
|
|
226 |
return groups |