extractiveSummarization/summarizers/lexrank.py
'''
Derived from https://github.com/crabcamp/lexrank/blob/dev/lexrank/lexrank.py

MIT License

Copyright (c) 2018 Ocean S.A.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import math
import numpy as np

from collections import Counter, defaultdict
from scipy.sparse.csgraph import connected_components
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

PUNCTUATION_SIGNS = set('.,;:¡!¿?…⋯&‹›«»\"“”[]()⟨⟩}{/|\\')


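# LexRank (Erkan & Radev, 2004): rank sentences by centrality in a graph
# whose edge weights are idf-modified cosine similarities between sentence
# term-frequency vectors. IDF weights are fitted once on the `documents`
# corpus passed to the constructor; get_summary() then scores any list of
# sentences against those weights.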
class Lexrank:
    def __init__(self, documents, stop_words=None, threshold=.03, include_new_words=True):
        if not stop_words:
            self.stopwords = set(stopwords.words('english'))
        else:
            self.stopwords = stop_words
        self.threshold = threshold
        self.include_new_words = include_new_words
        self.idf_score = self._calc_idf(documents)

    def get_summary(self, sentences, summary_size=1):
        if not isinstance(summary_size, int) or summary_size < 1:
            raise ValueError('summary_size should be a positive integer')

        lex_scores = self.rank_sentences(sentences)
        sorted_ix = np.argsort(lex_scores)[::-1]
        summary = [sentences[i] for i in sorted_ix[:summary_size]]

        return summary

    def rank_sentences(self, sentences):
        # Term-frequency counts per sentence; the similarity measure weights
        # these raw counts by the corpus IDF scores.
        tf_scores = [
            Counter(self.tokenize_into_words(sentence)) for sentence in sentences
        ]

        sim_matrix = self._calc_sim_matrix(tf_scores)

        scores = degree_centrality_scores(sim_matrix, threshold=self.threshold)
        return scores

    def tokenize_into_words(self, sentence):
        tokens = word_tokenize(str(sentence).lower())
        tokens = [w for w in tokens if w not in self.stopwords]
        tokens = [w for w in tokens if w not in PUNCTUATION_SIGNS]
        return tokens

    def _calc_idf(self, documents):
        bags_of_words = []

        for doc in documents:
            doc_words = set()

            for sentence in doc:
                words = self.tokenize_into_words(sentence)
                doc_words.update(words)

            if doc_words:
                bags_of_words.append(doc_words)

        if not bags_of_words:
            raise ValueError('bag of words is empty')

        doc_number_total = len(bags_of_words)
        print("total docs processed %d" % doc_number_total)

        # Words unseen at fit time get IDF 1 (include_new_words) or are
        # effectively ignored with IDF 0.
        default_value = 1 if self.include_new_words else 0

        idf_score = defaultdict(lambda: default_value)

        for word in set.union(*bags_of_words):
            doc_number_word = sum(1 for bag in bags_of_words if word in bag)
            idf_score[word] = math.log(doc_number_total / doc_number_word)

        return idf_score

    def _calc_sim_matrix(self, tf_scores):
        length = len(tf_scores)

        matrix = np.zeros([length] * 2)

        for i in range(length):
            for j in range(i, length):
                similarity = self._idf_modified_cosine(tf_scores, i, j)

                if similarity:
                    matrix[i, j] = similarity
                    matrix[j, i] = similarity

        return matrix

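    # idf-modified cosine between sentences i and j (Erkan & Radev, 2004):
    #   sim(x, y) = sum_w tf_x(w) tf_y(w) idf(w)^2
    #               / ( sqrt(sum_w (tf_x(w) idf(w))^2) * sqrt(sum_w (tf_y(w) idf(w))^2) )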
    def _idf_modified_cosine(self, tf_scores, i, j):
        if i == j:
            return 1

        tf_i, tf_j = tf_scores[i], tf_scores[j]
        words_i, words_j = set(tf_i.keys()), set(tf_j.keys())

        numerator = 0

        for word in words_i & words_j:
            idf = self.idf_score[word]
            numerator += tf_i[word] * tf_j[word] * idf ** 2

        if math.isclose(numerator, 0):
            return 0

        denominator_i, denominator_j = 0, 0

        for word in words_i:
            tfidf = tf_i[word] * self.idf_score[word]
            denominator_i += tfidf ** 2

        for word in words_j:
            tfidf = tf_j[word] * self.idf_score[word]
            denominator_j += tfidf ** 2

        similarity = numerator / math.sqrt(denominator_i * denominator_j)

        return similarity

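# Row-normalize a non-negative weights matrix into a row-stochastic (Markov)
# transition matrix. Rows summing to zero would divide by zero; callers are
# expected to pass matrices without empty rows (the similarity matrix always
# carries 1.0 on its diagonal).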
def create_markov_matrix(weights_matrix):
    n_1, n_2 = weights_matrix.shape
    if n_1 != n_2:
        raise ValueError('weights_matrix should be square')

    row_sum = weights_matrix.sum(axis=1, keepdims=True)

    return weights_matrix / row_sum


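# Binarize the similarity graph at `threshold` before normalizing, as in the
# classic LexRank formulation: an edge exists only where similarity clears
# the threshold.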
def create_markov_matrix_discrete(weights_matrix, threshold):
    discrete_weights_matrix = np.zeros(weights_matrix.shape)
    ixs = np.where(weights_matrix >= threshold)
    discrete_weights_matrix[ixs] = 1

    return create_markov_matrix(discrete_weights_matrix)


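# Power iteration: repeatedly apply the transposed transition matrix to a
# uniform start vector until it stops changing, yielding the dominant left
# eigenvector, i.e. the stationary distribution of the Markov chain.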
def _power_method(transition_matrix):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    while True:
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next
        # Squaring the matrix each pass grows the number of underlying Markov
        # steps exponentially: faster convergence, at the cost of an extra
        # O(n^3) multiply and the memory for the dense product.
        transition = np.dot(transition, transition)


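# Map a similarity matrix to LexRank scores: either binarize at `threshold`
# (discrete LexRank) or keep continuous weights, then take the stationary
# distribution of the resulting Markov chain.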
def degree_centrality_scores(sim_matrix, threshold=None):
    if not (threshold is None or (isinstance(threshold, float) and 0 <= threshold < 1)):
        raise ValueError('threshold should be a floating-point number '
                         'from the interval [0, 1) or None')

    if threshold is None:
        markov_matrix = create_markov_matrix(sim_matrix)
    else:
        markov_matrix = create_markov_matrix_discrete(sim_matrix, threshold)

    scores = stationary_distribution(markov_matrix, normalized=False)
    return scores


def stationary_distribution(transition_matrix, normalized=True):
    n_1, n_2 = transition_matrix.shape
    if n_1 != n_2:
        raise ValueError('transition_matrix should be square')

    distribution = np.zeros(n_1)
    grouped_indices = connected_nodes(transition_matrix)

    # The chain may be reducible; solve for the stationary distribution of
    # each connected component independently.
    for group in grouped_indices:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix)
        distribution[group] = eigenvector

    if normalized:
        distribution /= n_1

    return distribution


def connected_nodes(matrix):
    # Group node indices by connected component of the (symmetric) graph.
    _, labels = connected_components(matrix)
    groups = []
    for tag in np.unique(labels):
        group = np.where(labels == tag)[0]
        groups.append(group)

    return groups
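

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module. Assumes the
    # required nltk data (e.g. 'punkt' and 'stopwords') are already
    # downloaded, and that `documents` is a list of documents, each given as
    # a list of sentence strings; the toy corpus below is illustrative only.
    documents = [
        ['The cat sat on the mat.', 'The cat is fluffy.'],
        ['Dogs bark at strangers.', 'A dog chased the cat.'],
    ]
    lxr = Lexrank(documents)

    sentences = [
        'The cat sat on the mat.',
        'The cat is fluffy.',
        'A dog chased the cat.',
    ]
    # Returns the two sentences with the highest LexRank scores.
    print(lxr.get_summary(sentences, summary_size=2))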