|
a |
|
b/QueryExtraction/rake_test_query.py |
|
|
1 |
''' |
|
|
2 |
|
|
|
3 |
https://github.com/csurfer/rake-nltk |
|
|
4 |
https://towardsdatascience.com/extracting-keyphrases-from-text-rake-and-gensim-in-python-eefd0fad582f |
|
|
5 |
|
|
|
6 |
Paper: Automatic Keyword Extraction from Individual Documents |
|
|
7 |
The method is easily applied to new domains, operates well on multiple types of documents, and is efficient. |
|
|
8 |
Method is based on frequency. |
|
|
9 |
|
|
|
10 |
''' |
|
|
11 |
from rake_nltk import Rake |
|
|
12 |
|
|
|
13 |
# Uses stopwords for English from NLTK, and all punctuation characters by |
|
|
14 |
# default |
|
|
15 |
# Shared module-level Rake instance, built with the library defaults;
# rake_extract() calls its extraction and ranking methods.
r = Rake() |
|
|
16 |
|
|
|
17 |
|
|
|
18 |
|
|
|
19 |
# Extraction given the text. |
|
|
20 |
# r.extract_keywords_from_text(test_free_text) |
|
|
21 |
|
|
|
22 |
# Extraction given the list of strings where each string is a sentence. |
|
|
23 |
# r.extract_keywords_from_sentences(<list of sentences>) |
|
|
24 |
|
|
|
25 |
# To get keyword phrases ranked highest to lowest. |
|
|
26 |
# r.get_ranked_phrases() |
|
|
27 |
|
|
|
28 |
# To get keyword phrases ranked highest to lowest with scores. |
|
|
29 |
# print (r.get_ranked_phrases_with_scores()) |
|
|
30 |
|
|
|
31 |
|
|
|
32 |
def rake_extract(test_free_text, topk=30):
    """Extract key phrases from free text with the shared RAKE extractor.

    Args:
        test_free_text: The text handed to ``r.extract_keywords_from_text``.
        topk: Slice bound applied to the ranked phrase list (default 30).

    Returns:
        The first ``topk`` entries of ``r.get_ranked_phrases()``.
    """
    # Extraction results are held on the module-level ``r`` instance, so the
    # ranked phrases are read back from it right after extraction.
    r.extract_keywords_from_text(test_free_text)
    ranked_phrases = r.get_ranked_phrases()
    return ranked_phrases[:topk]
|
|
37 |
|
|
|
38 |
|