|
a |
|
b/QueryExtraction/yake_test_query.py |
|
|
1 |
''' |
|
|
2 |
YAKE: https://github.com/LIAAD/yake |
|
|
3 |
|
|
|
4 |
YAKE! is a light-weight unsupervised automatic keyword extraction method which rests on text statistical features extracted from single documents to select the most important keywords of a text. |
|
|
5 |
We compare it against ten state-of-the-art unsupervised approaches (TF.IDF, KP-Miner, RAKE, TextRank, SingleRank, ExpandRank, TopicRank, TopicalPageRank, PositionRank and MultipartiteRank), and one supervised method (KEA). |
|
|
6 |
''' |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
import yake |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
# Settings handed to yake.KeywordExtractor by yake_extract() below.
# The names (including the spelling of `deduplication_thresold`) are kept
# as-is because other code may read them from this module.

# Passed as `lan`.
language = "en"

# Passed as `n`.
max_ngram_size = 3

# Passed as `dedupLim`.
deduplication_thresold = 0.9

# Passed as `dedupFunc`.
deduplication_algo = "seqm"

# Passed as `windowsSize`.
windowSize = 1
|
|
17 |
|
|
|
18 |
|
|
|
19 |
def yake_extract(text, topk=30):
    """Run YAKE on ``text`` and return the extracted keyword entries.

    A fresh ``yake.KeywordExtractor`` is built on every call from the
    module-level settings, with ``top`` set to ``topk``.

    Args:
        text: Document handed to ``extract_keywords``.
        topk: Forwarded to the extractor as its ``top`` argument.

    Returns:
        A list containing element ``0`` of each item returned by
        ``extract_keywords``, in the order the extractor produced them.
    """
    extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_thresold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=topk,
        features=None,
    )
    # NOTE(review): element 0 is presumably the keyword string of a
    # (keyword, score) pair -- confirm against the installed yake version.
    return [candidate[0] for candidate in extractor.extract_keywords(text)]
|
|
29 |
|