# QueryExtraction/keybert_test_query.py
'''
Keyword extraction with KeyBERT: https://github.com/MaartenGr/KeyBERT

Other model choices: xlm-r-distilroberta-base-paraphrase-v1

Cite:
@misc{grootendorst2020keybert,
  author       = {Maarten Grootendorst},
  title        = {KeyBERT: Minimal keyword extraction with BERT.},
  year         = 2020,
  publisher    = {Zenodo},
  version      = {v0.1.3},
  doi          = {10.5281/zenodo.4461265},
  url          = {https://doi.org/10.5281/zenodo.4461265}
}
'''
18
19
from keybert import KeyBERT
20
21
22
def keybert_extract(doc, topk=30):
    '''
    Extract the highest-scoring keyphrases (1-grams and 2-grams) from a document.

    :param doc: text to extract keyphrases from. The ranking below assumes
        KeyBERT returns a flat list of (keyphrase, score) pairs, i.e. that
        ``doc`` is a single string rather than a list of documents.
    :param topk: maximum number of keyphrases to return (default 30).
    :return: list of keyphrase strings ordered by descending score; an empty
        list when ``doc`` is empty or ``topk`` is not positive.
    '''
    # Nothing to extract: return early and skip the expensive model load.
    # This also stops a negative topk from slicing from the end of the list.
    if not doc or topk <= 0:
        return []

    # Building the model is slow, so create it once and reuse it across calls.
    model = getattr(keybert_extract, '_model', None)
    if model is None:
        model = KeyBERT('distilbert-base-nli-mean-tokens')
        keybert_extract._model = model

    # Keep the original candidate pool of 100 so results for topk <= 100 are
    # unchanged, but widen it when the caller asks for more than 100 phrases
    # (previously such requests were silently capped at 100).
    pool_size = max(100, topk)
    results = model.extract_keywords(doc, keyphrase_ngram_range=(1, 2),
                                     top_n=pool_size,
                                     use_mmr=True, diversity=0.7,
                                     stop_words='english')

    # Re-rank by score: the code does not rely on the order KeyBERT returns.
    ranked = sorted(results, key=lambda item: item[1], reverse=True)
    return [phrase for phrase, _score in ranked[:topk]]