|
a |
|
b/QueryExtraction/spacy_test_query.py |
|
|
1 |
''' |
|
|
2 |
test |
|
|
3 |
''' |
|
|
4 |
|
|
|
5 |
import json |
|
|
6 |
import spacy |
|
|
7 |
import pytextrank |
|
|
8 |
from collections import defaultdict |
|
|
9 |
|
|
|
10 |
|
|
|
11 |
|
|
|
12 |
|
|
|
13 |
# Load the small English spaCy model exactly once; the resulting pipeline is
# shared by every extraction call in this module.
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank as the last pipeline component so that processed documents
# carry the ranked phrases read later through `doc._.phrases`.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
|
|
20 |
|
|
|
21 |
|
|
|
22 |
# method |
|
|
23 |
def pytextrank_extract(free_text, topk=30):
    """Return the highest-scoring TextRank phrases found in ``free_text``.

    The text is run through the module-level ``nlp`` pipeline. Every phrase
    in ``doc._.phrases`` whose text is longer than five characters adds its
    ``rank`` to the score kept for that text (phrases sharing the same text
    are summed). The texts are then ordered by descending score.

    Args:
        free_text: Text to extract phrases from. ``None`` or an empty string
            yields an empty list without invoking the pipeline.
        topk: Number of leading entries of the ordered result to keep.

    Returns:
        A list of phrase strings, highest score first, truncated to the
        first ``topk`` entries.
    """
    # Nothing to analyse: skip the pipeline call entirely.
    if not free_text:
        return []

    doc = nlp(free_text)

    scores = defaultdict(float)
    for phrase in doc._.phrases:
        # Phrases of five characters or fewer are ignored.
        if len(phrase.text) > 5:
            scores[phrase.text] += phrase.rank

    # sorted() is stable, so equally scored phrases keep their first-seen order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [text for text, _score in ranked[:topk]]
|
|
41 |
|
|
|
42 |
|
|
|
43 |
# ordered_query_set = extract(test_free_text) |
|
|
44 |
|
|
|
45 |
|
|
|
46 |
|
|
|
47 |
# |
|
|
48 |
# # print out |