|
a |
|
b/ehrkit/solr_lib.py |
|
|
1 |
import requests |
|
|
2 |
import logging |
|
|
3 |
logger = logging.getLogger(__name__) |
|
|
4 |
from nltk.stem import WordNetLemmatizer |
|
|
5 |
lemmatizer = WordNetLemmatizer() |
|
|
6 |
|
|
|
7 |
# Solr treats the following characters as query syntax:
#   + - && || ! ( ) { } [ ] ^ " ~ * ? : \ /
# Only some of them are handled here: the ones mapped to '' are dropped from
# the query, while '!', '?' and ':' are kept but prefixed with a backslash.
# The table is built once at import time instead of on every call.
_SOLR_CHAR_TRANSLATION_TABLE = str.maketrans({
    '[': '',
    ']': '',
    '{': '',
    '}': '',
    '^': '',
    '~': '',
    '*': '',
    '\\': '',
    '/': '',
    '"': '',
    # Explicit double backslashes: '\!' / '\?' / '\:' are invalid escape
    # sequences and trigger a warning on modern Python versions.
    '!': '\\!',
    '?': '\\?',
    ':': '\\:',
})


def solr_escape(query: str) -> str:
    """Return *query* with a subset of Solr special characters neutralised.

    The characters ``[ ] { } ^ ~ * \\ / "`` are removed, and ``! ? :`` are
    backslash-escaped. Every other character (including ``+ - ( ) & |``)
    is passed through unchanged.

    Args:
        query: Raw, user-supplied query text.

    Returns:
        The translated query string.
    """
    return query.translate(_SOLR_CHAR_TRANSLATION_TABLE)
|
|
26 |
|
|
|
27 |
def get_solr_response_generic(solr_formatted_query: str, solr_core_name: str,
                              timeout: float = 30.0):
    """Query a Solr core and return the matched document ids with their scores.

    Sends a ``select`` request for at most 100 rows, asking only for the
    ``id`` and ``score`` fields of each document.

    Args:
        solr_formatted_query: Query string already written in Solr syntax.
        solr_core_name: Name of the Solr core to query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            server.

    Returns:
        A ``(solr_matched_ids, id_to_score_map)`` tuple: the integer ids in
        the order Solr returned them, and a dict mapping each id to its
        float score.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.get(
        url='http://tangra.cs.yale.edu:8983/solr/{}/select?'.format(solr_core_name),
        params={
            'indent': 'on',
            'q': solr_formatted_query,
            'rows': '100',
            'wt': 'json',
            'fl': 'id, score',
        },
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of tripping over a missing
    # 'response' key (or undecodable body) further down.
    response.raise_for_status()
    rows = response.json()['response']['docs']

    solr_matched_ids = [int(row['id']) for row in rows]
    id_to_score_map = {int(row['id']): float(row['score']) for row in rows}
    return solr_matched_ids, id_to_score_map
|
|
50 |
def get_solr_response_mimic(raw_query):
    """Search the ``ehr_abbs_mimic`` core for documents matching *raw_query*.

    The raw query is passed through ``solr_escape``, split on whitespace, and
    each word is lemmatized with the module-level lemmatizer and wrapped in
    double quotes. All words must match (they are joined with ``AND``) in at
    least one of the ``abbreviations``, ``abbreviations_sent_id`` or
    ``doctext_not_stored`` fields. A query with no words matches everything.

    Returns whatever ``get_solr_response_generic`` returns for that query.
    """
    quoted_terms = [
        '"{}"'.format(lemmatizer.lemmatize(token))
        for token in solr_escape(raw_query).split()
    ]

    # Nothing left to search for: fall back to the match-all query.
    if not quoted_terms:
        return get_solr_response_generic('*:*', 'ehr_abbs_mimic')

    all_terms = ' AND '.join(quoted_terms)
    field_template = (
        'abbreviations:({}) OR abbreviations_sent_id:({}) OR doctext_not_stored:({})'
    )
    return get_solr_response_generic(
        field_template.format(all_terms, all_terms, all_terms),
        'ehr_abbs_mimic',
    )
|
|
61 |
def get_solr_response_umn_wrap(raw_query):
    """Search the ``ehr_abbsense_umn`` core by short form for *raw_query*.

    The raw query is passed through ``solr_escape``, split on whitespace, and
    each word is lemmatized with the module-level lemmatizer and wrapped in
    double quotes. All words must match (they are joined with ``AND``) in the
    ``short_form`` field. A query with no words matches everything.

    Returns whatever ``get_solr_response_umn`` returns for that query.
    """
    quoted_terms = [
        '"{}"'.format(lemmatizer.lemmatize(token))
        for token in solr_escape(raw_query).split()
    ]

    # Nothing left to search for: fall back to the match-all query.
    if not quoted_terms:
        return get_solr_response_umn('*:*', 'ehr_abbsense_umn')

    all_terms = ' AND '.join(quoted_terms)
    return get_solr_response_umn(
        'short_form:({})'.format(all_terms),
        'ehr_abbsense_umn',
    )
|
|
72 |
|
|
|
73 |
def get_solr_response_umn(solr_formatted_query: str, solr_core_name: str,
                          timeout: float = 30.0):
    """Query a Solr core and return the matched long forms with their scores.

    Sends a ``select`` request for at most 100 rows, asking for the ``id``,
    ``long_form`` and ``score`` fields of each document.

    Args:
        solr_formatted_query: Query string already written in Solr syntax.
        solr_core_name: Name of the Solr core to query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            server.

    Returns:
        A ``(solr_matched_longforms, long_form_to_score_map)`` tuple: the
        ``long_form`` values in the order Solr returned them, and a dict
        mapping each long form to its float score. When several documents
        share a long form, the score of the last one wins in the dict.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.get(
        url='http://tangra.cs.yale.edu:8983/solr/{}/select?'.format(solr_core_name),
        params={
            'indent': 'on',
            'q': solr_formatted_query,
            'rows': '100',
            'wt': 'json',
            'fl': 'id, long_form, score',
        },
        timeout=timeout,
    )
    # Fail loudly on an HTTP error instead of tripping over a missing
    # 'response' key (or undecodable body) further down.
    response.raise_for_status()
    rows = response.json()['response']['docs']

    # NOTE(review): long_form is used as a dict key, so this assumes the field
    # is single-valued (hashable) in the core schema - confirm.
    solr_matched_longforms = [row['long_form'] for row in rows]
    long_form_to_score_map = {
        row['long_form']: float(row['score']) for row in rows
    }
    return solr_matched_longforms, long_form_to_score_map