Diff of /ehrkit/solr_lib.py [000000] .. [2d4573]

Switch to unified view

a b/ehrkit/solr_lib.py
1
import requests
2
import logging
3
logger = logging.getLogger(__name__)
4
from nltk.stem import WordNetLemmatizer
5
lemmatizer = WordNetLemmatizer()
6
7
# Translation table used by solr_escape, built once at import time.
# Characters mapped to '' are removed from the query; the others are
# prefixed with a backslash so Solr treats them as literal text.
# Full list of Solr special characters, only some of which are handled here:
#   + - && || ! ( ) { } [ ] ^ " ~ * ? : \ /
_SOLR_CHAR_TRANSLATION_TABLE = str.maketrans({
    '[': '',
    ']': '',
    '{': '',
    '}': '',
    '^': '',
    '~': '',
    '*': '',
    '\\': '',
    '/': '',
    '"': '',
    # Explicit double backslashes: '\!' and friends are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    '!': '\\!',
    '?': '\\?',
    ':': '\\:',
})


def solr_escape(query: str) -> str:
    """Sanitize a raw user query for use in a Solr query string.

    The characters ``[ ] { } ^ ~ * \\ / "`` are removed, and ``! ? :`` are
    escaped with a leading backslash. The remaining Solr operators
    (``+ - && || ( )``) are passed through unchanged.

    Args:
        query: Raw query text.

    Returns:
        The query with the characters above removed or escaped.
    """
    return query.translate(_SOLR_CHAR_TRANSLATION_TABLE)
26
27
def get_solr_response_generic(solr_formatted_query: str, solr_core_name: str, timeout: float = 30.0):
    """Run a query against a Solr core and return matching ids with scores.

    Sends a ``select`` request for at most 100 rows, asking Solr for the
    ``id`` and ``score`` fields only.

    Args:
        solr_formatted_query: Query string already in Solr syntax.
        solr_core_name: Name of the Solr core to query.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        A ``(ids, id_to_score)`` tuple: ``ids`` is the list of document ids
        (as ``int``) in the order Solr returned them, and ``id_to_score``
        maps each id to its Solr score (as ``float``).

    Raises:
        requests.HTTPError: If Solr answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(
        url='http://tangra.cs.yale.edu:8983/solr/{}/select?'.format(solr_core_name),
        params={
            'indent': 'on',
            'q': solr_formatted_query,
            'rows': '100',
            'wt': 'json',
            'fl': 'id, score',
        },
        # Without a timeout requests can block forever on an unresponsive host.
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on the missing
    # 'response' key when Solr rejects the query.
    response.raise_for_status()
    rows = response.json()['response']['docs']

    solr_matched_ids = []
    id_to_score_map = {}
    for row in rows:
        item_id = int(row['id'])
        solr_matched_ids.append(item_id)
        id_to_score_map[item_id] = float(row['score'])

    return solr_matched_ids, id_to_score_map
50
def get_solr_response_mimic(raw_query):
    """Search the 'ehr_abbs_mimic' Solr core for a free-text query.

    The raw query is escaped, split on whitespace, and each word is
    lemmatized and wrapped in double quotes. All words must match (AND) in
    at least one of the fields ``abbreviations``, ``abbreviations_sent_id``
    or ``doctext_not_stored``. A query with no words matches everything.

    Returns whatever ``get_solr_response_generic`` returns for that query.
    """
    quoted_terms = []
    for token in solr_escape(raw_query).split():
        quoted_terms.append('"' + lemmatizer.lemmatize(token) + '"')

    if not quoted_terms:
        # Nothing left to search for: fall back to the match-all query.
        solr_formatted_query = '*:*'
    else:
        conjunction = ' AND '.join(quoted_terms)
        field_template = 'abbreviations:({}) OR abbreviations_sent_id:({}) OR doctext_not_stored:({})'
        solr_formatted_query = field_template.format(conjunction, conjunction, conjunction)

    return get_solr_response_generic(solr_formatted_query, 'ehr_abbs_mimic')
61
def get_solr_response_umn_wrap(raw_query):
    """Search the 'ehr_abbsense_umn' Solr core for a free-text query.

    The raw query is escaped, split on whitespace, and each word is
    lemmatized and wrapped in double quotes. All words must match (AND) in
    the ``short_form`` field. A query with no words matches everything.

    Returns whatever ``get_solr_response_umn`` returns for that query.
    """
    quoted_terms = []
    for token in solr_escape(raw_query).split():
        quoted_terms.append('"' + lemmatizer.lemmatize(token) + '"')

    if not quoted_terms:
        # Nothing left to search for: fall back to the match-all query.
        solr_formatted_query = '*:*'
    else:
        conjunction = ' AND '.join(quoted_terms)
        solr_formatted_query = 'short_form:({})'.format(conjunction)

    return get_solr_response_umn(solr_formatted_query, 'ehr_abbsense_umn')
72
73
def get_solr_response_umn(solr_formatted_query: str, solr_core_name: str, timeout: float = 30.0):
    """Run a query against a Solr core and return matching long forms with scores.

    Sends a ``select`` request for at most 100 rows, asking Solr for the
    ``id``, ``long_form`` and ``score`` fields.

    Args:
        solr_formatted_query: Query string already in Solr syntax.
        solr_core_name: Name of the Solr core to query.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        A ``(long_forms, long_form_to_score)`` tuple: ``long_forms`` is the
        list of ``long_form`` values in the order Solr returned them
        (duplicates kept), and ``long_form_to_score`` maps each long form to
        its Solr score (as ``float``); for a repeated long form the score of
        the last matching row wins.

    Raises:
        requests.HTTPError: If Solr answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(
        url='http://tangra.cs.yale.edu:8983/solr/{}/select?'.format(solr_core_name),
        params={
            'indent': 'on',
            'q': solr_formatted_query,
            'rows': '100',
            'wt': 'json',
            'fl': 'id, long_form, score',
        },
        # Without a timeout requests can block forever on an unresponsive host.
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of a KeyError on the missing
    # 'response' key when Solr rejects the query.
    response.raise_for_status()
    rows = response.json()['response']['docs']

    solr_matched_longforms = []
    long_form_to_score_map = {}
    for row in rows:
        item = row['long_form']
        solr_matched_longforms.append(item)
        long_form_to_score_map[item] = float(row['score'])

    return solr_matched_longforms, long_form_to_score_map