load_parse.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Functions to extract knowledge from medical text

import json
import os
import py2neo
import csv
import subprocess
import urllib2
import requests
import unicodecsv as csv2
import pandas as pd
from nltk.tokenize import sent_tokenize
# MetaMap wrapper class used by mmap_extract below (assumed to come from pymetamap)
from pymetamap import MetaMap
from config import settings

def mmap_extract(text):
    """
    Function-wrapper for the MetaMap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
         WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - concepts: list,
        list of MetaMap concepts extracted
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)),
                                           word_sense_disambiguation=True)
    if errors:
        print 'Errors with extracting concepts!'
        print errors
    return concepts

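# Example usage (a minimal sketch; assumes the MetaMap tagger/WSD server is running and
# settings['load']['path']['metamap'] points to the MetaMap binary; values are illustrative):
#   concepts = mmap_extract('Aspirin is used to treat headaches.')
#   concepts[0].preferred_name   # e.g. 'Aspirin'
#   concepts[0].cui              # e.g. 'C0004057'
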
def runProcess(exe, working_dir):
    """
    Function that opens a command line and runs a command.
    Captures the output and returns it.
    Input:
        - exe: str,
        string of the command to be run. ! REMEMBER TO ESCAPE CHARS !
        - working_dir: str,
        directory where the cmd should be executed
    Output:
        - lines: list,
        list of strings generated from the command
    """

    p = subprocess.Popen(exe, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                         cwd=working_dir, shell=True)
    lines = p.stdout.readlines()
    return lines

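# Example usage (sketch; any shell command works, each returned line keeps its trailing '\n'):
#   lines = runProcess('ls -1', '/tmp')
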
def stopw_removal(inp, stop):
    """
    Stopword removal in a line of text.
    Input:
        - inp: str,
        string of the text input
        - stop: list,
        list of stop-words to be removed
    Output:
        - final: str,
        the input string with the stop-words removed
    """

    # Final string to be returned
    final = ''
    for w in inp.lower().split():
        if w not in stop:
            final += w + ' '
    # Remove the last whitespace that was added
    final = final[:-1]
    return final

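# Example usage:
#   stopw_removal('The drug is used for pain', ['the', 'is', 'for'])   # -> 'drug used pain'
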
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for the ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' + " | ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from the ReVerb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from the relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total

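# Example usage (sketch; assumes the ReVerb binary lives in settings['load']['path']['reverb'];
# the output shown is illustrative, not actual ReVerb output):
#   rels = reverb_wrapper('Aspirin is used to treat headaches.')
#   # -> [['aspirin', 'is used to treat', 'headaches']]
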
def cui_to_uri(api_key, cui):
    """
    Function to map from cui to uri if possible. Uses the BioOntology portal.
    Input:
        - api_key: str,
        api usage key, change it in setting.yaml
        - cui: str,
        cui of the entity we wish to map to a uri
    Output:
        - the uri found, in string format, or None
    """

    REST_URL = "http://data.bioontology.org"
    annotations = get_json_with_api(api_key, REST_URL + "/search?include_properties=true&q=" + urllib2.quote(cui))
    try:
        return annotations['collection'][0]['@id']
    except Exception as e:
        print 'Error mapping cui to uri:'
        print e
        return None

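# Example usage (sketch; needs a valid BioPortal API key, here read from settings;
# the returned uri shape is illustrative):
#   uri = cui_to_uri(settings['apis']['biont'], 'C0004057')
#   # -> e.g. 'http://purl.bioontology.org/ontology/...' or None when no match is found
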
def get_json_with_api(api_key, url):
    """
    Helper function to retrieve a json from a url through urllib2.
    Input:
        - api_key: str,
        api usage key, change it in setting.yaml
        - url: str,
        url to curl
    Output:
        - json-style dictionary with the curl results
    """

    opener = urllib2.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + api_key)]
    return json.loads(opener.open(url).read())

def threshold_concepts(concepts, hard_num=3, score=None):
    """
    Threshold concepts from MetaMap to keep only the most probable ones.
    Currently supports thresholding on the first-N (hard_num) concepts or on
    the concept score.
    Input:
        - concepts: list,
        list of MetaMap Class concepts
        - hard_num: int,
        the first-N concepts to keep, if this thresholding is selected
        - score: float,
        lowest accepted concept score, if this thresholding is selected
    Output:
        - concepts: list,
        the thresholded list of concepts
    """

    if hard_num:
        if hard_num >= len(concepts):
            return concepts
        elif hard_num < len(concepts):
            return concepts[:hard_num]
    elif score:
        return [c for c in concepts if c.score > score]
    else:
        return concepts

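# Example usage (sketch; concepts is the list returned by mmap_extract):
#   top = threshold_concepts(concepts, hard_num=3)               # keep at most the first 3
#   top = threshold_concepts(concepts, hard_num=None, score=3.0) # keep concepts scoring above the threshold
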
def get_name_concept(concept):
    """
    Get the name from a MetaMap concept. Tries different variations and
    returns the name found.
    Input:
        - concept: MetaMap class concept, as generated from mmap_extract
        for example
    Output:
        - name: str,
        the name found for this concept
    """

    name = ''
    if hasattr(concept, 'preferred_name'):
        name = concept.preferred_name
    elif hasattr(concept, 'long_form') and hasattr(concept, 'short_form'):
        name = concept.long_form + '|' + concept.short_form
    elif hasattr(concept, 'long_form'):
        name = concept.long_form
    elif hasattr(concept, 'short_form'):
        name = concept.short_form
    else:
        name = 'NO NAME IN CONCEPT'
    return name

def metamap_ents(x):
    """
    Function to get entities in usable form.
    Extracts MetaMap concepts first, thresholds them and
    tries to extract names and uris for the concepts to make them
    more usable.
    Input:
        - x: str,
        sentence to extract entities from
    Output:
        - ents: list,
        list of entities found. Each entity is a dictionary with
        fields id (no. found in sentence), name if retrieved, cui if
        available and uri if found
    """

    # API KEY for the BioOntology mapping from cui to uri
    API_KEY = settings['apis']['biont']
    concepts = mmap_extract(x)
    concepts = threshold_concepts(concepts)
    ents = []
    for i, concept in enumerate(concepts):
        ent = {}
        ent['ent_id'] = i
        ent['name'] = get_name_concept(concept)
        if hasattr(concept, 'cui'):
            ent['cui'] = concept.cui
            ent['uri'] = cui_to_uri(API_KEY, ent['cui'])
        else:
            ent['cui'] = None
            ent['uri'] = None
        ents.append(ent)
    return ents

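# Example usage (sketch; needs a running MetaMap server and a BioPortal key in settings;
# the returned values are illustrative):
#   ents = metamap_ents('Aspirin is used to treat headaches.')
#   # -> [{'ent_id': 0, 'name': 'Aspirin', 'cui': 'C0004057', 'uri': '...'}, ...]
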
def extract_entities(text, json_=None):
    """
    Extract entities from a given text using MetaMap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both concepts and the entities related to them.
    Input:
        - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched.
        Defaults to an empty json_
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containing the final results
    """
    if json_ is None:
        json_ = {}
    json_['text'] = text
    # Tokenize the text into sentences
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    json_['concepts'] = mmap_extract(text)
    json_['entities'] = {}
    for i, sent in enumerate(json_['sents']):
        ents = metamap_ents(sent['sent_text'])
        json_['entities'][sent['sent_id']] = ents
    return json_

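# Example usage (sketch; combines MetaMap concept extraction with per-sentence entity mapping):
#   doc = extract_entities('Aspirin is used to treat headaches. It thins the blood.')
#   # doc contains the keys 'text', 'sents', 'concepts' and 'entities'
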
def enrich_with_triples(results, subject, pred='MENTIONED_IN'):
    """
    Enrich a json dictionary with rdf triples of the form:
    entity-URI -- MENTIONED_IN -- 'Text Title'. Only entities with
    uris are considered.
    Input:
        - results: dic,
        json-style dictionary generated from the extract_entities function
        - subject: str,
        the name of the text document in which the entities are mentioned
        - pred: str,
        the predicate to be used as a link between the uri and the title
    Output:
        - results: dic,
        the same dictionary with one more field, triples, holding the
        generated triples
    """
    triples = []
    for sent_key, ents in results['entities'].iteritems():
        for ent in ents:
            if ent['uri']:
                triples.append({'subj': ent['uri'], 'pred': pred, 'obj': subject})
    results['triples'] = triples
    return results

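# Example usage (sketch, continuing from extract_entities):
#   doc = enrich_with_triples(doc, subject='Text Title')
#   # doc['triples'] -> [{'subj': <entity uri>, 'pred': 'MENTIONED_IN', 'obj': 'Text Title'}, ...]
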
def semrep_wrapper(text):
    """
    Function-wrapper for the SemRep binary. It is called with the flag
    -F only; changing this will cause the parsing to fail, because
    the resulting lines won't have the same structure.
    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - results: dic,
        json-style dictionary with fields text and sents. Each
        sentence has entities and relations found in it. Each entity and
        each relation has attributes denoted in the corresponding
        mappings dictionary.
    """
    # Exec the binary
    cmd = 'echo "' + text + '" | ./semrep.v1.7 -L 2015 -Z 2015AA -F'
    semrep_dir = settings['load']['path']['semrep']
    lines = runProcess(cmd, semrep_dir)
    # Mapping of line elements to fields
    mappings = {
        "text": {
            "sent_id": 4,
            "sent_text": 6
        },
        "entity": {
            'cuid': 6,
            'label': 7,
            'sem_types': 8,
            'score': 15
        },
        "relation": {
            'subject__cui': 8,
            'subject__label': 9,
            'subject__sem_types': 10,
            'subject__sem_type': 11,
            'subject__score': 18,
            'predicate__type': 21,
            'predicate': 22,
            'negation': 23,
            'object__cui': 28,
            'object__label': 29,
            'object__sem_types': 30,
            'object__sem_type': 31,
            'object__score': 38,
        }
    }
    results = {'sents': [], 'text': text}
    for line in lines:
        # Only sentence-level (SE) lines are parsed
        if line.startswith('SE'):
            elements = line.split('|')
            # New sentence that was processed
            if elements[5] == 'text':
                tmp = {"entities": [], "relations": []}
                for key, ind in mappings['text'].iteritems():
                    tmp[key] = elements[ind]
                results['sents'].append(tmp)
            # A line containing entity info
            if elements[5] == 'entity':
                tmp = {}
                for key, ind in mappings['entity'].iteritems():
                    if key == 'sem_types':
                        tmp[key] = elements[ind].split(',')
                    else:
                        tmp[key] = elements[ind]
                results['sents'][-1]['entities'].append(tmp)
            # A line containing relation info
            if elements[5] == 'relation':
                tmp = {}
                for key, ind in mappings['relation'].iteritems():
                    if 'sem_types' in key:
                        tmp[key] = elements[ind].split(',')
                    else:
                        tmp[key] = elements[ind]
                results['sents'][-1]['relations'].append(tmp)
    return results

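# Example usage (sketch; assumes a local SemRep installation at settings['load']['path']['semrep']):
#   res = semrep_wrapper('Aspirin treats headaches.')
#   # res['sents'][0]['entities'] and res['sents'][0]['relations'] hold the parsed SemRep fields
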
if __name__ == '__main__':
    # Example driver: replace the sample text below with the document to be processed
    text = 'Aspirin is used to treat headaches.'
    results = extract_entities(text)
    results = enrich_with_triples(results, subject='Text Title')