#!/usr/bin/env python
# -*- coding: utf-8 -*-


# Functions to extract knowledge from medical text: everything related to the
# extraction needed for the knowledge base. Also provides wrappers for SemRep,
# MetaMap and ReVerb, plus some enrichment routines that use UTS services.


import json
import subprocess
import urllib2
import pymongo
import numpy as np
from nltk.tokenize import sent_tokenize
from config import settings
from pymetamap import MetaMap
from utilities import time_log, get_concept_from_cui, get_concept_from_source
from itertools import product
from multiprocessing import cpu_count, Pool
from unidecode import unidecode


def metamap_wrapper(text):
    """
    Function-wrapper for the MetaMap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - a dictionary with keys 'sents' (a list of the concepts found
        per sentence) and 'sent_text' (the original text)
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load the MetaMap instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i + 1, 'entities': [], 'relations': []}
        # Concepts belonging to the current sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {'label': w_conc.preferred_name, 'cui': w_conc.cui,
                            'sem_types': w_conc.semtypes, 'score': w_conc.score}
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text': text}


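# Illustrative usage sketch (not part of the original module). Assumes NLTK's
# 'punkt' models are installed and the MetaMap tagger + WSD server are already
# running, as noted in the docstring above:
#
#   res = metamap_wrapper('Aspirin is used to treat headaches.')
#   for sent in res['sents']:
#       for ent in sent['entities']:
#           print ent['cui'], ent['label'], ent['sem_types'], ent['score']

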
def runProcess(exe, working_dir):
    """
    Function that opens a command line and runs a command.
    Captures the output and returns it.
    Input:
        - exe: str,
        string of the command to be run. ! REMEMBER TO ESCAPE CHARS !
        - working_dir: str,
        directory where the cmd should be executed
    Output:
        - lines: list,
        list of strings generated by the command
    """

    p = subprocess.Popen(exe, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT, cwd=working_dir, shell=True)
    lines = p.stdout.readlines()
    return lines


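# Minimal example of how runProcess is used throughout this module
# (hypothetical command, for illustration only):
#
#   lines = runProcess('ls -1', '/tmp')
#   for line in lines:
#       print line.strip('\n')

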
def stopw_removal(inp, stop):
    """
    Stopword removal on a line of text.
    Input:
        - inp: str,
        string of the text input
        - stop: list,
        list of stop-words to be removed
    Output:
        - final: str,
        the input text with the stop-words removed
    """

    # Final string to be returned
    final = ''
    for w in inp.lower().split():
        if w not in stop:
            final += w + ' '
    # Remove the last whitespace that was added
    final = final[:-1]
    return final


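# Illustrative example (hypothetical stop-list):
#
#   stopw_removal('The drug is used for the treatment of pain',
#                 ['the', 'is', 'of', 'for'])
#   -> 'drug used treatment pain'

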
def create_text_batches(text, N=5000, buffer_=100):
    """
    Function that takes a long string and splits it into
    batches of approximately length N. The actual length
    of each batch differs, as each batch ends at the first
    dot found in the string after the first N characters.
    Input:
        - text: str,
        piece of text to split
        - N: int,
        approximate length of each batch in characters
        - buffer_: int,
        number of characters after N in which to look for a dot
    Output:
        - chunks: list,
        list containing the string parts
    """
    M = len(text)
    chunks_num = M // N
    if M % N != 0:
        chunks_num += 1
    chunks = []
    end_ind = 0
    start_ind = 0
    i = 0
    while i < chunks_num:
        start_ind = end_ind
        prob_text = text[start_ind + N: start_ind + N + buffer_]
        if '.' in prob_text:
            end_ind = start_ind + N + prob_text.index('.') + 1
        else:
            end_ind = start_ind + N
        chunks.append(text[start_ind:end_ind])
        i += 1
    chunks = [ch for ch in chunks if ch]
    return chunks


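# Sketch of the batching behaviour: with small N and a generous buffer_, each
# chunk is extended up to the first dot found after the N-th character, so
# sentence boundaries are (approximately) respected:
#
#   create_text_batches('First sentence. Second sentence. Third one.', N=10, buffer_=20)
#   -> ['First sentence.', ' Second sentence.', ' Third one.']

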
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for the ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' + " | ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from the reverb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total


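# Illustrative usage sketch (assumes the ReVerb binary is available under
# settings['load']['path']['reverb']):
#
#   rels = reverb_wrapper('Aspirin inhibits platelet aggregation.')
#   # rels is a list of [subject, predicate, object] string triples,
#   # one per extracted relation

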
def cui_to_uri(api_key, cui):
    """
    Function to map a CUI to a URI, if possible. Uses the BioPortal
    (bioontology) search service.
    Input:
        - api_key: str,
        API usage key. Change it in settings.yaml
        - cui: str,
        CUI of the entity whose URI we wish to find
    Output:
        - the URI found, as a string, or None
    """

    REST_URL = "http://data.bioontology.org"
    annotations = get_json_with_api(api_key, REST_URL + "/search?include_properties=true&q=" + urllib2.quote(cui))
    try:
        return annotations['collection'][0]['@id']
    except Exception, e:
        time_log(Exception)
        time_log(e)
        return None

def get_json_with_api(api_key, url):
    """
    Helper function to retrieve a json from a url through urllib2.
    Input:
        - api_key: str,
        API usage key. Change it in settings.yaml
        - url: str,
        url to curl
    Output:
        - json-style dictionary with the curl results
    """

    opener = urllib2.build_opener()
    opener.addheaders = [('Authorization', 'apikey token=' + api_key)]
    return json.loads(opener.open(url).read())


def threshold_concepts(concepts, hard_num=3, score=None):
    """
    Thresholding of the concepts returned by MetaMap, to keep only the most
    probable ones. Currently supports thresholding on the first N concepts
    (hard_num) or on the concept score.
    Input:
        - concepts: list,
        list of MetaMap concept objects
        - hard_num: int,
        the first N concepts to keep, if this thresholding is selected
        - score: float,
        lowest accepted concept score, if this thresholding is selected
    """

    if hard_num:
        if hard_num >= len(concepts):
            return concepts
        elif hard_num < len(concepts):
            return concepts[:hard_num]
    elif score:
        return [c for c in concepts if c.score > score]
    else:
        return concepts


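# Illustrative behaviour: with the default hard_num=3 only the first three
# concepts are kept; passing hard_num=None together with a score threshold
# keeps only the concepts scoring above it instead:
#
#   top = threshold_concepts(concepts)                                # first 3
#   confident = threshold_concepts(concepts, hard_num=None, score=3.5)

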
def get_name_concept(concept):
    """
    Get the name from a MetaMap concept. Tries different variations and
    returns the name found.
    Input:
        - concept: MetaMap concept, as generated from mmap_extract
        for example
    Output:
        - name: str,
        the name found for this concept
    """

    name = ''
    if hasattr(concept, 'preferred_name'):
        name = concept.preferred_name
    elif hasattr(concept, 'long_form') and hasattr(concept, 'short_form'):
        name = concept.long_form + '|' + concept.short_form
    elif hasattr(concept, 'long_form'):
        name = concept.long_form
    elif hasattr(concept, 'short_form'):
        name = concept.short_form
    else:
        name = 'NO NAME IN CONCEPT'
    return name


def metamap_ents(x):
    """
    Function to get entities in a usable form.
    Extracts MetaMap concepts first, thresholds them and
    tries to extract names and URIs for the concepts, to make
    them more usable.
    Input:
        - x: str,
        sentence to extract entities from
    Output:
        - ents: list,
        list of entities found. Each entity is a dictionary with
        fields id (no. found in sentence), name if retrieved, cui if
        available and uri if found
    """

    # API KEY for the bioontology mapping from CUI to URI
    API_KEY = settings['apis']['biont']
    concepts = mmap_extract(x)
    concepts = threshold_concepts(concepts)
    ents = []
    for i, concept in enumerate(concepts):
        ent = {}
        ent['ent_id'] = i
        ent['name'] = get_name_concept(concept)
        if hasattr(concept, 'cui'):
            ent['cui'] = concept.cui
            ent['uri'] = cui_to_uri(API_KEY, ent['cui'])
        else:
            ent['cui'] = None
            ent['uri'] = None
        ents.append(ent)
    return ents


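# Sketch of the returned structure, as built in the loop above (the concrete
# values shown are hypothetical):
#
#   metamap_ents('Aspirin is used to treat headaches.')
#   -> [{'ent_id': 0, 'name': 'Aspirin', 'cui': 'C...', 'uri': 'http://...'},
#       ...]

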
def extract_entities(text, json_={}):
    """
    Extract entities from a given text using MetaMap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both the concepts and the entities related to them.
    Input:
        - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched
        Defaults to an empty json_
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containing the final results
    """
    json_['text'] = text
    # Tokenize the text into sentences
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    json_['concepts'], _ = mmap_extract(text)
    json_['entities'] = {}
    for i, sent in enumerate(json_['sents']):
        # metamap_ents expects the raw sentence text
        ents = metamap_ents(sent['sent_text'])
        json_['entities'][sent['sent_id']] = ents
    return json_

def extract_metamap(json_, key):
    """
    Task function to parse and extract concepts from a json_-style dic, using
    the MetaMap binary.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    # outerfield for the documents in json
    docfield = settings['out']['json']['itemfield']
    # textfield to read text from
    textfield = settings['out']['json']['json_text_field']
    N = len(json_[docfield])
    for i, doc in enumerate(json_[docfield]):
        text = clean_text(doc[textfield])
        if len(text) > 5000:
            # Break long texts into batches and merge the per-chunk results
            chunks = create_text_batches(text)
            results = {'text': text, 'sents': []}
            sent_id = 0
            for chunk in chunks:
                tmp = metamap_wrapper(chunk)
                for sent in tmp['sents']:
                    sent['sent_id'] = sent_id
                    sent_id += 1
                    results['sents'].append(sent)
        else:
            results = metamap_wrapper(text)
        json_[docfield][i].update(results)
        proc = int(i / float(N) * 100)
        if proc % 10 == 0 and proc > 0:
            time_log('We are at %d/%d documents -- %0.2f %%' % (i, N, proc))
    return json_


def enrich_with_triples(results, subject, pred='MENTIONED_IN'):
    """
    Enrich a json dictionary with rdf triples of the form:
    entity URI -- MENTIONED_IN -- text title. Only entities with
    URIs are considered.
    Input:
        - results: dic,
        json-style dictionary generated from the extract_entities function
        - subject: str,
        the name of the text document in which the entities are mentioned
        - pred: str,
        the predicate to be used as a link between the uri and the title
    Output:
        - results: dic,
        the same dictionary with one more field, 'triples', holding the
        generated triples
    """
    triples = []
    for sent_key, ents in results['entities'].iteritems():
        for ent in ents:
            if ent['uri']:
                triples.append({'subj': ent['uri'], 'pred': pred, 'obj': subject})
    results['triples'] = triples
    return results

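# Example of the added field (the URI shown is hypothetical):
#
#   results = enrich_with_triples(results, subject='Article Title')
#   # results['triples'] -> [{'subj': 'http://purl.bioontology.org/...',
#   #                         'pred': 'MENTIONED_IN',
#   #                         'obj': 'Article Title'}, ...]
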
def force_to_unicode(text):
    """If text is unicode, it is returned as is. If it's str, it is decoded to unicode using UTF-8."""
    return text if isinstance(text, unicode) else text.decode('utf8', 'ignore')


def toAscii_wrapper(text):
    """
    Function wrapper for the Lexical Tool toAscii:
    https://lexsrv3.nlm.nih.gov/LexSysGroup/Projects/lvg/current/docs/userDoc/tools/toAscii.html
    Converts the input to ascii, ready for SemRep.
    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - text: str,
        the same text converted to ascii
    """
    text = clean_text(text)
    # text = repr(text)
    cmd = 'echo "' + text + '" | ./toAscii'
    toAscii_dir = settings['load']['path']['toAscii']
    lines = runProcess(cmd, toAscii_dir)
    return lines[0]

def semrep_wrapper(text):
    """
    Function wrapper for the SemRep binary. It is called with the -F flag
    only; changing this will cause the parsing to fail, because the
    resulting lines won't have the same structure.
    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - results: dic,
        json-style dictionary with fields text and sents. Each
        sentence has entities and relations found in it. Each entity and
        each relation has attributes denoted in the corresponding
        mappings dictionary.
    """
    # Exec the binary
    # THIS SHOULD FIX ENCODING PROBLEMS???
    text = clean_text(text)
    utf8 = force_to_unicode(text)
    text = unidecode(utf8)
    # text = toAscii_wrapper(text)
    # THIS IS NEEDED FOR ANY ARTIFACTS!
    text = repr(text)
    cmd = "echo " + text + " | ./semrep.v1.7 -L 2015 -Z 2015AA -F"
    # print cmd
    semrep_dir = settings['load']['path']['semrep']
    lines = runProcess(cmd, semrep_dir)
    # print(lines)
    # Mapping of line elements to fields
    mappings = {
        "text": {
            "sent_id": 4,
            "sent_text": 6
        },
        "entity": {
            'cuid': 6,
            'label': 7,
            'sem_types': 8,
            'score': 15
        },
        "relation": {
            'subject__cui': 8,
            'subject__label': 9,
            'subject__sem_types': 10,
            'subject__sem_type': 11,
            'subject__score': 18,
            'predicate__type': 21,
            'predicate': 22,
            'negation': 23,
            'object__cui': 28,
            'object__label': 29,
            'object__sem_types': 30,
            'object__sem_type': 31,
            'object__score': 38,
        }
    }
    results = {'sents': [], 'text': text}
    for line in lines:
        # If it is a sentence-level ('SE') output line
        if line.startswith('SE'):
            ##### DEPRECATED, AS IN clean_text WE REMOVE TABS FROM THE TEXT #####
            # Temporary workaround to read the |-delimited semrep output
            # without mixing up tabs contained in the text
            # line = line.replace('\|', '!@#$')
            # elements = line.split('|')
            # elements = [el.replace('!@#$', '\|') for el in elements]
            #############################  DEPRECATED ##########################
            elements = line.split('|')
            # New sentence that was processed
            if elements[5] == 'text':
                tmp = {"entities": [], "relations": []}
                for key, ind in mappings['text'].iteritems():
                    tmp[key] = elements[ind]
                results['sents'].append(tmp)
            # A line containing entity info
            if elements[5] == 'entity':
                tmp = {}
                for key, ind in mappings['entity'].iteritems():
                    if key == 'sem_types':
                        tmp[key] = elements[ind].split(',')
                    else:
                        tmp[key] = elements[ind]
                results['sents'][-1]['entities'].append(tmp)
            # A line containing relation info
            if elements[5] == 'relation':
                tmp = {}
                for key, ind in mappings['relation'].iteritems():
                    if 'sem_types' in key:
                        tmp[key] = elements[ind].split(',')
                    else:
                        tmp[key] = elements[ind]
                results['sents'][-1]['relations'].append(tmp)
    return results


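# Illustrative shape of the returned dictionary (field names follow the
# mappings above; the values are hypothetical):
#
#   res = semrep_wrapper('Aspirin inhibits platelet aggregation.')
#   # res['sents'][0]['entities']  -> [{'cuid': ..., 'label': ...,
#   #                                   'sem_types': [...], 'score': ...}, ...]
#   # res['sents'][0]['relations'] -> [{'subject__cui': ..., 'predicate': ...,
#   #                                   'object__cui': ..., ...}, ...]

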
def clean_text(text):
    """
    Replace characters that are unsafe for the command line call of SemRep.
    This could be updated in the future with more sophisticated transformations.
    Input:
        - text: str,
        piece of text to clean
    Output:
        - text: str,
        the same text with parentheses, quotes, tabs and other cmd-unsafe
        characters replaced by spaces, and whitespace normalized
    """
    replace_chars = [('(', ' '), (')', ' '), ("'", ' '), ('\n', ' '), ('\t', ' '), (';', ' '),
                     ('}', ' '), ('{', ' '), ('|', ' '), ('&', ' '), ('/', ' ')]
    for unw_pair in replace_chars:
        text = text.replace(unw_pair[0], unw_pair[1])
    text = ' '.join(text.split())
    return text


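# Illustrative example of the character replacement and whitespace squeezing:
#
#   clean_text("Aspirin (100mg) reduces\tfever; it's cheap")
#   -> 'Aspirin 100mg reduces fever it s cheap'

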
def extract_semrep(json_, key):
    """
    Task function to parse and extract concepts from a json_-style dic, using
    the SemRep binary.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    if key == 'mongo':
        key = 'json'
    # outerfield for the documents in json
    docfield = settings['out']['json']['itemfield']
    # textfield to read text from
    textfield = settings['out']['json']['json_text_field']
    N = len(json_[docfield])
    for i, doc in enumerate(json_[docfield]):
        print doc['id']
        text = doc[textfield]
        if len(text) > 5000:
            # Break long texts into batches and merge the per-chunk results
            chunks = create_text_batches(text)
            results = {'text': text, 'sents': []}
            sent_id = 0
            for chunk in chunks:
                tmp = semrep_wrapper(chunk)
                for sent in tmp['sents']:
                    sent['sent_id'] = sent_id
                    sent_id += 1
                    results['sents'].append(sent)
        else:
            results = semrep_wrapper(text)
        json_[docfield][i].update(results)
        proc = int(i / float(N) * 100)
        if proc % 10 == 0 and proc > 0:
            time_log('We are at %d/%d documents -- %0.2f %%' % (i, N, proc))
    return json_


def extract_semrep_parallel(json_, key):
    """
    Task function to parse and extract concepts from a json_-style dic, using
    the SemRep binary. It uses multiprocessing for efficiency.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_ : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    # outerfield for the documents in json
    docfield = settings['out']['json']['itemfield']
    N = len(json_[docfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[docfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the collection into batches of: %s documents!' % len_col)
    batches = [{docfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(semrep_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    # Merge the per-batch results back into the original collection, by id
    tmp = {docfield: []}
    for batch_res in res:
        tmp[docfield].extend(batch_res[docfield])
    for i, sub_doc in enumerate(json_[docfield]):
        for sub_doc_new in tmp[docfield]:
            if sub_doc_new['id'] == sub_doc['id']:
                json_[docfield][i].update(sub_doc_new)
                break
    time_log('Completed multiprocessing extraction!')
    return json_


def chunk_document_collection(seq, num):
    """
    Helper function to break a collection of N = len(seq) documents
    into num batches.
    Input:
        - seq: list,
        a list of documents
        - num: int,
        number of batches to be broken into. This will usually be
        equal to the number of cores available
    Output:
        - out: list,
        a list of lists. Each sublist contains the batch-collection
        of documents to be used.
    """
    avg = len(seq) / float(num)
    out = []
    last = 0.0

    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg

    return out


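# Illustrative example: 7 documents split across 3 workers:
#
#   chunk_document_collection(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7'], 3)
#   -> [['d1', 'd2'], ['d3', 'd4'], ['d5', 'd6', 'd7']]

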
def semrep_parallel_worker((json_, key)):
    """
    Just a worker interface for the different SemRep
    executions.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - res : dic,
        the previous json-style dictionary enriched with medical concepts
    """
    res = extract_semrep(json_, key)
    return res


def get_concepts_from_edges_parallel(json_, key):
    """
    Same work as get_concepts_from_edges, but using multiprocessing
    for efficiency.
    Input:
        - json_: dict,
        json-style dictionary with a field containing
        relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict,
        the updated json-style dictionary where the relations
        in the list have been updated and each subject-object has been
        mapped to the corresponding concept info
    """
    outfield = settings['load'][key]['itemfield']
    N = len(json_[outfield])
    try:
        N_THREADS = int(settings['num_cores'])
    except:
        N_THREADS = cpu_count()
    batches = chunk_document_collection(json_[outfield], N_THREADS)
    len_col = " | ".join([str(len(b)) for b in batches])
    time_log('Will break the edges into batches of: %s documents!' % len_col)
    batches = [{outfield: batch} for batch in batches]
    data = zip(batches, [key for batch in batches])
    pool = Pool(N_THREADS, maxtasksperchild=1)
    res = pool.map(edges_parallel_worker, data)
    pool.close()
    pool.join()
    del pool
    json_ = {outfield: []}
    for batch_res in res:
        json_[outfield].extend(batch_res[outfield])
    time_log('Completed multiprocessing extraction!')
    return json_


def edges_parallel_worker((json_, key)):
    """
    Just a worker interface for the parallel enrichment
    executions.
    Input:
        - json_ : dic,
        json-style dictionary generated from the Parse object related
        to the specific type of input
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - res : dic,
        expected outcome of get_concepts_from_edges
    """
    res = get_concepts_from_edges(json_, key)
    return res


def get_concepts_from_edges(json_, key):
    """
    Get concept-specific info related to an entity from a list
    containing relations. Each subject/object in the relations
    list is expressed in another data source (MeSH, DrugBank etc.)
    and its unique identifier is provided. Articles and new
    kinds of subject/object are also handled.
    Input:
        - json_: dict,
        json-style dictionary with a field containing
        relations
        - key : str,
        string denoting the type of medical text to read from. Used to
        find the correct paragraph in the settings.yaml file.
    Output:
        - json_: dict,
        the updated json-style dictionary where the relations
        in the list have been updated and each subject-object has been
        mapped to the corresponding concept info
    """

    # docfield containing the list of elements containing the relations
    outfield = settings['load'][key]['itemfield']
    # field containing the type of the node for the subject
    sub_type = settings['load'][key]['sub_type']
    # field containing the source of the node for the subject
    sub_source = settings['load'][key]['sub_source']
    # field containing the type of the node for the object
    obj_type = settings['load'][key]['obj_type']
    # field containing the source of the node for the object
    obj_source = settings['load'][key]['obj_source']
    new_relations = []
    # Cache of already-fetched concepts, kept in MongoDB
    uri = settings['load']['mongo']['uri']
    db_name = settings['load']['mongo']['db']
    collection_name = settings['load']['mongo']['cache_collection']
    client = pymongo.MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    cur = collection.find({})
    cache = {}
    for item in cur:
        cache[item['key']] = item['value']
    N = len(json_[outfield])
    for ii, triple in enumerate(json_[outfield]):
        print triple
        try:
            # Resolve the subject of the triple
            if sub_source == 'UMLS':
                if not(triple['s'] in cache):
                    ent = get_concept_from_cui(triple['s'])
                    cache[triple['s']] = ent
                    collection.insert_one({'key': triple['s'], 'value': ent})
                    print 'INSERTED in UMLS %s' % triple['s']
                else:
                    ent = cache[triple['s']]
                if (type(ent['sem_types']) == list and len(ent['sem_types']) > 1):
                    sem_types = ';'.join(ent['sem_types'])
                elif (',' in ent['sem_types']):
                    sem_types = ';'.join(ent['sem_types'].split(','))
                else:
                    sem_types = ent['sem_types']

                triple_subj = [{'id:ID': ent['cuid'],
                                'label': ent['label'],
                                'sem_types:string[]': sem_types}]
            elif (sub_source == 'PMC') or (sub_source == 'TEXT') or (sub_source == 'None'):
                triple_subj = [{'id:ID': triple['s']}]
            else:
                if not(triple['s'] in cache):
                    ents = get_concept_from_source(triple['s'], sub_source)
                    cache[triple['s']] = ents
                    collection.insert_one({'key': triple['s'], 'value': ents})
                    print 'INSERTED in other %s' % triple['s']
                else:
                    ents = cache[triple['s']]
                triple_subj = []
                for ent in ents:
                    if (type(ent['sem_types']) == list and len(ent['sem_types']) > 1):
                        sem_types = ';'.join(ent['sem_types'])
                    elif (',' in ent['sem_types']):
                        sem_types = ';'.join(ent['sem_types'].split(','))
                    else:
                        sem_types = ent['sem_types']

                    triple_subj.append({'id:ID': ent['cuid'],
                                        'label': ent['label'],
                                        'sem_types:string[]': sem_types})
            # Resolve the object of the triple
            if obj_source == 'UMLS':
                if not(triple['o'] in cache):
                    ent = get_concept_from_cui(triple['o'])
                    cache[triple['o']] = ent
                    collection.insert_one({'key': triple['o'], 'value': ent})
                    print 'INSERTED in UMLS %s' % triple['o']
                else:
                    ent = cache[triple['o']]
                if (type(ent['sem_types']) == list and len(ent['sem_types']) > 1):
                    sem_types = ';'.join(ent['sem_types'])
                elif (',' in ent['sem_types']):
                    sem_types = ';'.join(ent['sem_types'].split(','))
                else:
                    sem_types = ent['sem_types']
                triple_obj = [{'id:ID': ent['cuid'],
                               'label': ent['label'],
                               'sem_types:string[]': sem_types}]
            elif (obj_source == 'PMC') or (obj_source == 'TEXT') or (obj_source == 'None'):
                triple_obj = [{'id:ID': triple['o']}]
            else:
                if not(triple['o'] in cache):
                    ents = get_concept_from_source(triple['o'], obj_source)
                    cache[triple['o']] = ents
                    collection.insert_one({'key': triple['o'], 'value': ents})
                    print 'INSERTED in other %s' % triple['o']
                else:
                    ents = cache[triple['o']]
                triple_obj = []
                for ent in ents:
                    if (type(ent['sem_types']) == list and len(ent['sem_types']) > 1):
                        sem_types = ';'.join(ent['sem_types'])
                    elif (',' in ent['sem_types']):
                        sem_types = ';'.join(ent['sem_types'].split(','))
                    else:
                        sem_types = ent['sem_types']

                    triple_obj.append({'id:ID': ent['cuid'],
                                       'label': ent['label'],
                                       'sem_types:string[]': sem_types})
            # Create one relation per subject-object combination
            combs = product(triple_subj, triple_obj)
            for comb in combs:
                new_relations.append({'s': comb[0], 'p': triple['p'], 'o': comb[1]})
        except Exception, e:
            time_log(e)
            time_log('S: %s | P: %s | O: %s' % (triple['s'], triple['p'], triple['o']))
            time_log('Skipped the above edge! Probably due to concept-fetching errors!')
        proc = int(ii / float(N) * 100)
        if proc % 10 == 0 and proc > 0:
            time_log('We are at %d/%d edges transformed -- %0.2f %%' % (ii, N, proc))
        # if ii % 100 == 0 and ii > 9:
        #     time_log("Edges Transformation Process: %d -- %0.2f %%" % (ii, 100*ii/float(len(json_[outfield]))))
    json_[outfield] = new_relations
    return json_
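
# Sketch of the transformation performed above: an input relation such as
# {'s': 'C0004057', 'p': 'INTERACTS_WITH', 'o': 'C0032181'} (with UMLS
# subject/object sources) is rewritten so that subject and object become node
# dictionaries ready for a Neo4j-style import, e.g.
# {'s': {'id:ID': 'C0004057', 'label': 'Aspirin', 'sem_types:string[]': 'phsu'},
#  'p': 'INTERACTS_WITH',
#  'o': {...}}
# The CUIs, labels and semantic types shown here are hypothetical examples.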