# build_graphs.py
import numpy as np
import matplotlib.pyplot as pl
from language_processing import *
from extract_data import *
from loader import get_data
from extract_data import get_doc_rel_dates, get_ef_values, get_doc_keywords
import re

def plot_num_docs(patient_range=range(90), show_ef_plots=False):
    """Plot a stacked histogram of when each patient's data was recorded.

    For every patient index in ``patient_range`` the patient record is
    loaded with ``get_data`` and its per-document-type relative dates are
    accumulated through ``get_doc_rel_dates``.  The accumulated offsets are
    then split into "note" and "structured" groups using ``is_note_doc``
    and drawn as one stacked histogram (x axis in days).

    Args:
        patient_range: iterable of integer patient indices to load.
        show_ef_plots: when true, additionally show one scatter plot per
            patient of the values returned by
            ``get_ef_values(data, car_only=True)``, provided more than two
            were found.  Defaults to off, which matches the previous
            hard-coded behaviour.
            NOTE(review): assumes ``get_ef_values`` returns a sequence of
            (date, ef) pairs, as the earlier disabled code did -- confirm.

    Returns:
        None.  The function prints progress and counts, and opens
        matplotlib windows.
    """
    rel_dates = dict()

    for i in patient_range:
        # Lightweight progress indicator while loading patients.
        if i % 25 == 0:
            print(i)

        data = get_data([i])[0]
        # get_doc_rel_dates takes the running dict and returns the updated
        # one, so the result accumulates over all patients.
        rel_dates = get_doc_rel_dates(data, rel_dates, True)

        if show_ef_plots:
            ef_occurances = get_ef_values(data, car_only=True)
            # Only patients with more than two readings are plotted.
            if len(ef_occurances) > 2:
                dates, efs = zip(*ef_occurances)
                pl.figure()
                pl.scatter(dates, efs)
                pl.show()

    # NOTE: per-keyword histograms used to be sketched here, but keyword
    # collection was never enabled, so that code could not run (and it
    # indexed rel_dates with a stale loop variable).  Re-introduce it on
    # top of get_doc_keywords if those plots are needed.

    # Split the relative dates into note vs. structured documents; each
    # entry exposes its offset through the ``.days`` attribute.
    note_deltas = []
    struct_deltas = []
    for doc_type in rel_dates:
        day_offsets = [x.days for x in rel_dates[doc_type]]
        if is_note_doc(doc_type):
            note_deltas += day_offsets
        else:
            struct_deltas += day_offsets

    bins = 100

    # Single-argument print calls behave the same on Python 2 and 3; the
    # spacing reproduces what the old print statements emitted.
    print("")
    print("Notes:  %d" % len(note_deltas))
    print("Structs:  %d" % len(struct_deltas))

    pl.figure()
    pl.hist(
        [note_deltas, struct_deltas],
        bins,
        stacked=True,
        color=['blue', 'red'],
        label=['Number of sentences in\nunstructured notes',
               'Number of structured entries'],
    )
    pl.legend(loc=2)
    pl.title("Frequency of Occurances of New Data in Patient")
    pl.xlabel("Days Since Implant Procedure")
    pl.ylabel("Number of Pieces of Information")
    pl.show()
76