|
a |
|
b/build_graphs.py |
|
|
1 |
import numpy as np |
|
|
2 |
import matplotlib.pyplot as pl |
|
|
3 |
from language_processing import * |
|
|
4 |
from extract_data import * |
|
|
5 |
from loader import get_data |
|
|
6 |
from extract_data import get_doc_rel_dates, get_ef_values, get_doc_keywords |
|
|
7 |
import re |
|
|
8 |
|
|
|
9 |
def plot_num_docs(patient_range=range(90)):
    """Plot how much new information appears over time for a set of patients.

    For each index in ``patient_range`` the patient's data is loaded with
    ``get_data`` and folded into a running ``rel_dates`` mapping by
    ``get_doc_rel_dates``.  The ``.days`` value of every entry is then put in
    one of two buckets, depending on what ``is_note_doc`` says about the
    document type, and the two buckets are drawn as one stacked histogram.

    The ejection-fraction scatter plots and per-keyword histograms are not
    produced: their data collection was disabled, which left those code
    paths unreachable, so they are omitted here.

    Args:
        patient_range: iterable of patient indices; each one is passed to
            ``get_data`` as a single-element list.  Defaults to
            ``range(90)``.

    Returns:
        None.  The function prints progress and the two bucket sizes, then
        shows the figure.
    """
    # NOTE(review): rel_dates is presumably {doc_type: [timedelta, ...]};
    # the code below only relies on iterating its keys and reading ``.days``
    # from each stored value -- confirm against get_doc_rel_dates.
    rel_dates = dict()
    for i in patient_range:
        if i % 25 == 0:
            # Progress indicator; a single parenthesised argument prints the
            # same text under Python 2 and Python 3.
            print(i)
        data = get_data([i])[0]
        # The accumulator is passed in and the returned mapping is kept, so
        # the result grows across patients.
        rel_dates = get_doc_rel_dates(data, rel_dates, True)

    note_deltas = []
    struct_deltas = []
    for doc_type in rel_dates:
        bucket = note_deltas if is_note_doc(doc_type) else struct_deltas
        bucket.extend(x.days for x in rel_dates[doc_type])

    bins = 100

    # The double space after the colon reproduces what the Python 2
    # statement ``print "Notes: ", n`` emits (literal + separator space).
    print("")
    print("Notes:  %d" % len(note_deltas))
    print("Structs:  %d" % len(struct_deltas))

    pl.figure()
    pl.hist(
        [note_deltas, struct_deltas],
        bins,
        stacked=True,
        color=['blue', 'red'],
        label=['Number of sentences in\nunstructured notes',
               'Number of structured entries'],
    )
    pl.legend(loc=2)
    pl.title("Frequency of Occurances of New Data in Patient")
    pl.xlabel("Days Since Implant Procedure")
    pl.ylabel("Number of Pieces of Information")
    pl.show()
|
|
75 |
|
|
|
76 |
|