# sag_sheet_src/main.py
1
import sys
2
import os
3
sys.path.append(os.path.abspath("../"))
4
5
import argparse
6
from enum import Enum
7
import io
8
from gcloud import storage
9
from google.cloud import vision
10
from google.cloud.vision import types
11
from PIL import Image, ImageDraw
12
import os
13
import tempfile
14
from pdf2image import convert_from_path, convert_from_bytes
15
from request_handling_aws import get_text, get_all_text
16
from patient import Patient
17
import re
18
import pandas as pd
19
import nltk
20
21
22
# Section headings that split one face sheet into named blocks. '<START>' is a
# synthetic marker for the text that precedes the first real heading.
block_markers = ['<START>', 'NAME AND ADDRESS', 'EMERGENCY CONTACT NAME AND ADDRESS', 'PRIMARY INSURANCE', 'GUARANTOR', 'ADMITTING PHYSICIAN']
# Line whose (fuzzy) appearance is treated as the boundary between two patients.
breaking_phrase = 'SAINT ANTHONY'
24
25
26
def _print_blocks(blocks):
    """Dump every block name and its collected lines to stdout (debug aid)."""
    for key, val in blocks.items():
        print("\n\n" + key)
        print(val)


def _form_entry(form_data, index):
    """Return ``form_data[index]``, or ``None`` when no form data was supplied."""
    # NOTE(review): passing None on assumes Patient.process_gen_info tolerates
    # a missing form entry -- confirm against the Patient class.
    if form_data is None:
        return None
    return form_data[index]


def get_patients(text_block, form_data, block_markers, breaking_phrase):
    """Split a stream of text lines into per-patient blocks and build Patients.

    Lines are compared with ``nltk.edit_distance`` (threshold: distance < 3)
    against ``breaking_phrase`` and against each entry of ``block_markers``:

    * a line close to ``breaking_phrase`` closes the current patient, provided
      its '<START>' block already holds text, and opens a new one;
    * a line close to a block marker switches the block that following lines
      are appended to; the marker line itself is not stored;
    * any other line is appended to the currently selected block.

    Args:
        text_block: list of text lines for one or more patients.
        form_data: indexable collection with one entry per patient, in the
            order the patients occur in ``text_block``; may be ``None``.
        block_markers: block heading strings; must contain '<START>'.
        breaking_phrase: text that marks the boundary between patients.

    Returns:
        list of ``Patient`` objects, each already fed through
        ``process_gen_info`` with its blocks and its form entry.
    """
    patient_list = []

    # A synthetic first line selects the '<START>' block before any real text.
    lines = ['<START>'] + text_block
    blocks = {marker: [] for marker in block_markers}
    curr_marker = ''
    curr_patient = Patient(block_markers=block_markers)
    form_idx = 0

    for line in lines:
        stripped = line.strip()

        if nltk.edit_distance(breaking_phrase, stripped) < 3:
            _print_blocks(blocks)
            # Nothing gathered in '<START>' yet: this boundary line does not
            # close a patient, so drop it and keep collecting.
            if not blocks['<START>']:
                continue
            curr_patient.process_gen_info(blocks, _form_entry(form_data, form_idx))
            form_idx += 1
            patient_list.append(curr_patient)
            curr_marker = '<START>'
            blocks = {marker: [] for marker in block_markers}
            curr_patient = Patient(block_markers=block_markers)
            # No `continue` here: the boundary line falls through and is
            # stored in the new patient's '<START>' block.

        matched = next(
            (marker for marker in block_markers
             if nltk.edit_distance(marker, stripped) < 3),
            None,
        )
        if matched is not None:
            # Heading lines only switch the active block.
            curr_marker = matched
            continue

        blocks[curr_marker].append(line)

    _print_blocks(blocks)
    # Flush the trailing patient, but only if some text was collected for it;
    # otherwise an empty input would produce a phantom patient (and index
    # past the end of form_data).
    if any(blocks.values()):
        curr_patient.process_gen_info(blocks, _form_entry(form_data, form_idx))
        patient_list.append(curr_patient)
    return patient_list
67
68
def compile_dataframe(patient_list):
    """Combine the ``csv_rep()`` mapping of each patient into one DataFrame.

    One row is produced per patient; columns are the union of all keys.
    Columns that are empty for every patient are dropped. In every column
    whose name contains 'dob', present values are converted to text and cut
    at the first space (dropping a trailing time component); missing values
    stay missing instead of becoming the strings 'None' / 'nan' / 'NaT'.

    Args:
        patient_list: iterable of objects exposing ``csv_rep()`` that returns
            a mapping of column name to value.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index.
    """
    # Build the frame in one pass: csv_rep() is called once per patient and
    # the quadratic concat-in-a-loop is avoided.
    records = [patient.csv_rep() for patient in patient_list]
    pat_df = pd.DataFrame(records).dropna(axis=1, how='all')

    dob_cols = [col for col in pat_df.columns if 'dob' in col]
    print("DOB COLS:", dob_cols)
    print(pat_df)
    for col in dob_cols:
        # Record which cells hold a value before stringifying, because
        # astype(str) turns missing values into ordinary strings.
        present = pat_df[col].notna()
        date_part = pat_df[col].astype(str).str.split(" ").str[0]
        pat_df[col] = date_part.where(present)
    return pat_df
80
81
def run_pipeline(full_body, form_data=None):
    """Run the full extraction: text lines -> patients -> DataFrame.

    Uses the module-level ``block_markers`` and ``breaking_phrase`` rather
    than re-declaring private copies of them.

    Args:
        full_body: list of text lines, handed to ``get_patients``.
        form_data: per-patient form entries, handed to ``get_patients``
            unchanged (``None`` by default).

    Returns:
        The DataFrame produced by ``compile_dataframe``.
    """
    print("RUNNING SAG PIPELINE........")
    patient_list = get_patients(full_body, form_data, block_markers, breaking_phrase)
    return compile_dataframe(patient_list)
88
89
if __name__ == "__main__":
    # Script entry point: fetch the text and form data, run the pipeline,
    # write the result to an Excel file and print the full table.
    # The second return value of get_all_text is not used here.
    full_body, _ids, form_data = get_all_text("facesheet-ap","facesheet_sag/", require_form=True)
    fin_df = run_pipeline(full_body, form_data)
    fin_df.to_excel("./output_fin.xlsx")
    # Lift pandas' row/column display limits so nothing is elided.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(fin_df)