--- a +++ b/sag_sheet_src/main.py @@ -0,0 +1,100 @@ +import sys +import os +sys.path.append(os.path.abspath("../")) + +import argparse +from enum import Enum +import io +from gcloud import storage +from google.cloud import vision +from google.cloud.vision import types +from PIL import Image, ImageDraw +import os +import tempfile +from pdf2image import convert_from_path, convert_from_bytes +from request_handling_aws import get_text, get_all_text +from patient import Patient +import re +import pandas as pd +import nltk + + +block_markers = ['<START>', 'NAME AND ADDRESS', 'EMERGENCY CONTACT NAME AND ADDRESS', 'PRIMARY INSURANCE', 'GUARANTOR', 'ADMITTING PHYSICIAN'] +breaking_phrase = 'SAINT ANTHONY' + + +def get_patients(text_block, form_data, block_markers, breaking_phrase): + + patient_list = [] + + text_block = ['<START>'] + text_block + blocks = {x:[] for x in block_markers} + curr_marker = '' + curr_patient = Patient(block_markers=block_markers) + + i = 0 + for line in text_block: + nxt_ln = 0 + if nltk.edit_distance(breaking_phrase, line.strip())<3: + for key,val in blocks.items(): + print("\n\n" + key) + print(val) + if len(blocks['<START>']) == 0: + continue + curr_patient.process_gen_info(blocks, form_data[i]) + i += 1 + patient_list.append(curr_patient) + curr_marker = '<START>' + blocks = {x:[] for x in block_markers} + curr_patient = Patient(block_markers=block_markers) + + for block_marker in block_markers: + if nltk.edit_distance(block_marker, line.strip())<3: + curr_marker = block_marker + nxt_ln = 1 + break + + if nxt_ln == 1: + continue + + blocks[curr_marker].append(line) + for key,val in blocks.items(): + print("\n\n" + key) + print(val) + curr_patient.process_gen_info(blocks, form_data[i]) + patient_list.append(curr_patient) + return patient_list + +def compile_dataframe(patient_list): + pat_df = pd.DataFrame() + for patient in patient_list: + pat_df = pd.concat([pat_df,pd.DataFrame([patient.csv_rep()], columns=patient.csv_rep().keys())],axis=0,join='outer').reset_index(drop=True) + pat_df = pat_df.dropna(axis=1, how='all') + + dob_cols = [col for col in pat_df.columns if 'dob' in col] + print("DOB COLS:", dob_cols) + print(pat_df) + for col in dob_cols: + pat_df[col] = pat_df[col].astype(str).apply(lambda x: x.split(" ")[0] if (x != None) else x) + return pat_df + +def run_pipeline(full_body, form_data=None): + print("RUNNING SAG PIPELINE........") + block_markers = ['<START>', 'NAME AND ADDRESS', 'EMERGENCY CONTACT NAME AND ADDRESS', 'PRIMARY INSURANCE', 'GUARANTOR', 'ADMITTING PHYSICIAN'] + breaking_phrase = 'SAINT ANTHONY' + patient_list = get_patients(full_body, form_data, block_markers, breaking_phrase) + fin_df = compile_dataframe(patient_list) + return fin_df + +if __name__ == "__main__": + full_body, ids, form_data = get_all_text("facesheet-ap","facesheet_sag/", require_form=True) + record = [] + pat_fl = 0 + nam_nl = 0 + block_markers = ['<START>', 'NAME AND ADDRESS', 'EMERGENCY CONTACT NAME AND ADDRESS', 'PRIMARY INSURANCE', 'GUARANTOR', 'ADMITTING PHYSICIAN'] + breaking_phrase = 'SAINT ANTHONY' + patient_list = get_patients(full_body, form_data, block_markers, breaking_phrase) + fin_df = compile_dataframe(patient_list) + fin_df.to_excel("./output_fin.xlsx") + with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also + print(fin_df)