medical_extraction / Git / [302778] /sag_sheet

Models:
philipB/
medical_extraction
Downloads: 1
[302778]: / sag_sheet_src / main.py
History
Download this file
101 lines (85 with data), 3.6 kB

import sys
import os
sys.path.append(os.path.abspath("../"))

import argparse
from enum import Enum
import io
from gcloud import storage
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw
import os
import tempfile
from pdf2image import convert_from_path, convert_from_bytes
from request_handling_aws import get_text, get_all_text
from patient import Patient
import re
import pandas as pd
import nltk


# Section headings of one patient record. get_patients() compares each
# stripped input line against the markers it is given (edit distance < 3) and
# files the lines that follow under the matching marker. '<START>' is the
# synthetic marker that get_patients() prepends to the text it receives.
block_markers = ['<START>', 'NAME AND ADDRESS', 'EMERGENCY CONTACT NAME AND ADDRESS', 'PRIMARY INSURANCE', 'GUARANTOR', 'ADMITTING PHYSICIAN']
# Line that get_patients() treats as the boundary between two patient records
# (matched with the same edit-distance tolerance as the markers).
breaking_phrase = 'SAINT ANTHONY'


def _form_entry(form_data, index):
    """Return ``form_data[index]``, or ``None`` when ``form_data`` is ``None``."""
    return None if form_data is None else form_data[index]


def _print_blocks(blocks):
    """Print each block marker followed by the list of lines stored under it."""
    for key, val in blocks.items():
        print("\n\n" + key)
        print(val)


def get_patients(text_block, form_data, block_markers, breaking_phrase):
    """Split a sequence of text lines into per-patient blocks and build Patients.

    A synthetic ``'<START>'`` line is prepended to ``text_block``. Every line
    is stripped and fuzzy-matched (``nltk.edit_distance(...) < 3``):

    * A line matching ``breaking_phrase`` closes the current patient: the
      collected blocks are printed, handed to ``Patient.process_gen_info``
      together with the next ``form_data`` entry, and a fresh ``Patient`` is
      started. If nothing has been collected under ``'<START>'`` yet, the
      line is skipped and no patient is emitted.
    * A line matching one of ``block_markers`` switches the active block and
      is not stored itself.
    * Any other line is appended to the active block. This includes a
      breaking-phrase line that has just closed a patient: it becomes the
      first ``'<START>'`` line of the next patient.

    After the last line, whatever has been collected is always processed as
    one final patient, even when every block is empty.

    Args:
        text_block: Iterable of text lines (strings).
        form_data: Sequence indexed by patient order; entry ``i`` is passed
            to the ``i``-th patient's ``process_gen_info``. May be ``None``,
            in which case ``None`` is passed for every patient.
        block_markers: Marker strings naming the blocks; expected to include
            ``'<START>'``.
        breaking_phrase: Text of the line that separates two patients.

    Returns:
        List of ``Patient`` objects in the order they were encountered.

    Raises:
        IndexError: If ``form_data`` has fewer entries than patients found.
        KeyError: If ``'<START>'`` is not one of ``block_markers``.
    """
    patient_list = []

    # The sentinel guarantees a marker is active before any real line is stored.
    text_block = ['<START>', *text_block]
    blocks = {x: [] for x in block_markers}
    curr_marker = ''
    curr_patient = Patient(block_markers=block_markers)

    i = 0
    for line in text_block:
        stripped = line.strip()

        if nltk.edit_distance(breaking_phrase, stripped) < 3:
            # NOTE(review): this dumps the raw collected text to stdout, which
            # presumably contains patient details - confirm that is acceptable.
            _print_blocks(blocks)
            if not blocks['<START>']:
                continue
            # NOTE(review): a missing form_data is forwarded as None; confirm
            # Patient.process_gen_info accepts that.
            curr_patient.process_gen_info(blocks, _form_entry(form_data, i))
            i += 1
            patient_list.append(curr_patient)
            curr_marker = '<START>'
            blocks = {x: [] for x in block_markers}
            curr_patient = Patient(block_markers=block_markers)

        # First marker within the edit-distance tolerance wins.
        matched_marker = next(
            (marker for marker in block_markers
             if nltk.edit_distance(marker, stripped) < 3),
            None,
        )
        if matched_marker is not None:
            curr_marker = matched_marker
            continue

        blocks[curr_marker].append(line)

    _print_blocks(blocks)
    curr_patient.process_gen_info(blocks, _form_entry(form_data, i))
    patient_list.append(curr_patient)
    return patient_list

def compile_dataframe(patient_list):
    """Build one DataFrame with a row per patient from their ``csv_rep()`` output.

    Each patient's ``csv_rep()`` mapping becomes one row; columns are the
    union of all keys (outer join) and the index is renumbered from 0.
    Columns whose values are all missing are dropped. In every remaining
    column whose name contains ``'dob'``, each present value is converted to
    a string and cut at the first space, so ``'1990-01-02 00:00:00'``
    becomes ``'1990-01-02'``. Missing values in those columns stay missing
    instead of being turned into the text ``'nan'``/``'None'``.

    The list of DOB columns and the frame are printed before the DOB
    columns are rewritten.

    Args:
        patient_list: Iterable of objects exposing ``csv_rep()``, which must
            return a mapping of column name to value.

    Returns:
        The combined ``pandas.DataFrame``; empty when ``patient_list`` is
        empty.
    """
    frames = []
    for patient in patient_list:
        rep = patient.csv_rep()  # evaluate once per patient
        frames.append(pd.DataFrame([rep], columns=rep.keys()))

    # Concatenate once instead of growing a frame row by row.
    if frames:
        pat_df = pd.concat(frames, axis=0, join='outer').reset_index(drop=True)
    else:
        pat_df = pd.DataFrame()
    pat_df = pat_df.dropna(axis=1, how='all')

    dob_cols = [col for col in pat_df.columns if 'dob' in col]
    print("DOB COLS:", dob_cols)
    print(pat_df)
    for col in dob_cols:
        column = pat_df[col]
        # Decide "missing" before stringifying; afterwards NaN/None would be
        # indistinguishable from real text.
        present = column.notna()
        pat_df[col] = [
            str(value).split(" ")[0] if is_present else value
            for value, is_present in zip(column, present)
        ]
    return pat_df

def run_pipeline(full_body, form_data=None):
    """Run patient extraction on ``full_body`` and return the result table.

    Prints a start banner, calls ``get_patients`` with the fixed marker list
    and breaking phrase defined below, and feeds the resulting patients to
    ``compile_dataframe``.

    Args:
        full_body: Text lines forwarded unchanged to ``get_patients``.
        form_data: Forwarded unchanged to ``get_patients``; defaults to
            ``None``.

    Returns:
        Whatever ``compile_dataframe`` returns for the extracted patients.
    """
    print("RUNNING SAG PIPELINE........")
    # Built fresh on every call so callees never share a marker list.
    markers = [
        '<START>',
        'NAME AND ADDRESS',
        'EMERGENCY CONTACT NAME AND ADDRESS',
        'PRIMARY INSURANCE',
        'GUARANTOR',
        'ADMITTING PHYSICIAN',
    ]
    separator = 'SAINT ANTHONY'
    patients = get_patients(full_body, form_data, markers, separator)
    return compile_dataframe(patients)

if __name__ == "__main__":
    # Script entry point: fetch the text and form data, extract the patients,
    # write the table to ./output_fin.xlsx and print it in full.
    # NOTE(review): "facesheet-ap" / "facesheet_sag/" are presumably a storage
    # bucket and key prefix - confirm against get_all_text.
    # The second returned value is not used by this script.
    full_body, _ids, form_data = get_all_text("facesheet-ap","facesheet_sag/", require_form=True)
    # block_markers and breaking_phrase are the module-level constants defined
    # near the top of this file.
    patient_list = get_patients(full_body, form_data, block_markers, breaking_phrase)
    fin_df = compile_dataframe(patient_list)
    fin_df.to_excel("./output_fin.xlsx")
    # Lift pandas' row/column display limits so the whole frame is printed.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(fin_df)