[302778]: / moline_sheet_src / main.py

Download this file

92 lines (75 with data), 3.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import sys
import os
sys.path.append(os.path.abspath("./"))
# import argparse
# from enum import Enum
import io
# from PIL import Image, ImageDraw
import os
# import tempfile
# from pdf2image import convert_from_path, convert_from_bytes
from request_handling_aws import get_text, get_all_text
from patient import Patient
import re
import pandas as pd
# Section-marker substrings. get_patients treats any line containing one of
# these as the start of that section: the marker line itself is not stored,
# and following lines are collected under that marker.
block_markers = ['<START>', 'ENCOUNTER', 'PATIENT', 'GUARANTOR', 'COVERAGE']
# Substring that get_patients uses to detect the boundary between two
# patient records: a line containing it closes the current patient.
breaking_phrase = 'QUAD CITIES'
def get_patients(text_block, block_markers, breaking_phrase):
    """Split a flat list of text lines into one Patient per record.

    Lines are scanned in order. A line containing ``breaking_phrase`` closes
    the record collected so far and starts a new one. A line containing any
    entry of ``block_markers`` switches the section that subsequent lines are
    filed under; the marker line itself is not stored. Every other line is
    appended to the currently active section.

    Args:
        text_block: list of text lines (a literal '<START>' line is
            prepended so that scanning begins in the '<START>' section).
        block_markers: section-marker substrings; also the keys of the
            per-patient section dictionary handed to ``Patient``.
        breaking_phrase: substring marking the boundary between records.

    Returns:
        list of ``Patient`` objects, one per record, in input order. The
        record being collected when the input ends is always included.
    """
    def _empty_sections():
        # A fresh, independent list for every marker.
        return {marker: [] for marker in block_markers}

    all_lines = ['<START>'] + text_block
    patients = []
    sections = _empty_sections()
    active_marker = ''
    patient = Patient(block_markers=block_markers)

    for raw_line in all_lines:
        stripped = raw_line.strip()

        if breaking_phrase in stripped:
            # Dump what was gathered for the record that is being closed.
            for section_name, section_lines in sections.items():
                print("\n\n" + section_name)
                print(section_lines)
            patient.process_gen_info(sections)
            patients.append(patient)
            # Begin the next record in the '<START>' section.
            active_marker = '<START>'
            sections = _empty_sections()
            patient = Patient(block_markers=block_markers)

        # First marker (in list order) found in this line, if any.
        matched = next(
            (marker for marker in block_markers if marker in stripped),
            None,
        )
        if matched is not None:
            active_marker = matched
            continue

        sections[active_marker].append(raw_line)

    # Flush the record still open when the input ran out.
    patient.process_gen_info(sections)
    patients.append(patient)
    return patients
def compile_dataframe(patient_list):
    """Build one DataFrame row per patient from their ``csv_rep()`` mappings.

    Columns are the union of all keys returned by the patients; columns that
    are empty for every patient are dropped. In every column whose name
    contains 'dob', string values are truncated at the first space.

    Args:
        patient_list: iterable of objects exposing ``csv_rep()``, which
            returns a mapping of column name to value.

    Returns:
        pandas.DataFrame with the first row removed (``iloc[1:]``).
        NOTE(review): presumably the first entry is the placeholder record
        produced before the first breaking phrase -- confirm against
        get_patients' callers.
    """
    # One single-row frame per patient, concatenated once at the end.
    # Growing the frame inside the loop re-copies all prior rows each time.
    frames = []
    for patient in patient_list:
        record = patient.csv_rep()
        frames.append(pd.DataFrame([record], columns=list(record.keys())))

    if frames:
        pat_df = pd.concat(frames, axis=0, join='outer', ignore_index=True)
    else:
        # pd.concat rejects an empty list; mirror the original empty result.
        pat_df = pd.DataFrame()

    pat_df = pat_df.dropna(axis=1, how='all')

    dob_cols = [col for col in pat_df.columns if 'dob' in col]
    print("DOB COLS:", dob_cols)
    for col in dob_cols:
        # Only strings can be split. Cells missing for a patient come out of
        # the outer join as NaN (a float), which is != None and would crash
        # on .split(); leave every non-string value untouched.
        pat_df[col] = pat_df[col].apply(
            lambda x: x.split(" ")[0] if isinstance(x, str) else x
        )
    return pat_df.iloc[1:]
def run_pipeline(full_body, form_data=None):
    """Run the Moline parsing pipeline on already-extracted text lines.

    Args:
        full_body: list of text lines, passed straight to get_patients.
        form_data: accepted but not used by this function.

    Returns:
        The DataFrame produced by compile_dataframe for the parsed patients.
    """
    print("RUNNING MOLINE PIPELINE........")
    section_markers = ['<START>', 'ENCOUNTER', 'PATIENT', 'GUARANTOR', 'COVERAGE']
    record_separator = 'QUAD CITIES'
    parsed_patients = get_patients(full_body, section_markers, record_separator)
    return compile_dataframe(parsed_patients)
if __name__ == "__main__":
    # Script entry point: fetch the extracted text, parse it into patients,
    # write the resulting table to ./output_fin.xlsx and print it in full.
    # The second value returned by get_all_text is not used here.
    full_body, _ids = get_all_text("facesheet-ap", "facesheet_moline/")
    print(full_body)
    # block_markers / breaking_phrase are the module-level constants defined
    # near the top of this file; they need not be re-declared here.
    patient_list = get_patients(full_body, block_markers, breaking_phrase)
    fin_df = compile_dataframe(patient_list)
    fin_df.to_excel("./output_fin.xlsx")
    # Lift pandas' display truncation so every row and column is printed.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(fin_df)