medical_extraction / Git / [302778] /request_handling

Models:
philipB/
medical_extraction
Downloads: 1
[302778]: / request_handling_aws.py
History
Download this file
91 lines (77 with data), 3.3 kB

import argparse
import io
import boto3
from trp import Document

# https://facesheet-ap.s3.amazonaws.com/000a5953-9b4a-4abd-9002-a1347ba949e2.png
client_text = boto3.client('textract')
client_s3 = boto3.client('s3')
client_dynamo = boto3.client('dynamodb')
med_comp_client = boto3.client("comprehendmedical")

def get_text(bucket_name, key):
    response = client_text.detect_document_text( Document={'S3Object': {'Bucket': bucket_name, 'Name': key}})
    doc_text = []
    for item in response["Blocks"]:
        if item["BlockType"] == "LINE":
            doc_text.append(item["Text"])
    return doc_text, key.split("/")[1].split(".")[0]

def get_all_text(bucket_name, directory, event_list = None, require_form=False):

    full_text = []
    ids = []
    form_data = []

    if event_list != None:
        for s3_item in event_list:
            next_doc, id = get_text(bucket_name, s3_item['s3']['object']['key'])
            ids.append(id)
            full_text += next_doc
            if require_form == True:
                form = get_text_analysis(bucket_name, s3_item['s3']['object']['key'])
                form_data.append(form)

    else:
        bucket_list = client_s3.list_objects_v2(Bucket = bucket_name, Prefix=directory)
        for blob in bucket_list['Contents']:
            if blob['Key'] != directory:
                print("----------------------" + blob['Key'] + "---------------------------")
                next_doc, id = get_text(bucket_name,blob['Key'])
                print(id)
                print(next_doc)
                ids.append(id)
                full_text += next_doc
                # print(full_text)
                for line in next_doc:
                    print (line)
                if require_form == True:
                    form = get_text_analysis(bucket_name, blob['Key'])
                    form_data.append(form)

    if len(form_data) > 0:
        return full_text, ids, form_data
    else:
        return full_text, ids

def get_comprehend(text_block):
    detection_map = med_comp_client.detect_entities_v2(Text='\n'.join(text_block))
    # out_map = {}
    # for entity in detection_map["Entities"]:
    #     out_map[entity['Type']] = entity['Text']

    # return out_map
    return detection_map["Entities"]

def get_text_analysis(bucket_name, key):
    response_analysis = client_text.analyze_document( Document={'S3Object': {'Bucket': bucket_name, 'Name': key}}, FeatureTypes=['FORMS'])
    extract = Document(response_analysis)
    form_ext = []
    for page in extract.pages:
        print("Key Value Pairs:")
        for headings in page.form.fields:
            print("Detected Key: {}, Detected Value: {}".format(headings.key, headings.value))
            form_ext.append((str(headings.key), str(headings.value)))

    return form_ext

def put_dynamo(table_name, df, ids):
    put_dict = {}
    list_records = df.to_dict("records")
    print(ids)
    for dict,id in zip(list_records,ids):
        for key, val in dict.items():
            if val != None:
                put_dict[key] = {"S" : val}

        print(id)
        put_dict["patient_id"] = {"S" : id}
        client_dynamo.put_item(TableName=table_name, Item=put_dict)
        print("Patient Item Stored. ID = " + id)