NER-Medical-Documents / Git / [9063a2] /run.py

Models:
philipB/
NER-Medical-Documents
Downloads: 1
[9063a2]: / run.py
History
Download this file
126 lines (95 with data), 5.2 kB

import streamlit
from ocr import extract_images_from_pdf, ocr_core
from evaluation import higlight
import os
import pdfplumber
import base64


def process_file(file, choices=None):
    ''' Store an uploaded file on disk and run the OCR / highlighting pipeline.

    Args:
        file: uploaded file object exposing ``name`` and ``getbuffer()``.
        choices: list of five booleans forwarded to ``higlight``; the caller
            in this file passes them in the order chemicals, diseases, dates,
            adverse effects, doses. Defaults to only the first one enabled.
    '''
    # Build the default per call: a list literal in the signature would be
    # shared (and mutable) across every invocation.
    if choices is None:
        choices = [True, False, False, False, False]

    # splitext drops the real extension whatever its length, unlike name[:-4]
    stem = os.path.splitext(file.name)[0]
    target_dir = os.path.join("./NER-Medical-Document/processed_files", stem)
    stored_copy = os.path.join(target_dir, file.name)

    # makedirs also creates a missing "processed_files" parent and does not
    # fail when the directory is already there
    os.makedirs(target_dir, exist_ok=True)

    # Keep an already stored copy untouched, but recover when the directory
    # exists without the file (e.g. an earlier write was interrupted)
    if not os.path.exists(stored_copy):
        with open(stored_copy, "wb") as f:
            f.write(file.getbuffer())

    # NOTE(review): presumably this step produces "<stem>.pdf" for image
    # uploads, since the later steps are given a .pdf name - confirm in ocr.py
    extract_images_from_pdf(file.name)

    # The OCR and highlighting steps are always given a .pdf file name
    pdf_name = file.name if file.name.endswith('.pdf') else stem + ".pdf"
    ocr_core(pdf_name)
    higlight(pdf_name, choices)

    

def display_pdf(file: str):
    ''' Embed the pdf located at the path `file` in the Streamlit page '''

    # Load the raw bytes first, then turn them into base64 text
    with open(file, "rb") as pdf_handle:
        raw_bytes = pdf_handle.read()
    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    # Wrap the encoded document in an <embed> tag using a data URI
    embed_html = F'<embed src="data:application/pdf;base64,{encoded}" width="650" height="1000" type="application/pdf">'

    # Raw HTML is only rendered when unsafe_allow_html is enabled
    streamlit.markdown(embed_html, unsafe_allow_html=True)
  


def _find_upload(uploaded_files, name):
    ''' Return the uploaded file called `name`, or None when there is none '''
    return next((f for f in uploaded_files if f.name == name), None)


def run():
    ''' Main function to run the app

    Builds the page: file upload, file and entity selection, and the buttons
    to process a file, view the highlighted result, view the original file
    and view its text transcript.
    '''

    # set the title
    streamlit.title('Prototype')

    # add a file explorer to load multiple files
    uploaded_files = streamlit.file_uploader('Upload your file here', type=['pdf', 'png', 'jpg'], accept_multiple_files=True)

    # The script is re-executed on every interaction, so the flags are only
    # seeded when missing; resetting them each run would undo the toggle below.

    # bool flag: True while the user wants the original file displayed
    if "view_file" not in streamlit.session_state:
        streamlit.session_state["view_file"] = False

    # bool flag: True once a file has been processed and the results are ready
    if "view_results" not in streamlit.session_state:
        streamlit.session_state["view_results"] = False

    # highlight colour per entity type, in the same order as `choices` below
    codes = {'Chemicals': '#FFFF00', 'Diseases': '#00FF00', 'Dates': '#00B3FF', 'Adverse effects': '#FF0000', 'Doses': '#FF00FF'}

    # add a selectbox to select the file to process
    option = streamlit.selectbox(
        'Select the file you want to process',
        tuple(uploaded_file.name for uploaded_file in uploaded_files))

    # Resolve the selection once; None when nothing is uploaded or selected
    selected = _find_upload(uploaded_files, option)

    # add checkboxes to select the type of entities to extract
    with streamlit.expander(label='Select the entities you want to higlight', expanded=False):
        chemicals = streamlit.checkbox('Chemicals', value=True)
        diseases = streamlit.checkbox('Diseases', value=False)
        dates = streamlit.checkbox('Dates', value=False)
        adverse = streamlit.checkbox('Adverse effects', value=False)
        doses = streamlit.checkbox('Doses', value=False)

    choices = [chemicals, diseases, dates, adverse, doses]

    # add a button to process the selected file
    if streamlit.button('Process file') and selected is not None:
        process_file(selected, choices)
        streamlit.session_state["view_results"] = True

    # add a button to display the highlighted pdf next to the colour legend
    if streamlit.button('View results') and selected is not None:
        col1, col2 = streamlit.columns([10000, 1], gap='large')
        with col1:
            stem = os.path.splitext(selected.name)[0]
            highlighted = "./NER-Medical-Document/processed_files/"+stem+"/"+stem+"_highlighted.pdf"
            if not os.path.exists(highlighted):
                streamlit.write("The file has not been processed yet")
            else:
                display_pdf(highlighted)
        with col2:
            # show a colour swatch only for the entity types that are ticked
            for (label, colour), enabled in zip(codes.items(), choices):
                if enabled:
                    streamlit.color_picker(label, colour, disabled=False)

    # add a button to toggle or untoggle the original file
    if streamlit.button('View file') and selected is not None:
        streamlit.session_state["view_file"] = not streamlit.session_state["view_file"]

    # The flag outlives the selection, so re-check that a file is still there
    if streamlit.session_state["view_file"] and selected is not None:
        if selected.name.endswith('.pdf'):
            # getvalue() returns the whole buffer regardless of the current
            # read position, so earlier reads cannot leave it empty
            base64_pdf = base64.b64encode(selected.getvalue()).decode('utf-8')
            pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="600" height="1300" type="application/pdf">'
            streamlit.markdown(pdf_display, unsafe_allow_html=True)
        elif selected.name.endswith(('.png', '.jpg')):
            streamlit.image(selected, use_column_width=True)

    # add a button to print the text extracted from each page of the file
    if streamlit.button('View transcript') and selected is not None:
        try:
            # rewind in case the stream was consumed earlier in this run
            selected.seek(0)
            with pdfplumber.open(selected) as pdf:
                for page in pdf.pages:
                    streamlit.write(page.extract_text())
        except Exception:
            # Best effort: files that cannot be parsed as pdf (e.g. images)
            # just show "None". A bare except would also trap
            # KeyboardInterrupt and SystemExit.
            streamlit.write("None")


# Entry point: build the app only when this file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    run()