# NER-Medical-Documents / Git / [9063a2] /ocr.py

# Models:
# philipB/
# NER-Medical-Documents
# Downloads: 1
# [9063a2]: / ocr.py
# History
# Download this file
# 46 lines (32 with data), 1.6 kB

import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import os 
from tqdm import tqdm
import streamlit




def extract_images_from_pdf(filename: str):
    """Prepare an uploaded document for the OCR step.

    The document is read from
    ``./HummingBird_prototype/processed_files/<stem>/<filename>``, where
    ``<stem>`` is ``filename`` with its last four characters (the extension)
    removed.

    * ``.pdf`` files: every page is rendered to a PNG image written into the
      same folder, with progress messages sent to the Streamlit page.
    * ``.png`` / ``.jpg`` files: a PDF copy of the image is saved next to it.
    * Any other file name: nothing is done.

    Args:
        filename: Bare file name (not a path) of the document to process.
    """
    # All supported extensions ('.pdf', '.png', '.jpg') are four characters
    # long, so slicing them off yields the per-document folder name.
    root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/'

    # If the file is a PDF, convert each page into an image.
    if filename.endswith('.pdf'):
        streamlit.write("Extracting images from pdf...")
        # NOTE(review): filename[:1] keeps only the FIRST character of the
        # name as the output prefix -- possibly meant to be the full stem.
        # Left unchanged because other code may rely on the generated names.
        convert_from_path(
            root + filename,
            output_folder=root,
            fmt='png',
            output_file=filename[:1] + 'page',
        )
        streamlit.write("Extraction done")

    # If not a PDF, save a PDF copy (it was meant to be used for highlighting
    # later [UPDATE: highlighting does not work on images converted to PDF]).
    elif filename.endswith(('.png', '.jpg')):
        # Use a context manager so the underlying image file is always closed.
        with Image.open(root + filename) as image:
            image.convert('RGB').save(root + filename[:-4] + '.pdf')



def ocr_core(filename: str):
    """Run Tesseract OCR on every image in a document's processed folder.

    The folder is
    ``./HummingBird_prototype/processed_files/<stem>/``, where ``<stem>`` is
    ``filename`` with its last four characters (the extension) removed. For
    each ``.png`` or ``.jpg`` file found there, the recognised text is written
    to ``<image stem>-ocr.txt`` in the same folder. Progress messages are sent
    to the Streamlit page and a tqdm progress bar tracks the directory scan.

    Args:
        filename: Bare file name (not a path) of the original document; only
            used to locate its processed folder.
    """
    streamlit.write("Performing OCR...")
    root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/'
    for file in tqdm(os.listdir(root)):
        # Skip anything that is not a page image (PDFs, earlier OCR output...).
        if not file.endswith(('.png', '.jpg')):
            continue
        # Close the image file as soon as the OCR call has consumed it.
        with Image.open(root + file) as image:
            result = pytesseract.image_to_string(image)
        # OCR output can contain non-ASCII characters; an explicit encoding
        # avoids UnicodeEncodeError on platforms whose default is not UTF-8.
        with open(root + file[:-4] + '-ocr.txt', 'w', encoding='utf-8') as f:
            f.write(result)
    streamlit.write("OCR done")


if __name__ == '__main__':
    # Ad-hoc manual run: OCR the images in the processed folder of '1.pdf'.
    sample_document = '1.pdf'
    ocr_core(sample_document)