--- a +++ b/ocr.py @@ -0,0 +1,45 @@ +import pytesseract +from PIL import Image +from pdf2image import convert_from_path +import os +from tqdm import tqdm +import streamlit + + + + +def extract_images_from_pdf(filename: str): + ''' Takes as input a pdf file (its name) and convert each page into an image''' + + root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/' + + # if the file is a pdf, convert it into images + if filename.endswith('.pdf'): + streamlit.write("Extracting images from pdf...") + convert_from_path(root+filename, output_folder=root, fmt='png', output_file=filename[:1]+'page') + streamlit.write("Extraction done") + + # if not a pdf, save a pdf copy (it will be used for highlighting later [UPDATE: higlighting does not work on images converted to pdf]) + elif filename.endswith('.png') or filename.endswith('.jpg'): + image = Image.open(root+filename) + image.convert('RGB').save(root+filename[:-4]+'.pdf') + + + +def ocr_core(filename: str): + ''' Takes as input a pdf file (its name) and run the OCR on each image created in + the previous function + ''' + + streamlit.write("Performing OCR...") + root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/' + for file in tqdm(os.listdir(root)): + if file.endswith('.png') or file.endswith('.jpg'): + result = pytesseract.image_to_string(Image.open(root+file)) + with open(root+file[:-4]+'-ocr.txt', 'w') as f: + f.write(result) + streamlit.write("OCR done") + + +if __name__ == '__main__': + ocr_core('1.pdf')