Diff of /ocr.py [000000] .. [9063a2]

Switch to unified view

a b/ocr.py
1
import pytesseract
2
from PIL import Image
3
from pdf2image import convert_from_path
4
import os 
5
from tqdm import tqdm
6
import streamlit
7
8
9
10
11
def extract_images_from_pdf(filename: str):
    """Convert a PDF into PNG images, or save a PDF copy of an image.

    The input file is read from
    ``./HummingBird_prototype/processed_files/<name>/`` where ``<name>`` is
    ``filename`` minus its last four characters (the extension); every output
    is written to that same folder.

    Args:
        filename: Name (not path) of a ``.pdf``, ``.png`` or ``.jpg`` file.
            Files with any other extension are silently ignored.
    """
    # Dropping four characters removes the '.pdf' / '.png' / '.jpg' suffix.
    stem = filename[:-4]
    root = './HummingBird_prototype/processed_files/' + stem + '/'

    # if the file is a pdf, convert it into images
    if filename.endswith('.pdf'):
        streamlit.write("Extracting images from pdf...")
        # NOTE(review): filename[:1] keeps only the FIRST character of the
        # name as the output prefix; if the whole stem was intended this
        # should be filename[:-4] -- confirm before changing, since existing
        # output names depend on it.
        convert_from_path(
            root + filename,
            output_folder=root,
            fmt='png',
            output_file=filename[:1] + 'page',
        )
        streamlit.write("Extraction done")

    # if not a pdf, save a pdf copy (it was meant for highlighting later;
    # UPDATE: highlighting does not work on images converted to pdf)
    elif filename.endswith(('.png', '.jpg')):
        # The context manager closes the underlying file handle, which the
        # bare Image.open() call left open.
        with Image.open(root + filename) as image:
            image.convert('RGB').save(root + stem + '.pdf')
26
27
28
29
def ocr_core(filename: str):
    """Run Tesseract OCR on every image in the processed folder of a file.

    The folder is ``./HummingBird_prototype/processed_files/<name>/`` where
    ``<name>`` is ``filename`` minus its last four characters. For each
    ``.png`` or ``.jpg`` found there, the recognised text is written next to
    the image as ``<image name without extension>-ocr.txt``.

    Args:
        filename: Name of the original document (e.g. ``'1.pdf'``); it is
            used only to locate the folder.
    """
    streamlit.write("Performing OCR...")
    root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/'

    for file in tqdm(os.listdir(root)):
        if not file.endswith(('.png', '.jpg')):
            continue

        # Close each image as soon as it has been recognised; otherwise one
        # file handle per page stays open until garbage collection.
        with Image.open(root + file) as image:
            result = pytesseract.image_to_string(image)

        # OCR output is Unicode text; writing it with the platform default
        # encoding can raise UnicodeEncodeError (e.g. cp1252 on Windows).
        with open(root + file[:-4] + '-ocr.txt', 'w', encoding='utf-8') as f:
            f.write(result)

    streamlit.write("OCR done")
42
43
44
# Script entry point: run OCR over the processed folder of the sample
# document '1.pdf'. Image extraction is not triggered from here.
if __name__ == "__main__":
    ocr_core("1.pdf")