|
a |
|
b/ocr.py |
|
|
1 |
import pytesseract |
|
|
2 |
from PIL import Image |
|
|
3 |
from pdf2image import convert_from_path |
|
|
4 |
import os |
|
|
5 |
from tqdm import tqdm |
|
|
6 |
import streamlit |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
|
|
|
10 |
|
|
|
11 |
def extract_images_from_pdf(filename: str):
    """Turn a PDF into per-page PNG images, or an image into a PDF copy.

    The input file is read from
    ``./HummingBird_prototype/processed_files/<name>/<filename>``, where
    ``<name>`` is ``filename`` with its last four characters (the extension)
    removed. All outputs are written into that same folder.

    Args:
        filename: Bare file name (not a path) ending in ``.pdf``, ``.png`` or
            ``.jpg``. Any other extension is silently ignored.
    """
    # The folder name drops the last four characters of the file name; this is
    # kept as-is so it matches the folder layout used by ocr_core.
    stem = filename[:-4]
    root = './HummingBird_prototype/processed_files/' + stem + '/'
    source_path = root + filename

    # If the file is a pdf, convert each page into a png image.
    if filename.endswith('.pdf'):
        streamlit.write("Extracting images from pdf...")
        # NOTE(review): the output prefix uses only the FIRST character of the
        # file name (filename[:1]); this may have been meant to be the name
        # without extension. Left unchanged because other code may rely on the
        # generated file names - confirm before changing.
        convert_from_path(
            source_path,
            output_folder=root,
            fmt='png',
            output_file=filename[:1] + 'page',
        )
        streamlit.write("Extraction done")

    # If not a pdf, save a pdf copy (intended for highlighting later;
    # UPDATE: highlighting does not work on images converted to pdf).
    elif filename.endswith(('.png', '.jpg')):
        # Context manager closes the underlying file handle once the copy is
        # saved instead of leaving it open until garbage collection.
        with Image.open(source_path) as image:
            image.convert('RGB').save(root + stem + '.pdf')
|
|
26 |
|
|
|
27 |
|
|
|
28 |
|
|
|
29 |
def ocr_core(filename: str):
    """Run OCR on every image found in the processed folder of ``filename``.

    The folder is ``./HummingBird_prototype/processed_files/<name>/``, where
    ``<name>`` is ``filename`` with its last four characters (the extension)
    removed. For each ``.png`` / ``.jpg`` file in that folder, the recognised
    text is written next to it as ``<image name>-ocr.txt``.

    Args:
        filename: Bare name of the original document (e.g. ``'1.pdf'``); only
            used to locate the folder that holds its page images.
    """
    streamlit.write("Performing OCR...")
    root = './HummingBird_prototype/processed_files/' + filename[:-4] + '/'

    for file in tqdm(os.listdir(root)):
        if not file.endswith(('.png', '.jpg')):
            continue

        # Close each page image as soon as it has been processed so that file
        # handles do not accumulate over documents with many pages.
        with Image.open(root + file) as page_image:
            result = pytesseract.image_to_string(page_image)

        # OCR output routinely contains non-ASCII characters (ligatures,
        # typographic quotes, accents); an explicit UTF-8 encoding avoids
        # UnicodeEncodeError on platforms whose default encoding is narrower.
        with open(root + file[:-4] + '-ocr.txt', 'w', encoding='utf-8') as f:
            f.write(result)

    streamlit.write("OCR done")
|
|
42 |
|
|
|
43 |
|
|
|
44 |
# Manual run: performs only the OCR step for the sample document '1.pdf'.
# Page extraction is not invoked here, so the page images are presumably
# expected to already exist in its processed_files folder.
if __name__ == "__main__":
    ocr_core("1.pdf")