[fb1bbb]: / backend / src / extractor.py

Download this file

33 lines (26 with data), 1.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from pdf2image import convert_from_path
import pytesseract
import util
from parser_prescription import PrescriptionParser
from parser_patient_details import PatientDetailsParser
# Point pytesseract at the Tesseract executable. This is an absolute,
# machine-specific Windows path and must be adjusted on any other host.
pytesseract.pytesseract.tesseract_cmd=r'G:\Program Files\Tesseract-OCR\tesseract.exe'
# Directory passed as ``poppler_path`` to ``convert_from_path`` in ``extract``.
# Also an absolute, machine-specific Windows path.
POPPLER_PATH = r'G:\poppler-23.07.0\Library\bin'
def extract(file_path, file_format):
    """Run OCR over a PDF and parse the recognised text into fields.

    Each page returned by ``convert_from_path`` is passed through
    ``util.preprocess_image`` and then ``pytesseract.image_to_string``; the
    per-page text is concatenated and handed to the parser selected by
    ``file_format``.

    Args:
        file_path: Path of the PDF file, forwarded to ``convert_from_path``.
        file_format: ``'prescription'`` or ``'patient_details'``; selects
            which parser is applied to the OCR text.

    Returns:
        Whatever the selected parser's ``parse()`` method returns.

    Raises:
        ValueError: If ``file_format`` is not one of the supported values.
            Raised before any PDF conversion or OCR is attempted.
    """
    # Resolve the parser first so an unsupported format fails fast instead of
    # paying for a full PDF rasterisation and OCR pass before erroring out.
    if file_format == 'prescription':
        parser_class = PrescriptionParser
    elif file_format == 'patient_details':
        parser_class = PatientDetailsParser
    else:
        # ValueError subclasses Exception, so existing ``except Exception``
        # handlers in callers keep working; the message is unchanged.
        raise ValueError(f"Invalid document format: {file_format}")

    # extract text from pdf file
    pages = convert_from_path(file_path, poppler_path=POPPLER_PATH)
    page_texts = []
    for page in pages:
        processed_image = util.preprocess_image(page)
        page_texts.append(
            pytesseract.image_to_string(processed_image, lang='eng')
        )
    # Every page's text is prefixed with a newline (including the first), so
    # the parsers receive exactly the same string layout as before.
    document_text = ''.join('\n' + text for text in page_texts)

    # extract fields from text
    return parser_class(document_text).parse()
if __name__ == "__main__":
    # Manual run when the module is executed as a script: extract one
    # prescription PDF and print the parsed result. The path is relative and
    # backslash-separated, so it resolves against the current working
    # directory.
    sample_pdf = r'backend\resources\prescription\pre_1.pdf'
    data = extract(file_path=sample_pdf, file_format='prescription')
    print(data)