medical-record-nlp / Git / [4126ad] /engine.py

Models:
philipB/
medical-record-nlp
Downloads: 1
[4126ad]: / engine.py
History
Download this file
119 lines (100 with data), 4.8 kB

import numpy as np
from PIL import Image
import tempfile
import pytesseract
import os
import sys
import re
from pathlib import Path


"""
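    Extract text from an image with Tesseract OCR (French language data)
    
    @Param
        filename : path of the image file
        APP_ROOT : application root directory; the text is saved under <APP_ROOT>/ocr/
    
    @return
        path of the file containing the extracted text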
"""
def ocr(filename,APP_ROOT):
    """Run Tesseract OCR (French) on an image and save the text to a file.

    The text is written to ``<APP_ROOT>/ocr/<image name without extension>``.

    Args:
        filename: Path of the image file to read.
        APP_ROOT: Application root directory; the ``ocr`` output directory
            is created beneath it when missing.

    Returns:
        Path of the text file that was written.
    """
    target = os.path.join(APP_ROOT, 'ocr')
    # exist_ok avoids the isdir/mkdir race and also creates missing parents.
    os.makedirs(target, exist_ok=True)

    # Close the image handle as soon as the OCR pass is done.
    with Image.open(filename) as image:
        text = str(pytesseract.image_to_string(image, lang='fra'))

    ocr_path = os.path.join(target, Path(filename).stem)
    # Write UTF-8 explicitly: extract_medical_data reads this file back as
    # UTF-8, and the locale default encoding may not handle French accents.
    with open(ocr_path, "w", encoding="utf8") as out_file:
        out_file.write(text)
    return ocr_path

"""
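    Extract medical data from OCR text
    
    @Param
        textfile : path of the text file produced by OCR
        APP_ROOT : application root directory; the term dictionary is read from
                   <APP_ROOT>/dictionnary/medical_dict.txt
    @return
        dict with 'patient_name' and 'doctor_name' (when found) and 'medical_data',
        a list of dicts with keys 'code', 'value', 'unity', 'normal', 'history';
        None if the text file cannot be opened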
"""    
def extract_medical_data(textfile,APP_ROOT):
    """Extract patient, doctor and lab-result data from an OCR text file.

    Every line of ``textfile`` is scanned for:

    * a patient title (MR, MONSIEUR, MADEMOISELLE, MLLE, MADAME, MME) or a
      doctor title (DR) followed by a name; only the first match of each
      kind is kept;
    * any term of the comma-separated dictionary
      ``<APP_ROOT>/dictionnary/medical_dict.txt``. When a line matches a term
      and its numeric part has the shape
      ``value unit <three-token normal range> [history]``, one item is
      recorded for that term.

    Args:
        textfile: Path of the UTF-8 text file to analyse.
        APP_ROOT: Application root directory. ``<APP_ROOT>/medicaldata/`` is
            created when missing.

    Returns:
        A dict with ``'medical_data'`` (list of dicts with keys ``'code'``,
        ``'value'``, ``'unity'``, ``'normal'``, ``'history'``) plus
        ``'patient_name'`` / ``'doctor_name'`` when found, or ``None`` when
        ``textfile`` cannot be read.

    Raises:
        OSError: If the dictionary file cannot be opened.
        re.error: If a dictionary term is not a valid regular expression.
    """
    # Nothing is written to this directory here; it is presumably used by
    # callers, so its creation is kept as a side effect.
    target = os.path.join(APP_ROOT, 'medicaldata/')
    os.makedirs(target, exist_ok=True)

    pattern_labels = os.path.join(APP_ROOT, 'dictionnary/medical_dict.txt')
    with open(pattern_labels, encoding="utf8") as dict_file:
        raw_terms = dict_file.read().split(",")
    # Strip surrounding whitespace and drop empty entries: a trailing comma
    # or newline would otherwise yield a term that matches unrelated lines.
    terms = [term.strip() for term in raw_terms if term.strip()]
    # NOTE(review): terms are used as regular expressions, not literal text.
    compiled_terms = [(term, re.compile(term, re.I)) for term in terms]

    # \b keeps a title from matching inside a longer word
    # (e.g. "MME " inside "Homme ").
    patient_re = re.compile(
        r"(?P<title>\b(?:MR|MONSIEUR|MADEMOISELLE|MLLE|MADAME|MME) (?P<name>.*?)\s*$)",
        re.IGNORECASE,
    )
    doctor_re = re.compile(r"(?P<title>\b(?:DR) (?P<name>.*?)\s*$)", re.IGNORECASE)
    # From the first number on the line to the end of the line.
    value_re = re.compile(r"<?\d\d?.+[0-9]*")
    # value, unit, three-token normal range, optional history value.
    item_re = re.compile(r"^(\S+) (\S+) (\S+ \S+ \S+) ?(|(\S+))$")

    try:
        with open(textfile, encoding="utf8") as f:
            lines = f.readlines()
    except IOError:
        print("Le fichier pas etre ouvert")
        return None

    data = {}
    all_medical_data = []
    for line in lines:
        # Several lines may look like a patient/doctor name: keep the first.
        patient_match = patient_re.search(line)
        if patient_match and 'patient_name' not in data:
            # The title group includes the trailing whitespace (newline,
            # OCR form feed, ...) consumed by ``\s*$``; remove it.
            data['patient_name'] = patient_match.group('title').rstrip()
        doctor_match = doctor_re.search(line)
        if doctor_match and 'doctor_name' not in data:
            data['doctor_name'] = doctor_match.group('title').rstrip()

        # The numeric part does not depend on the term, so parse it once.
        value_match = value_re.search(line)
        if value_match is None:
            continue
        item_match = item_re.search(value_match.group(0))
        if item_match is None:
            continue

        for term, term_re in compiled_terms:
            if term_re.search(line):
                all_medical_data.append({
                    'code': term,
                    'value': item_match.group(1),
                    'unity': item_match.group(2),
                    'normal': item_match.group(3),
                    # Empty string when the line carries no history value.
                    'history': item_match.group(4),
                })

    data['medical_data'] = all_medical_data
    return data

def list_to_str(str_list):
    """Return the items of ``str_list`` as one space-separated string.

    Each item is converted with ``str`` first, so non-string items are
    accepted; an empty input gives an empty string.
    """
    pieces = map(str, str_list)
    return ' '.join(pieces)