[4126ad]: / engine.py

Download this file

119 lines (100 with data), 4.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import logging
import os
import re
import sys
import tempfile
from pathlib import Path

import numpy as np
import pytesseract
from PIL import Image
def ocr(filename, APP_ROOT):
    """Extract the text of an image and save it to a text file.

    The image is passed to Tesseract with the French language model
    (``lang='fra'``). The recognised text is written, UTF-8 encoded, to
    ``<APP_ROOT>/ocr/<stem of filename>``; the ``ocr`` directory is created
    when it does not exist yet.

    @Param
    filename : path of the image file to read
    APP_ROOT : application root directory that holds the ``ocr/`` folder
    @return
    path of the file containing the extracted text
    """
    target = os.path.join(APP_ROOT, 'ocr/')
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(target, exist_ok=True)

    # Close the image handle as soon as the text has been extracted.
    with Image.open(filename) as image:
        text = str(pytesseract.image_to_string(image, lang='fra'))

    # os.path.join does not duplicate the separator already ending `target`.
    ocr_path = os.path.join(target, Path(filename).stem)

    # Written as UTF-8 because extract_medical_data() reads the file back
    # with encoding="utf8"; the platform default may differ.
    with open(ocr_path, "w", encoding="utf8") as out_file:
        out_file.write(text)
    return ocr_path
# Line naming a patient: a civility title followed by the rest of the line.
_PATIENT_RE = re.compile(
    r"(?P<title>(?:MR|MONSIEUR|MADEMOISELLE|MLLE|MADAME|MME) (?P<name>.*?)\s*$)",
    re.IGNORECASE,
)
# Line naming a doctor: "DR" followed by the rest of the line.
_DOCTOR_RE = re.compile(r"(?P<title>(?:DR) (?P<name>.*?)\s*$)", re.IGNORECASE)
# Numeric part of a result line, from the first digit (optionally "<") onward.
_VALUES_RE = re.compile(r"<?\d\d?.+[0-9]*")
# "<value> <unit> <three-token normal range> [<previous value>]".
_MEASUREMENT_RE = re.compile(r"^(\S+) (\S+) (\S+ \S+ \S+) ?(|(\S+))$")


def extract_medical_data(textfile, APP_ROOT):
    """Extract medical data from an OCR text file.

    The comma-separated entries of
    ``<APP_ROOT>/dictionnary/medical_dict.txt`` are used as case-insensitive
    regular expressions. Every line of ``textfile`` that matches an entry and
    whose numeric part has the shape ``value unit a b c [history]`` yields
    one item. The ``<APP_ROOT>/medicaldata/`` directory is created when
    missing.

    @Param
    textfile : path of the UTF-8 text file to analyse
    APP_ROOT : application root directory holding ``dictionnary/``
    @return
    dict with ``medical_data`` (list of dicts with the keys ``code``,
    ``value``, ``unity``, ``normal`` and ``history``) plus ``patient_name``
    and ``doctor_name`` when such a line was found (first match wins);
    ``None`` when ``textfile`` cannot be opened or read.
    @raise
    OSError when the dictionary file cannot be read, ``re.error`` when a
    dictionary entry is not a valid regular expression.
    """
    logger = logging.getLogger(__name__)
    logger.debug("app root %s", APP_ROOT)

    target = os.path.join(APP_ROOT, 'medicaldata/')
    logger.debug("medical target %s", target)
    os.makedirs(target, exist_ok=True)

    pattern_labels = os.path.join(APP_ROOT, 'dictionnary/medical_dict.txt')
    with open(pattern_labels, encoding="utf8") as dict_file:
        raw_entries = dict_file.read().split(",")
    # Drop blank entries (trailing comma / newline): an empty pattern would
    # match every line. Surrounding whitespace is not part of a label.
    labels = [entry.strip() for entry in raw_entries if entry.strip()]
    # Compile once instead of on every line of the report.
    label_regexes = [(label, re.compile(label, re.I)) for label in labels]

    logger.debug("text file %s", textfile)
    data = {}
    all_medical_data = []
    try:
        with open(textfile, encoding="utf8") as f:
            for line in f:
                # Several lines may look like a patient or a doctor; only
                # the first occurrence of each is kept.
                patient_match = _PATIENT_RE.search(line)
                if patient_match and 'patient_name' not in data:
                    data['patient_name'] = patient_match.group(1).split("\n")[0]
                doctor_match = _DOCTOR_RE.search(line)
                if doctor_match and 'doctor_name' not in data:
                    data['doctor_name'] = doctor_match.group(1).split("\n")[0]

                for label, label_regex in label_regexes:
                    if not label_regex.search(line):
                        continue
                    values_text = " ".join(_VALUES_RE.findall(line))
                    item_match = _MEASUREMENT_RE.search(values_text)
                    if not item_match:
                        continue
                    all_medical_data.append({
                        'code': label,
                        'value': item_match.group(1),
                        'unity': item_match.group(2),
                        'normal': item_match.group(3),
                        # Empty string when the line has no previous value.
                        'history': item_match.group(4),
                    })
    except IOError:
        logger.error("Le fichier pas etre ouvert: %s", textfile)
        return None

    data['medical_data'] = all_medical_data
    logger.debug("extracted data %s", data)
    return data
def list_to_str(str_list):
    """Return the elements of ``str_list`` as one space-separated string.

    Each element is converted with ``str()`` first; an empty input gives
    an empty string.
    """
    return ' '.join(map(str, str_list))