b/medic_health_assistant/train_data.py
import json
import os
import numpy as np
import nltk
from nltk import LancasterStemmer, WordNetLemmatizer
import random
import pickle
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
import pandas as pd

stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()
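# Assumes the required NLTK corpora are already available locally; if not,
# uncomment the downloads below (word_tokenize needs 'punkt',
# WordNetLemmatizer needs 'wordnet').
# nltk.download('punkt')
# nltk.download('wordnet')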

path = "dataset/medical-question-answer-data"


def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data


# Load the files
file1 = load_doc(os.path.abspath(os.path.join(path, "ehealthforumQAs.json")))
file2 = load_doc(os.path.abspath(os.path.join(path, "healthtapQAs.json")))
file3 = load_doc(os.path.abspath(os.path.join(path, "icliniqQAs.json")))
file4 = load_doc(os.path.abspath(os.path.join(path, "questionDoctorQAs.json")))
file5 = load_doc(os.path.abspath(os.path.join(path, "webmdQAs.json")))
file6 = load_doc(os.path.abspath(os.path.join(path, "medical_intent.json")))

# Select the files to be used for training and concatenate them
all_Files = [file1, file3, file4, file6]

words = []
labels = []
documents = []
ignore_words = ['?', '!']

# Tokenize every question and pair it with its tag
for data in all_Files:
    for intent in data:
        if len(intent['tags']) == 0:
            tag = "unspecified"
        else:
            # Extract only the first tag, as it is the most relevant one
            tag = intent['tags'][0]
        question = intent["question"]
        wrds = nltk.word_tokenize(question)

        words.extend(wrds)
        documents.append((wrds, tag))

        if tag not in labels:
            labels.append(tag)

# Lemmatize, lowercase and deduplicate the vocabulary
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(set(words))

labels = sorted(set(labels))

print(len(documents), "documents")
print(len(labels), "labels", labels)
print(len(words), "unique lemmatized words", words)

pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(labels, open('labels.pkl', 'wb'))

# Build the training set: a bag-of-words vector and a one-hot label per document
training = []
out_empty = [0 for _ in range(len(labels))]

for doc in documents:
    bag = []

    pattern_words = doc[0]
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]

    # 1 if the vocabulary word occurs in the question, 0 otherwise
    for w in words:
        bag.append(1 if w in pattern_words else 0)

    output_row = out_empty[:]
    output_row[labels.index(doc[1])] = 1

    training.append([bag, output_row])

random.shuffle(training)
# dtype=object because each entry pairs two lists of different lengths
training = np.array(training, dtype=object)

train_x = list(training[:, 0])
train_y = list(training[:, 1])
print("Training data created")
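# Optional sanity check: every feature vector should span the full vocabulary
# and every target vector should have one slot per label.
assert all(len(x) == len(words) for x in train_x)
assert all(len(y) == len(labels) for y in train_y)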

# Feed-forward intent classifier: two hidden blocks with dropout, softmax output
model = Sequential()
model.add(Dense(64, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(train_y[0]), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

hist = model.fit(np.array(train_x), np.array(train_y), epochs=100, batch_size=5, verbose=1)

model.save('chatbot_model.hdf5')

# Reload the trained model for inference
from tensorflow.keras.models import load_model

model = load_model("chatbot_model.hdf5")


def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    # Lemmatize (rather than stem) so tokens match the lemmatized training vocabulary
    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words


def bow(sentence, words, show_details=True):
    # Encode a sentence as a bag-of-words vector over the training vocabulary
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)

    return np.array(bag)


def classify_local(sentence):
    ERROR_THRESHOLD = 0.25
    input_data = pd.DataFrame([bow(sentence, words)], dtype=float, index=['input']).to_numpy()
    results = model.predict(input_data)[0]
    # Keep only the labels whose predicted probability clears the threshold
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((labels[r[0]], str(r[1])))

    return return_list
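
# Example usage; the sample question is illustrative only and the returned
# intent labels depend entirely on the training data above.
if __name__ == "__main__":
    print(classify_local("What can I take for a persistent headache?"))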