# medic_health_assistant/train_data.py

import json
import os
import numpy as np
import nltk
from nltk import LancasterStemmer, WordNetLemmatizer
import random
import pickle
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
import pandas as pd

stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()
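# nltk.word_tokenize and WordNetLemmatizer rely on the NLTK 'punkt' and
# 'wordnet' data packages; if they are missing, run nltk.download('punkt')
# and nltk.download('wordnet') once before executing this script.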
path = "dataset/medical-question-answer-data"


def load_doc(jsonFile):
    with open(jsonFile) as file:
        Json_data = json.loads(file.read())
    return Json_data
# Load the files
file1 = load_doc(os.path.abspath(os.path.join(path, "ehealthforumQAs.json")))
file2 = load_doc(os.path.abspath(os.path.join(path, "healthtapQAs.json")))
file3 = load_doc(os.path.abspath(os.path.join(path, "icliniqQAs.json")))
file4 = load_doc(os.path.abspath(os.path.join(path, "questionDoctorQAs.json")))
file5 = load_doc(os.path.abspath(os.path.join(path, "webmdQAs.json")))
file6 = load_doc(os.path.abspath(os.path.join(path, "medical_intent.json")))

# Select the files to be used for training and concatenate them
all_Files = [file1, file3, file4, file6]
words = []
labels = []
documents = []
ignore_words = ['?', '!']

for data in all_Files:
    for intent in data:
        if len(intent['tags']) == 0:
            # Questions without tags fall back to a catch-all label.
            tag = "unspecified"
        else:
            # Keep only the first tag, as it is the most relevant one.
            tag = intent['tags'][0]

        question = intent["question"]
        wrds = nltk.word_tokenize(question)

        words.extend(wrds)
        documents.append((wrds, tag))

        if tag not in labels:
            labels.append(tag)
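# At this point `documents` holds (tokenized question, tag) pairs and `words`
# holds every token seen in the selected files.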
words = [lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

labels = sorted(list(set(labels)))

print(len(documents), "documents")
print(len(labels), "labels", labels)
print(len(words), "unique lemmatized words", words)

pickle.dump(words, open('words.pkl', 'wb'))
pickle.dump(labels, open('labels.pkl', 'wb'))
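# Turn each tokenized question into a bag-of-words vector over the vocabulary,
# paired with a one-hot vector marking its label.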
training = []
out_empty = [0 for _ in range(len(labels))]

for doc in documents:
    bag = []

    pattern_words = doc[0]
    pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words]

    for w in words:
        bag.append(1 if w in pattern_words else 0)

    output_row = out_empty[:]
    output_row[labels.index(doc[1])] = 1

    training.append([bag, output_row])

random.shuffle(training)
# dtype=object is required because each row pairs two lists of different
# lengths (the bag-of-words vector and the one-hot label vector).
training = np.array(training, dtype=object)
train_x = list(training[:, 0])
train_y = list(training[:, 1])
print("Training data created")
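# A small feed-forward classifier: bag-of-words input, two hidden blocks with
# dropout, and a softmax output over the intent labels.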
model = Sequential()
model.add(Dense(64, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(train_y[0]), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

hist = model.fit(np.array(train_x), np.array(train_y), epochs=100, batch_size=5, verbose=1)

model.save('chatbot_model.hdf5')
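# ---- Inference: reload the saved model and classify new questions ----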
from tensorflow.keras.models import load_model

model = load_model("chatbot_model.hdf5")


def clean_up_sentence(sentence):
    sentence_words = nltk.word_tokenize(sentence)
    # Lemmatize exactly as during training so tokens can match the vocabulary
    # in `words`; stemming would produce forms that never match.
    sentence_words = [lemmatizer.lemmatize(word.lower()) for word in sentence_words]
    return sentence_words
def bow(sentence, words, show_details=True):
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)

    return np.array(bag)
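# classify_local maps a raw question to the intent labels whose predicted
# probability exceeds ERROR_THRESHOLD, ordered from most to least likely.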
def classify_local(sentence):
    ERROR_THRESHOLD = 0.25
    input_data = pd.DataFrame([bow(sentence, words)], dtype=float, index=['input']).to_numpy()
    results = model.predict(input_data)[0]
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((labels[r[0]], str(r[1])))

    return return_list
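# A minimal usage sketch: the question string below is only an illustrative
# example, not part of the dataset.
if __name__ == "__main__":
    print(classify_local("What can I do about a constant headache?"))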