In [1]:
import os
import json
import os
import numpy as np
import nltk
from nltk import LancasterStemmer, WordNetLemmatizer
import random
import pickle
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

In [2]:
# NLTK text normalisers shared by the cells below.
stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

In [3]:
# Directory containing the Q&A JSON files; relative, so it is resolved
# against the notebook's working directory.
path = "../dataset/medical-question-answer-data"

In [4]:
def load_doc(jsonFile):
    """Read a JSON document from disk and return the parsed object.

    Args:
        jsonFile: Path of the JSON file to read.

    Returns:
        The decoded JSON content, in whatever structure the file holds.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8; without it open() falls back to the
    # platform locale encoding and non-ASCII text can fail to load.
    with open(jsonFile, encoding="utf-8") as file:
        return json.load(file)

In [5]:
# Parse each dataset file found under `path`; the order of the names below
# fixes which dataset ends up in file1 .. file6.
file1, file2, file3, file4, file5, file6 = (
    load_doc(os.path.abspath(os.path.join(path, dataset_name)))
    for dataset_name in (
        "ehealthforumQAs.json",
        "healthtapQAs.json",
        "icliniqQAs.json",
        "questionDoctorQAs.json",
        "webmdQAs.json",
        "medical_intent.json",
    )
)

In [6]:
# Select the datasets used for training. They are kept as a list of
# separate datasets (not merged); file2 and file5 are loaded above but
# left out of this list.
all_Files = [file1, file3, file4, file6]

In [7]:
# Accumulators filled while scanning the selected datasets.
words = []  # every token found in the questions
labels = []  # distinct tags, in order of first appearance
documents = []  # (tokens, tag) pairs, one per question
ignore_words = ['?', '!']  # tokens excluded when the vocabulary is built

In [8]:
# Collect one (tokens, tag) training pair per question in every dataset.
for data in all_Files:
    for intent in data:
        # Only the first tag is used, as it is the most relevant one.
        # Questions without any tag are grouped under "unspecified";
        # previously that fallback was assigned but the question itself
        # was never recorded, so untagged entries were silently dropped.
        tag = intent['tags'][0] if intent['tags'] else "unspecified"

        question = intent["question"]
        wrds = nltk.word_tokenize(question)

        words.extend(wrds)
        documents.append((wrds, tag))

        if tag not in labels:
            labels.append(tag)

In [9]:
# Build the vocabulary: drop the ignored punctuation tokens, lower-case and
# lemmatize what is left, then de-duplicate and sort for a stable order.
words = sorted({lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words})

labels = sorted(set(labels))

print(len(documents), "documents")

print(len(labels), "labels", labels)

print(len(words), "unique lemmatized words", words)

# Persist vocabulary and labels; the context managers make sure the files
# are flushed and closed instead of leaking open handles.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('labels.pkl', 'wb') as labels_file:
    pickle.dump(labels, labels_file)

5942 documents
1891 labels ['"cannabis episode" that really scared me', '(heterozygous) factor v liden', '(temporomandibular joint disorders) tmj', '100% deaf in my right ear hypothyroidism pain in hands and feet', '103 fever', '13cm 9 cm', '2 endometrial biopsy-both came negative', '20 months old boy does not communicate', '2nd opinion', '35 weeks pregnant', '4 back surgeries', '5 week old baby had bronchiolitis and therapy', '5mm solid nodule in throat', '5mm stone in left kidney (oxalate stones)', '5th spinal fusion pain right now', '6 weeks of physical therapy and then got a epidural steroid injection', '6 year old was diagnosed with cellulitis', '65 year old male with cervical spine (neck) pain', "82 year old male fully conscious but can't stand up", 'a lot of bleeding every day', 'a week ago i had a medical abortion', 'abdomen', 'abdomen numbness', 'abdominal pain', 'abdominal pain and constipation and rectal bleeding has been more frequent', 'abdominal pain and the blood test re

In [10]:
# Container for the [bag, one-hot] rows, plus an all-zero template with one
# slot per label that each row copies before setting its own class.
training = []
out_empty = [0] * len(labels)

In [11]:
# Convert every (tokens, tag) document into a [bag-of-words, one-hot] row.
# Label positions are looked up once here instead of scanning `labels`
# for every document.
label_position = {label: position for position, label in enumerate(labels)}

for doc in documents:
    # Normalise the tokens exactly like the vocabulary was normalised; a set
    # keeps each membership test below constant-time.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}

    # One slot per vocabulary word: 1 if the question contains it, else 0.
    bag = [1 if w in pattern_words else 0 for w in words]

    # Copy the zero template so rows never share the same list object.
    output_row = out_empty[:]
    output_row[label_position[doc[1]]] = 1

    training.append([bag, output_row])

In [12]:
# Shuffle in place so the rows are no longer grouped by source dataset.
random.shuffle(training)
# Each row pairs a bag (one slot per vocabulary word) with a one-hot vector
# (one slot per label); the two lengths generally differ, so the rows are
# ragged. dtype=object keeps them as an (n_rows, 2) array of lists, where
# recent NumPy releases would otherwise raise ValueError.
training = np.array(training, dtype=object)

In [13]:
# Column 0 of every row holds the bag-of-words, column 1 the one-hot label.
train_x = [features for features in training[:, 0]]
train_y = [target for target in training[:, 1]]
print("Training data created")

Training data created


In [14]:
# Feed-forward classifier: bag-of-words vector in, one softmax score per
# label out, with dropout after the two inner hidden layers.
model = Sequential([
    Dense(64, input_shape=(len(train_x[0]),), activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(train_y[0]), activation='softmax'),
])

model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train on the full data set; `hist` keeps the Keras training history.
hist = model.fit(
    x=np.array(train_x),
    y=np.array(train_y),
    batch_size=5,
    epochs=100,
    verbose=1,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                335232    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1891)              122915    
Total params: 474,723
Trainable params: 474,723
Non-trainable params: 0
__________________________________________________

Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [15]:
# Write the trained model to disk so it can be reloaded without retraining.
model.save('chatbot_model.hdf5')

In [16]:
from tensorflow.keras.models import load_model

In [17]:
# Rebind `model` to the copy read back from disk; classify_local below
# predicts with this reloaded model.
model = load_model("chatbot_model.hdf5")

In [18]:
def clean_up_sentence(sentence):
    """Tokenize a sentence and normalise its tokens like the vocabulary.

    Args:
        sentence: Raw input text.

    Returns:
        List of lower-cased, lemmatized tokens in sentence order.
    """
    sentence_words = nltk.word_tokenize(sentence)
    # The vocabulary was built with the WordNet lemmatizer, so the same
    # normalisation has to be used here. Stems produced by a stemmer often
    # differ from lemmas, which made valid words miss the vocabulary.
    return [lemmatizer.lemmatize(word.lower()) for word in sentence_words]

In [19]:
def bow(sentence, words, show_details=True):
    """Encode a sentence as a binary bag-of-words vector over `words`.

    Args:
        sentence: Raw input text; it is normalised with clean_up_sentence.
        words: Vocabulary sequence; slot i of the result refers to words[i].
        show_details: When true, print a line for every vocabulary match.

    Returns:
        NumPy array with one entry per vocabulary word: 1 where the word
        occurs in the sentence, 0 elsewhere.
    """
    tokens = clean_up_sentence(sentence)
    vector = [0] * len(words)

    for token in tokens:
        for position, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            vector[position] = 1
            if show_details:
                print ("found in bag: %s" % vocab_word)

    return np.array(vector)

In [20]:
def classify_local(sentence):
    """Predict the most likely labels for a sentence.

    Uses the notebook-level `model`, `words` and `labels`.

    Args:
        sentence: Raw input text.

    Returns:
        List of (label, probability) tuples, highest probability first,
        limited to labels scoring above ERROR_THRESHOLD. The probability
        is returned as a string.
    """
    ERROR_THRESHOLD = 0.25
    # Build the one-row float batch directly with NumPy. The previous
    # pandas round-trip produced the same array but relied on `pd`, which
    # is only imported in a later cell.
    input_data = np.array([bow(sentence, words)], dtype=float)
    scores = model.predict(input_data)[0]

    # Keep confident predictions only, strongest first.
    ranked = sorted(
        ((index, score) for index, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [(labels[index], str(score)) for index, score in ranked]

In [21]:
import pandas as pd

In [22]:
# Quick check with a plain greeting.
classify_local("Hello")

found in bag: hello


[('start_conversation', '0.99797636')]

In [23]:
# Quick check with a question about the bot itself.
classify_local("Who are you?")

found in bag: who
found in bag: you


[('what_are_you', '0.993885')]

In [26]:
# Quick check with a free-text sentence about surgery.
classify_local("I am to take a surgery")

found in bag: i
found in bag: am
found in bag: to
found in bag: a
found in bag: surgery


[('i have spondyloarthritis', '0.4327701')]