--- a
+++ b/MED277_bot.py
@@ -0,0 +1,505 @@
+
+# coding: utf-8
+
+# In[59]:
+
+
+import pandas as pd
+import joblib  ## sklearn.externals.joblib is deprecated; use the standalone joblib package
+import re
+from nltk.stem.snowball import SnowballStemmer
+from collections import defaultdict
+import operator
+import numpy as np
+import sklearn.feature_extraction.text as text
+from sklearn import decomposition
+from nltk.stem import PorterStemmer, WordNetLemmatizer
+from sklearn.decomposition import PCA
+from numpy.linalg import norm
+
+
+# In[60]:
+
+
+def load_data():
+    ## Initializing data paths
+    base_path = r'D:\ORGANIZATION\UCSD_Life\Work\4. Quarter-3\Subjects\MED 277\Project\DATA\\'
+    data_file = base_path+"NOTEEVENTS.csv.gz"
+
+    ## Loading data frames from CSV file
+    #df = pd.read_csv(data_file, compression='gzip')
+    #df = df[:10000]
+    #joblib.dump(df,base_path+'data10.pkl')
+
+    ## Loading data frames from PKL memory
+    df1 = joblib.load(base_path+'data10.pkl')
+    df = df1[:50]
+
+    ## Filtering dataframe for "Discharge summaries" and "TEXT"
+    df = df.loc[df['CATEGORY'] == 'Discharge summary'] #Extracting only discharge summaries
+    df_text = df['TEXT']
+    df_text = df_text.reset_index(drop=True) ## Reset index so positional lookups like df_text[i] still work after filtering
+    return df_text
+
+
+# ## EXTRACT ALL THE TOPICS
+
+# In[61]:
+
+
+'''Method that processes the entire document string'''
+def process_text(txt):
+    txt1 = re.sub('[\n]'," ",txt)
+    txt1 = re.sub('[^A-Za-z \.]+', '', txt1)
+
+    return txt1
+
+
+# In[62]:
+
+
+'''Method that processes the document string without considering separate lines'''
+def process(txt):
+    txt1 = re.sub('[\n]'," ",txt)
+    txt1 = re.sub('[^A-Za-z ]+', '', txt1)
+
+    _wrds = txt1.split()
+    stemmer = SnowballStemmer("english") ## May use Porter stemmer
+    wrds = [stemmer.stem(wrd) for wrd in _wrds]
+    return wrds
+
+
+# In[63]:
+
+
+'''Method that processes a raw string and returns a processed list of its sentences'''
+def get_processed_sentences(snt_txt):
+    snt_list = []
+    for line in snt_txt.split('.'):
+        line = line.strip()
+        if len(line.split()) >= 5:
+            snt_list.append(line)
+    return snt_list
+
+
+# In[64]:
+
+
+'''This method extracts topics from a sentence'''
+def extract_topic(str_arg, num_topics = 1, num_top_words = 3):
+    vectorizer = text.CountVectorizer(input='content', analyzer='word', lowercase=True, stop_words='english')
+    try:
+        dtm = vectorizer.fit_transform(str_arg.split())
+        vocab = np.array(vectorizer.get_feature_names_out()) ## get_feature_names() was removed in newer scikit-learn
+
+        #clf = decomposition.NMF(n_components=num_topics, random_state=1) ## topic extraction
+        clf = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online')
+        clf.fit_transform(dtm)
+
+        topic_words = []
+        for topic in clf.components_:
+            word_idx = np.argsort(topic)[::-1][0:num_top_words] ##[::-1] reverses the list
+            topic_words.append([vocab[i] for i in word_idx])
+        return topic_words
+    except Exception:
+        return None
+
+
+# In[65]:
+
+
+'''This method extracts the topics of each sentence and returns them as a flat list'''
+def extract_topics_all(doc_string):
+    #One entry per sentence in the list
+    doc_str = process_text(doc_string)
+    doc_str = get_processed_sentences(doc_str)
+
+    res = []
+    for i in range(0, len(doc_str)):
+        snd_str = doc_str[i].lower()
+        #print("Sending ----------------------------",snd_str,"==========",len(snd_str))
+        tmp_topic = extract_topic(snd_str, num_topics = 2, num_top_words = 1)
+        if tmp_topic is None: ## extract_topic returns None when topic extraction fails
+            continue
+        for top in tmp_topic:
+            for wrd in top:
+                res.append(wrd)
+    return res
+
+
+# In[66]:
+
+
+'''This function takes a dataframe and returns all the topics in the entire corpus'''
+def extract_corpus_topics(arg_df):
+    all_topics = set()
+    cnt = 1
+    for txt in arg_df:
+        all_topics = all_topics.union(extract_topics_all(txt))
+        print("Processed ",cnt," records")
+        cnt += 1
+    all_topics = list(all_topics)
+    return all_topics
+
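+
+# --- Illustrative sketch (editor's addition, not part of the original notebook) ---
+# Shows what the topic-extraction helpers above do on a tiny made-up note. The _demo_
+# name is hypothetical and the function is never called by the bot; the exact words
+# returned vary from run to run because LDA is stochastic.
+def _demo_extract_topics():
+    sample_note = ("Patient admitted with severe chest pain. "
+                   "Patient was given aspirin for the chest pain.")
+    ## Typically returns a few frequent content words, e.g. something like
+    ## ['chest', 'pain', 'patient', 'aspirin'] (order and exact picks may differ)
+    return extract_topics_all(sample_note)
+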
+
+# ## GET A VECTORIZED REPRESENTATION OF ALL THE TOPICS
+
+# In[67]:
+
+
+'''data_set = words list per document.
+   vocabulary = list of all the words present
+   _vocab = dict of word counts for words in vocabulary'''
+def get_vocab_wrd_map(df_text):
+    data_set = []
+    vocabulary = []
+    _vocab = defaultdict(int)
+    for i in range(0,df_text.size):
+        txt = process(df_text[i])
+        data_set.append(txt)
+
+        for wrd in txt:
+            _vocab[wrd] += 1
+
+        vocabulary = vocabulary + txt
+        vocabulary = list(set(vocabulary))
+
+        if(i%100 == 0):
+            print("%5d records processed"%(i))
+    return data_set, vocabulary, _vocab
+
+
+# In[68]:
+
+
+'''Return the num_arg most common words in the vocabulary, sorted by count'''
+def get_common_vocab(num_arg, vocab):
+    vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
+    vocab = vocab[:num_arg]
+    return vocab
+
+
+# In[69]:
+
+
+'''Convert vocabulary and most common words to maps for faster access'''
+def get_vocab_map(vocabulary, vocab):
+    vocab_map = {}
+    for i in range(0,len(vocab)):
+        vocab_map[vocab[i][0]] = i
+
+    vocabulary_map = {}
+    for i in range(0,len(vocabulary)):
+        vocabulary_map[vocabulary[i]] = i
+
+    return vocabulary_map, vocab_map
+
+
+# In[70]:
+
+
+def get_embedding(word, data_set, vocab_map, wdw_size):
+    embedding = [0]*len(vocab_map)
+    for docs in data_set:
+        for i in range(wdw_size, len(docs)-wdw_size):
+            if docs[i] == word:
+                for j in range(i-wdw_size, i): ## words in the left context window
+                    if docs[j] in vocab_map:
+                        embedding[vocab_map[docs[j]]] += 1
+                for j in range(i+1, i+wdw_size+1): ## words in the right context window
+                    if docs[j] in vocab_map:
+                        embedding[vocab_map[docs[j]]] += 1
+    total_words = sum(embedding)
+    if total_words != 0:
+        embedding[:] = [e/total_words for e in embedding]
+    return embedding
+
+
+# In[71]:
+
+
+def get_embedding_all(all_topics, data_set, vocab_map, wdw_size):
+    embeddings = []
+    for i in range(0, len(all_topics)):
+        embeddings.append(get_embedding(all_topics[i], data_set, vocab_map, wdw_size))
+    return embeddings
+
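+
+# --- Illustrative sketch (editor's addition, not part of the original notebook) ---
+# A tiny worked example of the window-count embedding computed by get_embedding.
+# The _demo_ names are hypothetical and the function is never called by the bot.
+def _demo_get_embedding():
+    demo_docs = [['chest', 'pain', 'severe', 'chest', 'pressure']]
+    demo_vocab_map = {'pain': 0, 'pressure': 1, 'severe': 2}
+    ## With wdw_size=1 only the 'chest' at position 3 falls inside the valid range;
+    ## its neighbours are 'severe' (left) and 'pressure' (right), so after
+    ## normalisation this should return [0.0, 0.5, 0.5].
+    return get_embedding('chest', demo_docs, demo_vocab_map, 1)
+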
+
+# ## Get similarity function
+
+# In[72]:
+
+
+def cos_matrix_multiplication(matrix, vector):
+    """
+    Calculating pairwise cosine similarity using matrix-vector multiplication.
+    """
+    dotted = matrix.dot(vector)
+    matrix_norms = np.linalg.norm(matrix, axis=1)
+    vector_norm = np.linalg.norm(vector)
+    matrix_vector_norms = np.multiply(matrix_norms, vector_norm)
+    neighbors = np.divide(dotted, matrix_vector_norms)
+    return neighbors
+
+
+# In[73]:
+
+
+def get_most_similar_topics(embd, embeddings, all_topics, num_wrd=10):
+    sim_top = []
+    cos_sim = cos_matrix_multiplication(np.array(embeddings), embd)
+    #closest_match = cos_sim.argsort()[-num_wrd:][::-1] ## This sorts all matches in order
+
+    ## Keep only matches with cosine similarity above 0.9, ordered from most to least similar
+    idx = list(np.where(cos_sim > 0.9)[0])
+    val = list(cos_sim[np.where(cos_sim > 0.9)])
+    pairs = sorted(zip(idx, val), key=operator.itemgetter(1), reverse=True)
+    closest_match = np.array([i for i, _ in pairs], dtype=int)
+
+    for i in range(0, closest_match.shape[0]):
+        sim_top.append(all_topics[closest_match[i]])
+    return sim_top
+
+
+# ## Topic Modelling
+
+# In[74]:
+
+
+def get_regex_match(regex, str_arg):
+    srch = re.search(regex,str_arg)
+    if srch is not None:
+        return srch.group(0).strip()
+    else:
+        return "Not found"
+
+
+# In[75]:
+
+
+def extract(key,str_arg):
+    if key == 'dob':
+        return get_regex_match('Date of Birth:(.*)] ', str_arg)
+    elif key == 'a_date':
+        return get_regex_match('Admission Date:(.*)] ', str_arg)
+    elif key == 'd_date':
+        return get_regex_match('Discharge Date:(.*)]\n', str_arg)
+    elif key == 'sex':
+        return get_regex_match('Sex:(.*)\n', str_arg)
+    elif key == 'service':
+        return get_regex_match('Service:(.*)\n', str_arg)
+    elif key == 'allergy':
+        return get_regex_match('Allergies:(.*)\n(.*)\n', str_arg)
+    elif key == 'attdng':
+        return get_regex_match('Attending:(.*)]\n', str_arg)
+    else:
+        return "I don't know"
+
+
+# In[76]:
+
+
+'''This method extracts topics from a sentence.
+   Note: this cell redefines extract_topic from In[64]; the exception guard from that cell
+   is retained so sentence-level calls do not crash on sentences with an empty vocabulary.'''
+def extract_topic(str_arg, num_topics = 1, num_top_words = 3):
+    vectorizer = text.CountVectorizer(input='content', analyzer='word', lowercase=True, stop_words='english')
+    try:
+        dtm = vectorizer.fit_transform(str_arg.split())
+        vocab = np.array(vectorizer.get_feature_names_out())
+
+        #clf = decomposition.NMF(n_components=num_topics, random_state=1) ## topic extraction
+        clf = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online')
+        clf.fit_transform(dtm)
+
+        topic_words = []
+        for topic in clf.components_:
+            word_idx = np.argsort(topic)[::-1][0:num_top_words] ##[::-1] reverses the list
+            topic_words.append([vocab[i] for i in word_idx])
+        return topic_words
+    except Exception:
+        return None
+
+
+# In[77]:
+
+
+'''This method extracts the topics in a question'''
+def extract_Q_topic(str_arg):
+    try:
+        return extract_topic(str_arg)
+    except Exception:
+        return None
+    ## TODO fix later for more comprehensive results
+
+
+# In[78]:
+
+
+def get_extract_map(key_wrd):
+    ## A stemmed mapping for simple extractions
+    extract_map = {'birth':'dob', 'dob':'dob',
+                   'admiss':'a_date', 'discharg':'d_date',
+                   'sex':'sex', 'gender':'sex', 'servic':'service',
+                   'allergi':'allergy', 'attend':'attdng'}
+    if key_wrd in extract_map.keys():
+        return extract_map[key_wrd]
+    else:
+        return None
+
+
+# In[79]:
+
+
+'''Method that generates the answer for text extraction questions'''
+def get_extracted_answer(topic_str, text):
+    port = PorterStemmer()
+    for i in range(0, len(topic_str)):
+        rel_wrd = topic_str[i]
+        for wrd in rel_wrd:
+            key = get_extract_map(port.stem(wrd))
+            if key is not None:
+                return extract(key, text)
+    return None
+
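+
+# --- Illustrative sketch (editor's addition, not part of the original notebook) ---
+# Shows what the regex-based extraction above returns on a made-up, MIMIC-style
+# de-identified header; _DEMO_HEADER is hypothetical text and is not used by the bot.
+_DEMO_HEADER = ("Admission Date:  [**2151-7-16**]       Discharge Date:   [**2151-8-4**]\n"
+                "Sex:   M\n"
+                "Service: MEDICINE\n")
+## extract('sex', _DEMO_HEADER)     -> "Sex:   M"          (group(0), stripped)
+## extract('service', _DEMO_HEADER) -> "Service: MEDICINE"
+## extract('a_date', _DEMO_HEADER)  -> the matched span up to the '] ' that follows the
+##                                     admission date, label included
+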
+
+# In[80]:
+
+
+'''This method extracts the topics of each sentence and returns a topic -> sentences map'''
+def get_topic_mapping(doc_string):
+    #One entry per sentence in the list
+    doc_str = process_text(doc_string)
+    doc_str = get_processed_sentences(doc_str)
+
+    res = defaultdict(list)
+    for i in range(0, len(doc_str)):
+        snd_str = doc_str[i].lower()
+        #print("Sending ----------------------------",snd_str,"==========",len(snd_str))
+        tmp_topic = extract_topic(snd_str, num_topics = 2, num_top_words = 1)
+        if tmp_topic is None: ## extract_topic returns None when topic extraction fails
+            continue
+        for top in tmp_topic:
+            for wrd in top:
+                res[wrd].append(doc_str[i])
+    return res
+
+
+# In[81]:
+
+
+def get_direct_answer(topic_str, topic_map):
+    ## Maybe apply lemmatizer here
+    for i in range(0, len(topic_str)):
+        rel_wrd = topic_str[i]
+        for wrd in rel_wrd:
+            if wrd in topic_map.keys():
+                return topic_map[wrd]
+    return None
+
+
+# In[82]:
+
+
+def get_answer(topic_str, topic_map, embedding_short, all_topics, data_set, vocab_map, pca, wdw_size=5):
+    ## extract_Q_topic returns a list of topics, each a list of words, so iterate the words
+    for rel_wrd in topic_str:
+        for wrd in rel_wrd:
+            ## Get the topics most similar to this question word
+            tpc_embedding = get_embedding(wrd, data_set, vocab_map, wdw_size)
+            tpc_embedding = pca.transform([tpc_embedding])
+            sim_topics = get_most_similar_topics(tpc_embedding[0], embedding_short, all_topics, num_wrd = len(all_topics))
+            for topic in sim_topics:
+                if topic in topic_map.keys():
+                    return topic_map[topic]
+    return None
+
+
+# In[83]:
+
+
+'''This function checks whether the user input is one of the chatbot's built-in instructions'''
+def is_instruction_option(str_arg):
+    if str_arg == "exit" or str_arg == "summary" or str_arg == "reveal":
+        return True
+    else:
+        return False
+
+def print_bot():
+    print(r" _ _ _")
+    print(r" | o o |")
+    print(r" \| = |/")
+    print(r" -------")
+    print(r" |||||||")
+    print(r" // \\")
+
+def print_caption():
+    print(r" ||\\ || || ||= =||")
+    print(r" || \\ || || ||= =||")
+    print(r" || \\ || || ||")
+    print(r" || \\|| ||_ _ _ ||")
+
+
+# In[ ]:
+
+
+if __name__ == "__main__":
+    print("Loading data ...","\n")
+    df_text = load_data()
+
+    print("Getting Vocabulary ...")
+    data_set, vocabulary, _vocab = get_vocab_wrd_map(df_text)
+
+    print("Creating context ...")
+    vocab = get_common_vocab(1000, _vocab)
+    vocabulary_map, vocab_map = get_vocab_map(vocabulary, vocab)
+
+    print("Learning topics ...")
+    all_topics = extract_corpus_topics(df_text)
+
+    print("Getting Embeddings")
+    embeddings = get_embedding_all(all_topics, data_set, vocab_map, 5)
+
+    pca = PCA(n_components=10)
+    embedding_short = pca.fit_transform(embeddings)
+
+    print_caption()
+    print_bot()
+    print("Bot:> I am online!")
+    print("Bot:> Type \"exit\" to end a patient's session")
+    print("Bot:> Type \"summary\" to view the patient's discharge summary")
+    while(True):
+        while(True):
+            try:
+                pid = int(input("Bot:> What is your Patient Id [0 to "+str(df_text.shape[0]-1)+"]? "))
+            except ValueError:
+                continue
+            if pid < 0 or pid > df_text.shape[0]-1:
+                print("Bot:> Patient Id out of range!")
+                continue
+            else:
+                print("Bot:> Reading Discharge Summary for Patient Id: ",pid)
+                break
+
+        personal_topics = extract_topics_all(df_text[pid])
+        topic_mapping = get_topic_mapping(df_text[pid])
+
+        ques = "random starter"
+        while(ques != "exit"):
+            ## Read Question
+            ques = input("Bot:> How can I help ?\nPerson:>")
+
+            ## Check if it is an instructional question
+            if is_instruction_option(ques):
+                if ques == "summary":
+                    print("Bot:> ================= Discharge Summary for Patient Id ",pid,"\n")
+                    print(df_text[pid])
+                elif ques == "reveal":
+                    print(topic_mapping, topic_mapping.keys())
+                continue
+
+            ## Extract Question topic
+            topic_q = extract_Q_topic(ques)
+            if topic_q is None:
+                print("Bot:> I am a specialized NLP bot, please ask a more specific question!")
+                continue
+            ans = get_extracted_answer(topic_q, df_text[pid])
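+            ## Fallback cascade: if the regex extractors above found nothing, try a
+            ## direct topic -> sentence lookup, then the embedding-similarity match below.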
+            if ans is not None:
+                print("Bot:> ",ans)
+            else:
+                ans = get_direct_answer(topic_q, topic_mapping)
+                if ans is not None:
+                    print("Bot:> ",ans)
+                else:
+                    ans = get_answer(topic_q, topic_mapping, embedding_short, all_topics, data_set, vocab_map, pca, 5)
+                    if ans is not None:
+                        print("Bot:> ",ans)
+                    else:
+                        print("Bot:> Sorry, but I have no information on this topic!")
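+
+
+# --- Illustrative example session (editor's addition, not part of the original notebook) ---
+# Exact prompts and answers depend on the loaded discharge summaries and on the
+# stochastic LDA topics; this only sketches the intended interaction flow.
+#   Bot:> What is your Patient Id [0 to N]? 3
+#   Bot:> Reading Discharge Summary for Patient Id:  3
+#   Bot:> How can I help ?
+#   Person:> what is the patient's sex
+#   Bot:> Sex:  M                         (regex extraction from the note header)
+#   Person:> tell me about the medications
+#   Bot:> [sentences from the summary whose learned topic best matches the question]
+#   Person:> exit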