|
b/scripts/utils.py
|
|
import re
import os
import pickle

import spacy
from spacy import displacy

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

STOP_WORDS = stopwords.words('english')

# Load the tokenizer from file
with open('../data/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
|
|
|
def load_data(data_dir):
    """
    Loads the preprocessed dataset from `data.npz` inside `data_dir`.

    Returns:
    - (train_sequences_padded, train_labels)
    - (val_sequences_padded, val_labels)
    - (test_sequences_padded, test_labels)
    - label_to_index (dict)
    - index_to_label (dict)
    """
    data = np.load(os.path.join(data_dir, 'data.npz'), allow_pickle=True)

    train_sequences_padded = data['train_sequences_padded']
    train_labels = data['train_labels']

    val_sequences_padded = data['val_sequences_padded']
    val_labels = data['val_labels']

    test_sequences_padded = data['test_sequences_padded']
    test_labels = data['test_labels']

    # use .item() to convert the 0-d numpy object arrays back to Python dictionaries
    label_to_index = data['label_to_index'].item()
    index_to_label = data['index_to_label'].item()

    return (train_sequences_padded, train_labels), \
           (val_sequences_padded, val_labels), \
           (test_sequences_padded, test_labels), \
           label_to_index, index_to_label
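# Example usage (a sketch; assumes data.npz was written by the preprocessing step
# into ../data):
#
#   (train_X, train_y), (val_X, val_y), (test_X, test_y), label_to_index, index_to_label = \
#       load_data('../data')
#   print(train_X.shape, len(label_to_index))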
|
|
|
def clean_word(word):
    """
    Cleans a word by removing non-alphanumeric characters and extra whitespaces,
    converting it to lowercase, and checking if it is a stopword.

    Args:
    - word (str): the word to clean

    Returns:
    - str: the cleaned word, or an empty string if it is a stopword
    """
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]', '', word)
    word = re.sub(r'\s+', ' ', word)

    # convert to lowercase
    word = word.lower()

    if word not in STOP_WORDS:
        return word

    return ''
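# For example (using the NLTK English stopword list loaded above):
#   clean_word("Hello,")  ->  "hello"
#   clean_word("The")     ->  ""        (stopword)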
|
|
|
def tokenize_text(text):
    """
    Tokenizes a text into lists of cleaned words, one list per sentence.

    Args:
    - text (str): the text to tokenize

    Returns:
    - tokens (list of list of str): the cleaned words of each sentence
    - start_end_ranges (list of list of tuples): the start and end character positions for each token
    """
    # split on whitespace and on the various Unicode hyphen/dash characters
    regex_match = r'[^\s\u200a\-\u2010-\u2015\u2212\uff0d]+'
    tokens = []
    start_end_ranges = []

    # Tokenize the sentences in the text
    sentences = nltk.sent_tokenize(text)

    start = 0
    for sentence in sentences:

        sentence_tokens = re.findall(regex_match, sentence)
        curr_sent_tokens = []
        curr_sent_ranges = []

        for word in sentence_tokens:
            word = clean_word(word)
            if word.strip():
                # locate the cleaned word in the lowercased text to recover its offsets
                idx = text.lower().find(word, start)
                if idx == -1:
                    # cleaning removed characters inside the word, so it no longer
                    # appears verbatim in the text; skip it rather than emit a bogus range
                    continue
                start = idx
                end = start + len(word)
                curr_sent_ranges.append((start, end))
                curr_sent_tokens.append(word)
                start = end
        if len(curr_sent_tokens) > 0:
            tokens.append(curr_sent_tokens)
            start_end_ranges.append(curr_sent_ranges)

    return tokens, start_end_ranges
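# Example (a sketch; exact output depends on NLTK's sentence splitter and stopword list):
#
#   tokens, ranges = tokenize_text("Patient denies chest pain. No fever.")
#   # tokens -> [['patient', 'denies', 'chest', 'pain'], ['fever']]
#   # ranges -> matching per-sentence lists of (start, end) offsets into the original text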
|
|
|
def predict(text, model, index_to_label, acronyms_to_entities, MAX_LENGTH):
    """
    Predicts named entities in a text using a trained NER model.

    Uses the module-level `tokenizer` loaded above to convert tokens into sequences.

    Args:
    - text (str): the text to predict named entities in
    - model: the trained NER model
    - index_to_label (list of str): a list mapping each index in the predicted sequence to a named entity label
    - acronyms_to_entities (dict): a dictionary mapping acronyms to their corresponding named entity labels
    - MAX_LENGTH (int): the maximum sequence length for the model

    Returns:
    - None
    """
    tokens, start_end_ranges = tokenize_text(text)

    # Flatten the per-sentence tokens and character ranges
    all_tokens = []
    all_ranges = []
    for sent_tokens, sent_ranges in zip(tokens, start_end_ranges):
        for token, (start, end) in zip(sent_tokens, sent_ranges):
            all_tokens.append(token)
            all_ranges.append((start, end))

    sequence = tokenizer.texts_to_sequences([' '.join(all_tokens)])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post')

    # Make the prediction
    prediction = model.predict(padded_sequence)

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)
    predicted_labels = [index_to_label[i] for i in predicted_labels[0]]

    # Collect (start, end, entity_type) spans for every non-'O' prediction
    entities = []
    for label, (start, end) in zip(predicted_labels, all_ranges):
        if label != 'O':
            entity_type = acronyms_to_entities[label[2:]]
            entities.append((start, end, entity_type))

    # Print the predicted named entities
    print("Predicted Named Entities:")
    for token, label in zip(all_tokens, predicted_labels):
        if label == 'O':
            print(f"{token}: {label}")
        else:
            print(f"{token}: {acronyms_to_entities[label[2:]]}")

    display_pred(text, entities)
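# Example call (a sketch; ner_model, acronyms_to_entities and MAX_LENGTH come from
# the training pipeline and are not defined in this module):
#
#   predict("Patient reports severe chest pain.",
#           ner_model, index_to_label, acronyms_to_entities, MAX_LENGTH=128)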
|
|
|
def display_pred(text, entities):
    """
    Renders the predicted entity spans over the original text with displaCy.

    Args:
    - text (str): the original text
    - entities (list of tuples): (start_char, end_char, label) spans

    Returns:
    - str: the rendered HTML markup
    """
    nlp = spacy.load("en_core_web_sm", disable=['ner'])
    # Parse the text with spaCy (its own NER is disabled so only our predictions are shown)
    doc = nlp(text)
    # Add the predicted named entities to the Doc object
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            doc.ents = list(doc.ents) + [span]

    colors = {"Activity": "#f9d5e5",
              "Administration": "#f7a399",
              "Age": "#f6c3d0",
              "Area": "#fde2e4",
              "Biological_attribute": "#d5f5e3",
              "Biological_structure": "#9ddfd3",
              "Clinical_event": "#77c5d5",
              "Color": "#a0ced9",
              "Coreference": "#e3b5a4",
              "Date": "#f1f0d2",
              "Detailed_description": "#ffb347",
              "Diagnostic_procedure": "#c5b4e3",
              "Disease_disorder": "#c4b7ea",
              "Distance": "#bde0fe",
              "Dosage": "#b9e8d8",
              "Duration": "#ffdfba",
              "Family_history": "#e6ccb2",
              "Frequency": "#e9d8a6",
              "Height": "#f2eecb",
              "History": "#e2f0cb",
              "Lab_value": "#f4b3c2",
              "Mass": "#f4c4c3",
              "Medication": "#f9d5e5",
              "Nonbiological_location": "#f7a399",
              "Occupation": "#f6c3d0",
              "Other_entity": "#d5f5e3",
              "Other_event": "#9ddfd3",
              "Outcome": "#77c5d5",
              "Personal_background": "#a0ced9",
              "Qualitative_concept": "#e3b5a4",
              "Quantitative_concept": "#f1f0d2",
              "Severity": "#ffb347",
              "Sex": "#c5b4e3",
              "Shape": "#c4b7ea",
              "Sign_symptom": "#bde0fe",
              "Subject": "#b9e8d8",
              "Texture": "#ffdfba",
              "Therapeutic_procedure": "#e6ccb2",
              "Time": "#e9d8a6",
              "Volume": "#f2eecb",
              "Weight": "#e2f0cb"}
    options = {"compact": True, "bg": "#F8F8F8",
               "ents": list(colors.keys()),
               "colors": colors}

    # Generate the HTML visualization
    html = displacy.render(doc, style="ent", options=options)
    return html
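# display_pred returns the rendered markup rather than displaying it; in a notebook
# you could instead pass jupyter=True to displacy.render. A minimal sketch for saving
# the output (the file name is arbitrary):
#
#   html = display_pred(text, entities)
#   with open('ner_prediction.html', 'w', encoding='utf-8') as f:
#       f.write(html)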
|
|
|
def predict_multi_line_text(text, model, index_to_label, acronyms_to_entities, MAX_LENGTH):
    """
    Predicts named entities in a multi-line text using a trained NER model.

    Unlike `predict`, each sentence is converted into its own padded sequence,
    so long documents are not truncated to a single MAX_LENGTH window.
    """
    sent_tokens, sent_start_end = tokenize_text(text)

    # Build one sequence per sentence
    sequences = []
    for i in range(len(sent_tokens)):
        sequence = tokenizer.texts_to_sequences([' '.join(sent_tokens[i])])
        sequences.extend(sequence)

    padded_sequence = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')

    # Make the prediction
    prediction = model.predict(padded_sequence)

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)
    predicted_labels = [
        [index_to_label[i] for i in sent_predicted_labels]
        for sent_predicted_labels in predicted_labels
    ]

    # Collect (start, end, entity_type) spans for every non-'O' prediction
    entities = []
    for sent_pred_labels, start_end_ranges in zip(predicted_labels, sent_start_end):
        for label, (start, end) in zip(sent_pred_labels, start_end_ranges):
            if label != 'O':
                entity_type = acronyms_to_entities[label[2:]]
                entities.append((start, end, entity_type))

    # Print the predicted named entities, one block per sentence
    print("Predicted Named Entities:")
    for i in range(len(sent_tokens)):
        for j in range(len(sent_tokens[i])):
            if predicted_labels[i][j] == 'O':
                print(f"{sent_tokens[i][j]}: {predicted_labels[i][j]}")
            else:
                print(f"{sent_tokens[i][j]}: {acronyms_to_entities[predicted_labels[i][j][2:]]}")
        print("\n\n\n")

    display_pred(text, entities)
    # return entities
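# Example call (a sketch; the model and label mappings come from the training
# pipeline and are not defined in this module):
#
#   report = "Patient admitted with fever.\nCT scan showed no abnormality."
#   predict_multi_line_text(report, ner_model, index_to_label,
#                           acronyms_to_entities, MAX_LENGTH=128)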