EHRKit-2022 / Git / [2d4573] /wrapper_functions/multi_doc

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / wrapper_functions / multi_doc_functions.py
History
Download this file
50 lines (38 with data), 1.9 kB

from transformer_functions import get_bert_embeddings
from utils import get_sents_pyrush, get_multiple_sents_stanza, get_sents_stanza
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from utils import get_multiple_sents_stanza

def get_similar_documents(bert_model, query_note, candidate_notes, candidates, top_k=2):
    """
    Retrieve the top_k documents in candidate_notes that are most similar to query_note.

    The candidate notes are pre-processed with get_multiple_sents_stanza and the
    query note with get_sents_stanza; both are then embedded with
    get_bert_embeddings and ranked by cosine similarity to the query.

    Args:
        bert_model: model handle, passed through unchanged to get_bert_embeddings.
        query_note: the note to compare the candidates against.
        candidate_notes: sequence of candidate note texts.
        candidates: sequence of candidate ids, positionally aligned with
            candidate_notes (candidates[i] identifies candidate_notes[i]).
        top_k: maximum number of rows to return. A value larger than the number
            of candidates returns every candidate; None also returns every
            candidate. Must not be negative.

    Returns:
        A DataFrame with columns candidate_id, similarity_score and
        candidate_text, ordered from most to least similar.

    Raises:
        ValueError: if top_k is negative, or if candidates and candidate_notes
            have different lengths.
    """

    # A negative top_k would silently drop the *worst* matches via slicing
    # (e.g. [:-1]) rather than select the best ones, so reject it explicitly.
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be non-negative, got {}".format(top_k))

    # Ids and texts are selected with the same positional indices below, so a
    # length mismatch would either misalign them or fail late inside NumPy.
    # Check before doing the (expensive) embedding work.
    if len(candidates) != len(candidate_notes):
        raise ValueError(
            "candidates and candidate_notes must have the same length, got {} and {}".format(
                len(candidates), len(candidate_notes)))

    candidate_vectors = np.array(get_bert_embeddings(bert_model, get_multiple_sents_stanza(candidate_notes)))
    query_vector = get_bert_embeddings(bert_model, [get_sents_stanza(query_note)])

    # cosine_similarity returns one row per query vector; there is a single
    # query here, so row 0 holds the score of every candidate.
    similarities = cosine_similarity(query_vector, candidate_vectors)[0]

    # argsort is ascending: reverse for best-first, then keep the first top_k.
    top_args = np.argsort(similarities)[::-1][:top_k]

    output_df = pd.DataFrame({'candidate_id': np.asarray(candidates)[top_args],
                              'similarity_score': similarities[top_args],
                              'candidate_text': np.asarray(candidate_notes)[top_args]})

    return output_df

def get_clusters(bert_model, notes, k=2, random_state=None):
    """
    Perform k-means clustering on documents represented with pre-trained transformers.

    The notes are pre-processed with get_multiple_sents_stanza, embedded with
    get_bert_embeddings, and the embeddings are clustered with scikit-learn's
    KMeans.

    Args:
        bert_model: model handle, passed through unchanged to get_bert_embeddings.
        notes: sequence of note texts to cluster.
        k: number of clusters to form.
        random_state: forwarded to KMeans. The default (None) keeps the
            unseeded behaviour, where cluster ids may differ between runs;
            pass an int to make the centroid initialisation reproducible.

    Returns:
        A DataFrame with 2 columns: note (the original text) and cluster
        (the assigned cluster id), one row per note.
    """

    # represent every note as an embedding
    tokenized_texts = get_multiple_sents_stanza(notes)
    encoded_texts = get_bert_embeddings(bert_model, tokenized_texts)

    # performs k-means clustering on the embeddings
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    kmeans.fit(encoded_texts)
    labels = kmeans.labels_

    # labels are in the same order as the embeddings handed to fit()
    output_df = pd.DataFrame(list(zip(notes, labels)), columns=['note', 'cluster'])

    return output_df