from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForMultipleChoice, AutoModelForCausalLM
from transformers import pipeline
import torch
import numpy as np
from utils import get_sents_stanza, get_multiple_sents_stanza
LANG_CODE = {'Malay_written_with_Latin': '>>zlm_Latn<<', 'Mauritian_Creole': '>>mfe<<', 'Haitian': '>>hat<<',
'Papiamento': '>>pap<<', 'Asturian': '>>ast<<', 'Catalan': '>>cat<<', 'Indonesian': '>>ind<<',
'Galician': '>>glg<<', 'Walloon': '>>wln<<', 'Spanish': '>>spa<<', 'French': '>>fra<<',
'Romanian': '>>ron<<', 'Portuguese': '>>por<<', 'Italian': '>>ita<<', 'Occitan': '>>oci<<',
'Aragonese': '>>arg<<', 'Minangkabau': '>>min<<'}
def get_supported_translation_languages():
return list(LANG_CODE.keys())
def get_translation(text, model_name, target_language):
    '''
    Translate `text` into `target_language` (a key of LANG_CODE) and return the translation as a single string.
    '''
# logging
print(f'Translating medical note using {model_name}')
partial_input = '\n'.join(text.split('\n')[:5])
print(f"Input text (truncated): {partial_input}\n...")
# translation using MarianMT
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
sents = [f'{LANG_CODE[target_language]} ' + sent for sent in get_sents_stanza(text)]
translated = model.generate(**tokenizer(sents, return_tensors="pt", padding=True))
translated = ' '.join([tokenizer.decode(t, skip_special_tokens=True) for t in translated])
return translated
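# Example usage (an illustrative sketch; 'Helsinki-NLP/opus-mt-en-ROMANCE' is an assumed
# checkpoint, any MarianMT model whose target side covers the LANG_CODE prefix will do):
#   note = "The patient was discharged with a prescription for antibiotics."
#   spanish_note = get_translation(note, 'Helsinki-NLP/opus-mt-en-ROMANCE', 'Spanish')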
def get_bert_embeddings(pretrained_model, texts):
"""
texts: a list of lists of sentences, each list is made up of sentences from the same document
"""
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
model = AutoModel.from_pretrained(pretrained_model, return_dict=False, output_hidden_states=True)
output_embeddings = []
for text in texts:
# get embeddings for each sentence, compute mean to represent document
inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**inputs)
        # with return_dict=False, index 1 is the pooled output, shape (num_sentences, hidden_size)
        pooled_output = model_output[1]
        # average the sentence embeddings to get a single vector per document
        mean_embedding = torch.mean(pooled_output, dim=0)
        output_embeddings.append(mean_embedding.numpy())
return output_embeddings
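# Example usage (illustrative; 'emilyalsentzer/Bio_ClinicalBERT' is an assumed checkpoint,
# any BERT-style encoder should work). `docs` is a list of documents, each a list of sentences:
#   docs = [["Patient denies chest pain.", "Vitals are stable."],
#           ["History of type 2 diabetes.", "Metformin was continued."]]
#   doc_vectors = get_bert_embeddings('emilyalsentzer/Bio_ClinicalBERT', docs)  # one vector per document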
def get_single_summary(text, model_name="t5-small", min_length=50, max_length=200):
'''
https://huggingface.co/transformers/v3.0.2/_modules/transformers/pipelines.html#SummarizationPipeline
    :param text: input sequence, a string or a list of strings
    :param model_name: e.g. 'bart-large-cnn', 't5-small', 't5-base', 't5-large', 't5-3b', 't5-11b'
    :param min_length: minimum length of the summary
    :param max_length: maximum length of the summary
:return: summary string
'''
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    res = summarizer(text, min_length=min_length, max_length=max_length)
final_summary = []
for summary in res:
final_summary.append(summary['summary_text'])
final_summary = '\n\n'.join(final_summary)
return final_summary
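# Example usage (illustrative sketch with a made-up report):
#   report = "The patient is a 65-year-old male admitted with shortness of breath. ..."
#   summary = get_single_summary(report, model_name="t5-small", min_length=30, max_length=100)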
def get_multi_summary_joint(text, model_name="osama7/t5-summarization-multinews", min_length=50, max_length=200):
'''
    Join all the input documents into one long document, then run single-document summarization.
    :param text: input sequence, a string or a list of strings
    :param model_name: e.g. 'bart-large-cnn', 't5-small', 't5-base', 't5-large', 't5-3b', 't5-11b'; other models: https://huggingface.co/models?sort=downloads&search=summarization
    :param min_length: minimum length of the summary
    :param max_length: maximum length of the summary
:return: summary string
'''
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    # join all documents into one long input before summarizing
    if isinstance(text, str):
        text = [text]
    text = ' '.join(text)
    res = summarizer(text, min_length=min_length, max_length=max_length)
final_summary = []
for summary in res:
final_summary.append(summary['summary_text'])
final_summary = '\n\n'.join(final_summary)
return final_summary
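# Example usage (illustrative; the notes below are made up and get joined before summarization):
#   notes = ["Admission note: the patient presented with fever and productive cough. ...",
#            "Discharge note: symptoms resolved after a course of antibiotics. ..."]
#   joint_summary = get_multi_summary_joint(notes)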
def get_span_answer(question, context, model_name="sultan/BioM-ELECTRA-Large-SQuAD2"):
    # choices: `sultan/BioM-ELECTRA-Large-SQuAD2` (tested), `deepset/roberta-base-squad2`, `AnonymousSub/SciFive_MedQuAD_question_generation`
    # the result is a dictionary, e.g. {'score': 0.9190713763237, 'start': 34, 'end': 40, 'answer': 'Berlin'}; we return the answer span directly
    # for testing purposes: question='Where do I live?', context="My name is Wolfgang and I live in Berlin"
    bot = pipeline("question-answering", model=model_name)
    answer = bot(question=question, context=context)
return answer['answer']
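# Example usage (the test question/context from the comment above):
#   get_span_answer('Where do I live?', "My name is Wolfgang and I live in Berlin")  # -> 'Berlin'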
def get_question(context, model_name="AnonymousSub/SciFive_MedQuAD_question_generation"):
    # generate a question given the input context
bot = pipeline(model=model_name)
question = bot(context)
return question
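# Note: unlike get_med_question below, get_question returns the raw pipeline output
# (a list like [{'generated_text': '...'}]) rather than the question string itself, e.g.:
#   raw = get_question("Metformin is a first-line medication for type 2 diabetes.")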
def get_choice(question, candicates, model="russab0/distilbert-qa"):
    # multiple-choice QA; note there is no version fine-tuned on HEAD-QA
    # reference: https://huggingface.co/persiannlp/mbert-base-parsinlu-multiple-choice
    # (the reference card uses model_name = "persiannlp/mbert-base-parsinlu-multiple-choice")
    # we return the chosen candidate directly
    model_name = model
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name, config=config)
assert len(candicates) == 4, "you need four candidates"
choices_inputs = []
for c in candicates:
text_a = "" # empty context
text_b = question + " " + c
inputs = tokenizer(
text_a,
text_b,
add_special_tokens=True,
max_length=128,
padding="max_length",
truncation=True,
return_overflowing_tokens=True,
)
choices_inputs.append(inputs)
    input_ids = torch.LongTensor([x["input_ids"] for x in choices_inputs])
    output = model(input_ids=input_ids)
    print(question + ' choose from:')
    print(candicates)
    # return the candidate with the highest logit
    return candicates[torch.argmax(output['logits']).item()]
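# Example usage (illustrative; the question and the four required candidates are made up):
#   q = "Which organ is primarily responsible for insulin production?"
#   opts = ["Liver", "Pancreas", "Kidney", "Spleen"]
#   best = get_choice(q, opts)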
def get_med_question(context, model_name="AnonymousSub/SciFive_MedQuAD_question_generation"):
    # generate a question from the input context and return it as a string
bot = pipeline("text2text-generation", model=model_name)
question = bot(context)[0]
question = question['generated_text']
return question
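# Example usage (illustrative; the context sentence is made up):
#   q = get_med_question("Aspirin is commonly used to reduce fever and relieve mild pain.")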
def get_dialogpt():
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
# Let's chat for 5 lines
for step in range(5):
        # encode the new user input, add the eos_token, and return a PyTorch tensor
new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
# append the new user input tokens to the chat history
bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
        # generate a response while limiting the total chat history to 1000 tokens
        chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        # pretty print the last output tokens from the bot
print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))