EHRKit-2022 / Git / [2d4573] /wrapper_functions/summarization

Models:
philipB/
EHRKit-2022
Downloads: 1
[2d4573]: / wrapper_functions / summarization_functions.py
History
Download this file
72 lines (61 with data), 3.4 kB

from transformers import pipeline
from summa.summarizer import summarize

def get_single_summary(text, model_name="t5-small", min_length=50, max_length=200):
    '''
    Summarize the input with a Hugging Face "summarization" pipeline.

    Pipeline reference:
    https://huggingface.co/transformers/v3.0.2/_modules/transformers/pipelines.html#SummarizationPipeline

    :param text: input sequence, a string or a list of string
    :param model_name: name of the model; it is also used as the tokenizer name.
        Examples: "bart-large-cnn", "t5-small", "t5-base", "t5-large", "t5-3b",
        "t5-11b" and "razent/SciFive-large-Pubmed_PMC"; other models:
        https://huggingface.co/models?sort=downloads&search=summarization
    :param min_length: min length in summary (forwarded to the pipeline call)
    :param max_length: max length in summary (forwarded to the pipeline call)
    :return: a list of string, the 'summary_text' of every pipeline result
    '''
    # The pipeline is built on every call, with the model and its tokenizer
    # both resolved from the same name.
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    outputs = summarizer(text, min_length=min_length, max_length=max_length)
    # Each result is a dict; only its 'summary_text' entry is returned.
    return [item['summary_text'] for item in outputs]


def get_multi_summary_joint(text, model_name="osama7/t5-summarization-multinews", min_length=50, max_length=200):
    '''
    Join all the input documents as a long document, then do single document summarization

    :param text: input sequence, a string or a list of string. A list is joined
        with single spaces into one document; a plain string is used as-is.
    :param model_name: name of the model; it is also used as the tokenizer name.
        Examples: "bart-large-cnn", "t5-small", "t5-base", "t5-large", "t5-3b",
        "t5-11b" and "razent/SciFive-large-Pubmed_PMC"; other models:
        https://huggingface.co/models?sort=downloads&search=summarization
    :param min_length: min length in summary (forwarded to the pipeline call)
    :param max_length: max length in summary (forwarded to the pipeline call)
    :return: a list of string, the 'summary_text' of every pipeline result
    '''
    summarizer = pipeline("summarization", model=model_name, tokenizer=model_name)
    # A str is itself iterable, so ' '.join(text) on a single string would put
    # a space between every character. Only join real collections of documents.
    if not isinstance(text, str):
        text = ' '.join(text)
    outputs = summarizer(text, min_length=min_length, max_length=max_length)
    # Each result is a dict; only its 'summary_text' entry is returned.
    return [item['summary_text'] for item in outputs]



def get_multi_summary_extractive_textRank(text,ratio=-0.1,words=0):
    '''
    Textrank method for multi-doc summarization

    The documents are joined with single spaces into one text, which is then
    passed to summa's ``summarize``.

    :param text: a list of string (a single string is also accepted and used as-is)
    :param ratio: the ratio of summary (0-1.0); only used when it is > 0 and
        ``words`` is not > 0. The default (-0.1) means "not set".
    :param words: the number of words of summary; takes precedence over
        ``ratio`` when > 0. The default (0) means "not set".
        When neither ``ratio`` nor ``words`` is positive, a 50-word limit is used.
    :return: a string as the final summary, with newlines replaced by spaces

    Example for testing:
    >>text1 = ('Automatic summarization is the process of reducing a text document with a '
               'computer program in order to create a summary that retains the most important points '
               'of the original document. As the problem of information overload has grown, and as '
               'the quantity of data has increased, so has interest in automatic summarization.')
    >>text2 = ('Technologies that can make a coherent summary take into account variables such as '
               'length, writing style and syntax. An example of the use of summarization technology '
               'is search engines such as Google. Document summarization is another.')
    >>print(get_multi_summary_extractive_textRank([text1,text2]))
    '''
    # A str is itself iterable, so ' '.join(text) on a single string would put
    # a space between every character. Only join real collections of documents.
    if not isinstance(text, str):
        text = ' '.join(text)

    # Run the summarizer exactly once. ``words`` wins over ``ratio`` when both
    # are positive, and 50 words is the fallback when neither is set.
    if words > 0:
        summ = summarize(text, words=words)
    elif ratio > 0:
        summ = summarize(text, ratio=ratio)
    else:
        summ = summarize(text, words=50)

    return summ.replace('\n',' ')