--- a +++ b/wrapper_functions/summarization_functions.py @@ -0,0 +1,71 @@ +from transformers import pipeline +from summa.summarizer import summarize + +def get_single_summary(text, model_name="t5-small", min_length=50, max_length=200): + ''' + https://huggingface.co/transformers/v3.0.2/_modules/transformers/pipelines.html#SummarizationPipeline + :param text: input sequence, a string or a list of string + :param model_name: model_name: "bart-large-cnn", "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b" and "razent/SciFive-large-Pubmed_PMC", other models https://huggingface.co/models?sort=downloads&search=summarization + :param min_length: min length in summary + :param max_length: max length in summary + :return: a list of string + ''' + # choices: '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`' + classifier = pipeline("summarization",model=model_name,tokenizer=model_name) + res = classifier(text,min_length=min_length,max_length=max_length) + final_summary = [] + for summary in res: + final_summary.append(summary['summary_text']) + return final_summary + + +def get_multi_summary_joint(text, model_name="osama7/t5-summarization-multinews", min_length=50, max_length=200): + ''' + Join all the input documents as a long document, then do single document summarization + :param text: input sequence, a string or a list of string + :param model_name: model_name: "bart-large-cnn", "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b" and "razent/SciFive-large-Pubmed_PMC", other models https://huggingface.co/models?sort=downloads&search=summarization + :param min_length: min length in summary + :param max_length: max length in summary + :return: a list of string + ''' + # choices: '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`' + classifier = pipeline("summarization", model=model_name, tokenizer=model_name) + text = ' '.join(text) + res = classifier(text, min_length=min_length, max_length=max_length) + final_summary = [] + for summary in res: + final_summary.append(summary['summary_text']) + return final_summary + + + +def get_multi_summary_extractive_textRank(text,ratio=-0.1,words=0): + ''' + Textrank method for multi-doc summarization + :param text: a list of string + :param ratio: the ratio of summary (0-1.0) + :param words: the number of words of summary, default is 50 + :return: a string as the final summary + + Example for testing: + >>text1 = 'Automatic summarization is the process of reducing a text document with a \ + computer program in order to create a summary that retains the most important points \ + of the original document. As the problem of information overload has grown, and as \ + the quantity of data has increased, so has interest in automatic summarization.' + >>text2 = 'Technologies that can make a coherent summary take into account variables such as \ + length, writing style and syntax. An example of the use of summarization technology \ + is search engines such as Google. Document summarization is another.' + >>print(get_multi_summary_extractive_textRank([text1,text2])) + ''' + text = ' '.join(text) + summ = None + if ratio>0: + summ = summarize(text, ratio=ratio) + + if words>0: + summ = summarize(text, words=words) + + if summ is None: + summ = summarize(text, words=50) + + return summ.replace('\n',' ')