#!/usr/bin/env python3
"""
question_and_answer: Tool for performing Q&A on PDF documents using retrieval augmented generation.
This module provides functionality to extract text from PDF binary data or load a PDF
from a URL, split the text into chunks, retrieve relevant segments via a vector store,
and generate an answer to a user-provided question using a language model chain.
"""
import io
import logging
from typing import Annotated, Dict, Any, List
from PyPDF2 import PdfReader
from pydantic import BaseModel, Field
import hydra
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.messages import ToolMessage
from langchain_core.tools import tool
from langchain_core.tools.base import InjectedToolCallId
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Annoy
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langgraph.types import Command
from langgraph.prebuilt import InjectedState
# Set up logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Load configuration using Hydra.
with hydra.initialize(version_base=None, config_path="../../configs"):
cfg = hydra.compose(
config_name="config", overrides=["tools/question_and_answer=default"]
)
cfg = cfg.tools.question_and_answer
logger.info("Loaded Question and Answer tool configuration.")
class QuestionAndAnswerInput(BaseModel):
"""
Input schema for the PDF Question and Answer tool.
Attributes:
question (str): The question to ask regarding the PDF content.
        tool_call_id (str): Unique identifier for the tool call, injected automatically.
        state (dict): The current agent state, injected automatically.
    """
question: str = Field(description="The question to ask regarding the PDF content.")
tool_call_id: Annotated[str, InjectedToolCallId]
state: Annotated[dict, InjectedState]
def extract_text_from_pdf_data(pdf_bytes: bytes) -> str:
"""
Extract text content from PDF binary data.
This function uses PyPDF2 to read the provided PDF bytes and concatenates the text
extracted from each page.
Args:
pdf_bytes (bytes): The binary data of the PDF document.
Returns:
str: The complete text extracted from the PDF.
"""
reader = PdfReader(io.BytesIO(pdf_bytes))
text = ""
for page in reader.pages:
page_text = page.extract_text() or ""
text += page_text
return text
def generate_answer(
question: str, pdf_bytes: bytes, llm_model: BaseChatModel
) -> Dict[str, Any]:
"""
Generate an answer for a question using retrieval augmented generation on PDF content.
This function extracts text from the PDF data, splits the text into manageable chunks,
performs a similarity search to retrieve the most relevant segments, and then uses a
question-answering chain (built using the provided llm_model) to generate an answer.
Args:
question (str): The question to be answered.
pdf_bytes (bytes): The binary content of the PDF document.
llm_model (BaseChatModel): The language model instance to use for answering.
Returns:
Dict[str, Any]: A dictionary containing the answer generated by the language model.
"""
text = extract_text_from_pdf_data(pdf_bytes)
logger.info("Extracted text from PDF.")
text_splitter = CharacterTextSplitter(
separator="\n", chunk_size=cfg.chunk_size, chunk_overlap=cfg.chunk_overlap
)
chunks = text_splitter.split_text(text)
documents: List[Document] = [Document(page_content=chunk) for chunk in chunks]
logger.info("Split PDF text into %d chunks.", len(documents))
embeddings = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
vector_store = Annoy.from_documents(documents, embeddings)
search_results = vector_store.similarity_search(question, k=cfg.num_retrievals)
logger.info("Retrieved %d relevant document chunks.", len(search_results))
# Use the provided llm_model to build the QA chain.
qa_chain = load_qa_chain(llm_model, chain_type=cfg.qa_chain_type)
answer = qa_chain.invoke(
input={"input_documents": search_results, "question": question}
)
return answer
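# Note: with LangChain QA chains built via load_qa_chain, the dict returned by
# generate_answer typically exposes the final text under the "output_text" key.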
def generate_answer2(
    question: str, pdf_url: str, text_embedding_model: Embeddings
) -> str:
    """
    Generate an answer for a question using retrieval augmented generation on a PDF
    loaded from a URL.
    This function loads the PDF pages from the given URL, builds an in-memory vector
    store over the pages using the provided embedding model, performs a similarity
    search with the question, and returns the concatenated content of the most
    relevant pages.
    Args:
        question (str): The question to be answered.
        pdf_url (str): The URL of the PDF document.
        text_embedding_model (Embeddings): The embedding model used for retrieval.
    Returns:
        str: The concatenated text of the retrieved pages.
    """
    logger.info("Searching the article with the question: %s", question)
    loader = PyPDFLoader(pdf_url)
# Load the pages of the article
pages = []
for page in loader.lazy_load():
pages.append(page)
    logger.info("Loaded text embedding model %s", text_embedding_model)
# Create a vector store from the pages
vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
# Search the article with the question
docs = vector_store.similarity_search(question)
    # Return the concatenated content of the retrieved pages.
    return "\n".join(doc.page_content for doc in docs)
@tool(args_schema=QuestionAndAnswerInput)
def question_and_answer_tool(
question: str,
tool_call_id: Annotated[str, InjectedToolCallId],
state: Annotated[dict, InjectedState],
) -> Command:
    """
    Answer a question using PDF content stored in the state via retrieval augmented generation.
    This tool retrieves the PDF metadata from the state (under the key "pdf_data"), loads
    the document from its URL, and generates an answer to the specified question using the
    text embedding model held in the state (under the key "text_embedding_model").
    Args:
        question (str): The question regarding the PDF content.
        tool_call_id (str): Unique identifier for the current tool call.
        state (dict): A dictionary representing the current state, expected to contain PDF data
                      under the key "pdf_data" with sub-keys "pdf_object" (binary content) and
                      "pdf_url" (source URL), a key "text_embedding_model" holding the embedding
                      model, and a key "llm_model" holding the language model instance.
    Returns:
        Command: A LangGraph command whose update carries the generated answer or an error message.
    """
logger.info("Starting PDF Question and Answer tool using PDF data from state.")
text_embedding_model = state["text_embedding_model"]
pdf_state = state.get("pdf_data")
if not pdf_state:
error_msg = "No pdf_data found in state."
logger.error(error_msg)
return Command(
update={
"messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
}
)
pdf_bytes = pdf_state.get("pdf_object")
if not pdf_bytes:
error_msg = "PDF binary data is missing in the pdf_data from state."
logger.error(error_msg)
return Command(
update={
"messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
}
)
pdf_url = pdf_state.get("pdf_url")
    # Retrieve llm_model from state; return an error command if it is missing.
    llm_model = state.get("llm_model")
    if not llm_model:
        error_msg = "No LLM model found in state."
        logger.error(error_msg)
        return Command(
            update={
                "messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
            }
        )
    logger.info("Answering question using PDF at %s", pdf_url)
    # Generate the answer via the URL-based retrieval path; the byte-based path
    # (generate_answer with pdf_bytes and llm_model) remains available as an alternative.
    answer = generate_answer2(question, pdf_url, text_embedding_model)
    return Command(
        update={
            "messages": [ToolMessage(content=answer, tool_call_id=tool_call_id)]
        }
    )
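# Minimal usage sketch for manual testing. Assumptions: network access to the example
# arXiv URL referenced in this module's history and a valid OpenAI API key in the Hydra
# config; the question below is a placeholder, not part of the tool's contract.
if __name__ == "__main__":
    demo_embeddings = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
    demo_answer = generate_answer2(
        "What is the main contribution of this paper?",
        "https://arxiv.org/pdf/2310.08365",
        demo_embeddings,
    )
    print(demo_answer)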