#!/usr/bin/env python3
"""
question_and_answer: Tool for performing Q&A on PDF documents using retrieval augmented generation.
This module provides functionality to extract text from PDF binary data or load a PDF
from a URL, split the text into chunks, retrieve relevant segments via a vector store,
and generate an answer to a user-provided question using a language model chain.
"""
import io
import logging
from typing import Annotated, Dict, Any, List
from PyPDF2 import PdfReader
from pydantic import BaseModel, Field
import hydra
from langchain.chains.question_answering import load_qa_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.messages import ToolMessage
from langchain_core.tools import tool
from langchain_core.tools.base import InjectedToolCallId
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores import Annoy
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langgraph.types import Command
from langgraph.prebuilt import InjectedState
# Set up logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Load configuration using Hydra.
with hydra.initialize(version_base=None, config_path="../../configs"):
cfg = hydra.compose(
config_name="config", overrides=["tools/question_and_answer=default"]
)
cfg = cfg.tools.question_and_answer
logger.info("Loaded Question and Answer tool configuration.")
class QuestionAndAnswerInput(BaseModel):
"""
Input schema for the PDF Question and Answer tool.
Attributes:
question (str): The question to ask regarding the PDF content.
        tool_call_id (str): Unique identifier for the tool call, injected automatically.
        state (dict): The current agent state, injected automatically.
    """
question: str = Field(description="The question to ask regarding the PDF content.")
tool_call_id: Annotated[str, InjectedToolCallId]
state: Annotated[dict, InjectedState]
def extract_text_from_pdf_data(pdf_bytes: bytes) -> str:
"""
Extract text content from PDF binary data.
This function uses PyPDF2 to read the provided PDF bytes and concatenates the text
extracted from each page.
Args:
pdf_bytes (bytes): The binary data of the PDF document.
Returns:
str: The complete text extracted from the PDF.
"""
reader = PdfReader(io.BytesIO(pdf_bytes))
text = ""
for page in reader.pages:
page_text = page.extract_text() or ""
text += page_text
return text
def generate_answer(
question: str, pdf_bytes: bytes, llm_model: BaseChatModel
) -> Dict[str, Any]:
"""
Generate an answer for a question using retrieval augmented generation on PDF content.
This function extracts text from the PDF data, splits the text into manageable chunks,
performs a similarity search to retrieve the most relevant segments, and then uses a
question-answering chain (built using the provided llm_model) to generate an answer.
Args:
question (str): The question to be answered.
pdf_bytes (bytes): The binary content of the PDF document.
llm_model (BaseChatModel): The language model instance to use for answering.
Returns:
Dict[str, Any]: A dictionary containing the answer generated by the language model.
"""
text = extract_text_from_pdf_data(pdf_bytes)
logger.info("Extracted text from PDF.")
text_splitter = CharacterTextSplitter(
separator="\n", chunk_size=cfg.chunk_size, chunk_overlap=cfg.chunk_overlap
)
chunks = text_splitter.split_text(text)
documents: List[Document] = [Document(page_content=chunk) for chunk in chunks]
logger.info("Split PDF text into %d chunks.", len(documents))
embeddings = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
vector_store = Annoy.from_documents(documents, embeddings)
search_results = vector_store.similarity_search(question, k=cfg.num_retrievals)
logger.info("Retrieved %d relevant document chunks.", len(search_results))
# Use the provided llm_model to build the QA chain.
qa_chain = load_qa_chain(llm_model, chain_type=cfg.qa_chain_type)
answer = qa_chain.invoke(
input={"input_documents": search_results, "question": question}
)
return answer
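# Note: with LangChain QA chains built via load_qa_chain, the dict returned by
# generate_answer typically exposes the final text under the "output_text" key.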
def generate_answer2(
    question: str, pdf_url: str, text_embedding_model: Embeddings
) -> str:
    """
    Generate an answer for a question using retrieval augmented generation on a PDF
    loaded from a URL.
    This function loads the PDF pages from the given URL, builds an in-memory vector
    store over the pages using the provided embedding model, performs a similarity
    search with the question, and returns the concatenated content of the most
    relevant pages.
    Args:
        question (str): The question to be answered.
        pdf_url (str): The URL of the PDF document.
        text_embedding_model (Embeddings): The embedding model used for retrieval.
    Returns:
        str: The concatenated text of the retrieved pages.
    """
    logger.info("Searching the article with the question: %s", question)
    loader = PyPDFLoader(pdf_url)
# Load the pages of the article
pages = []
for page in loader.lazy_load():
pages.append(page)
    logger.info("Loaded text embedding model %s", text_embedding_model)
# Create a vector store from the pages
vector_store = InMemoryVectorStore.from_documents(pages, text_embedding_model)
# Search the article with the question
docs = vector_store.similarity_search(question)
    # Return the concatenated content of the retrieved pages.
    return "\n".join(doc.page_content for doc in docs)
@tool(args_schema=QuestionAndAnswerInput)
def question_and_answer_tool(
question: str,
tool_call_id: Annotated[str, InjectedToolCallId],
state: Annotated[dict, InjectedState],
) -> Command:
    """
    Answer a question using PDF content stored in the state via retrieval augmented generation.
    This tool retrieves the PDF metadata from the state (under the key "pdf_data"), loads
    the document from its URL, and generates an answer to the specified question using the
    text embedding model held in the state (under the key "text_embedding_model").
    Args:
        question (str): The question regarding the PDF content.
        tool_call_id (str): Unique identifier for the current tool call.
        state (dict): A dictionary representing the current state, expected to contain PDF data
                      under the key "pdf_data" with sub-keys "pdf_object" (binary content) and
                      "pdf_url" (source URL), a key "text_embedding_model" holding the embedding
                      model, and a key "llm_model" holding the language model instance.
    Returns:
        Command: A LangGraph command whose update carries the generated answer or an error message.
    """
logger.info("Starting PDF Question and Answer tool using PDF data from state.")
text_embedding_model = state["text_embedding_model"]
pdf_state = state.get("pdf_data")
if not pdf_state:
error_msg = "No pdf_data found in state."
logger.error(error_msg)
return Command(
update={
"messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
}
)
pdf_bytes = pdf_state.get("pdf_object")
if not pdf_bytes:
error_msg = "PDF binary data is missing in the pdf_data from state."
logger.error(error_msg)
return Command(
update={
"messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
}
)
pdf_url = pdf_state.get("pdf_url")
    # Retrieve llm_model from state; return an error command if it is missing.
    llm_model = state.get("llm_model")
    if not llm_model:
        error_msg = "No LLM model found in state."
        logger.error(error_msg)
        return Command(
            update={
                "messages": [ToolMessage(content=error_msg, tool_call_id=tool_call_id)]
            }
        )
    logger.info("Answering question using PDF at %s", pdf_url)
    # Generate the answer via the URL-based retrieval path; the byte-based path
    # (generate_answer with pdf_bytes and llm_model) remains available as an alternative.
    answer = generate_answer2(question, pdf_url, text_embedding_model)
    return Command(
        update={
            "messages": [ToolMessage(content=answer, tool_call_id=tool_call_id)]
        }
    )
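# Minimal usage sketch for manual testing. Assumptions: network access to the example
# arXiv URL referenced in this module's history and a valid OpenAI API key in the Hydra
# config; the question below is a placeholder, not part of the tool's contract.
if __name__ == "__main__":
    demo_embeddings = OpenAIEmbeddings(openai_api_key=cfg.openai_api_key)
    demo_answer = generate_answer2(
        "What is the main contribution of this paper?",
        "https://arxiv.org/pdf/2310.08365",
        demo_embeddings,
    )
    print(demo_answer)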