--- a
+++ b/eval_rag.py
@@ -0,0 +1,176 @@
+
+# RAG evaluation is quite hard for me, so I am following some documentation I found online.
+
+!pip install ragas langchain datasets -qU
+
+import pandas as pd
+from datasets import Dataset
+from tqdm import tqdm
+
+from langchain.chains import RetrievalQA
+from ragas.metrics import (
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+)
+from ragas.metrics.critique import harmfulness
+from ragas import evaluate
+
+# The evaluation CSV has the columns: subject_id, hadm_id, timestamp, observations.
+eval_dataset = Dataset.from_csv("haa_develAdmittimes.csv")
+
+eval_dataset
+
+# haa_develAdmittimes is the pandas version of the admissions table (loaded earlier);
+# combine hadm_id and admittime into a single text field.
+haa_develAdmittimes['combined'] = haa_develAdmittimes['hadm_id'].astype(str) + " at " + haa_develAdmittimes['admittime'].astype(str)
+
+def create_ragas_dataset(rag_pipeline, eval_dataset):
+    """Run the RAG pipeline over every row and collect a dataset in the format RAGAS expects."""
+    rag_dataset = []
+    for row in tqdm(eval_dataset):
+        answer = rag_pipeline({"query": row["timestamp"]})
+        rag_dataset.append(
+            {
+                # RAGAS expects question / answer / contexts / ground_truths columns;
+                # the timestamp is used as the query and the observations as ground truth.
+                "question": row["timestamp"],
+                "answer": answer["result"],
+                "contexts": [context.page_content for context in answer["source_documents"]],
+                "ground_truths": [row["observations"]],
+            }
+        )
+    rag_df = pd.DataFrame(rag_dataset)
+    rag_eval_dataset = Dataset.from_pandas(rag_df)
+    return rag_eval_dataset
+
+def evaluate_ragas_dataset(ragas_dataset):
+    result = evaluate(
+        ragas_dataset,
+        metrics=[
+            context_precision,
+            faithfulness,
+            answer_relevancy,
+            context_recall,
+        ],
+    )
+    return result
+
+"""Let's create our dataset first:"""
+
+# qa_chain is the baseline RetrievalQA chain built earlier in the notebook.
+basic_qa_ragas_dataset = create_ragas_dataset(qa_chain, eval_dataset)
+
+"""Save it for later:"""
+
+basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")
+
+"""And finally - evaluate how it did!"""
+
+basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)
+
+basic_qa_result
+
+"""### Testing Other Retrievers
+
+Now we can test how changing our retriever impacts our RAGAS evaluation!
+"""
+
+def create_qa_chain(medical_retriever):
+    primary_qa_llm = llm
+
+    created_qa_chain = RetrievalQA.from_chain_type(
+        primary_qa_llm,
+        retriever=medical_retriever,
+        return_source_documents=True
+    )
+
+    return created_qa_chain
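+
+"""The chains in this script rely on a few objects that are assumed to be defined earlier in the notebook: `llm`, `embeddings_model`, `base_docs`, and the baseline `qa_chain`. The sketch below is only an assumption about what that earlier setup might look like (the model choices and the row-to-document mapping are placeholders, not the actual earlier cells):"""
+
+# Hypothetical earlier setup, shown only for context.
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.schema import Document
+from langchain.vectorstores import Chroma
+
+llm = ChatOpenAI(temperature=0)        # assumed chat model
+embeddings_model = OpenAIEmbeddings()  # assumed embedding model
+
+# Assume each admissions row becomes one LangChain Document.
+base_docs = [
+    Document(
+        page_content=row["observations"],
+        metadata={"subject_id": row["subject_id"], "hadm_id": row["hadm_id"]},
+    )
+    for row in eval_dataset
+]
+
+# Baseline chain: a plain similarity-search retriever over those documents.
+baseline_vectorstore = Chroma.from_documents(base_docs, embeddings_model)
+qa_chain = create_qa_chain(baseline_vectorstore.as_retriever())
+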
+""" + +!pip install chromadb -qU + +from langchain.retrievers import ParentDocumentRetriever +from langchain.storage import InMemoryStore +from langchain.vectorstores import Chroma + +parent_splitter = RecursiveCharacterTextSplitter(chunk_size=750) +child_splitter = RecursiveCharacterTextSplitter(chunk_size=200) + +vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings_model) + +store = InMemoryStore() + +parent_document_retriever = ParentDocumentRetriever( + vectorstore=vectorstore, + docstore=store, + child_splitter=child_splitter, + parent_splitter=parent_splitter, +) + +parent_document_retriever.add_documents(base_docs) + +"""Let's create, test, and then evaluate our new chain!""" + +parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever) + +parent_document_retriever_qa_chain({"query" : "What is RAG?"})["result"] + +pdr_qa_ragas_dataset = create_ragas_dataset(parent_document_retriever_qa_chain, eval_dataset) + +pdr_qa_ragas_dataset.to_csv("pdr_qa_ragas_dataset.csv") + +pdr_qa_result = evaluate_ragas_dataset(pdr_qa_ragas_dataset) + +pdr_qa_result + +!pip install -q -U rank_bm25 + +from langchain.retrievers import BM25Retriever, EnsembleRetriever + +text_splitter = RecursiveCharacterTextSplitter() +docs = text_splitter.split_documents(base_docs) + +bm25_retriever = BM25Retriever.from_documents(docs) +bm25_retriever.k = 1 + +embedding = OpenAIEmbeddings() +vectorstore = Chroma.from_documents(docs, embedding) +chroma_retriever = vectorstore.as_retriever(search_kwargs={"k": 1}) + +ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5]) + +ensemble_retriever_qa_chain = create_qa_chain(ensemble_retriever) + +ensemble_retriever_qa_chain({"query" : "What subject id here ?"})["result"] + +ensemble_qa_ragas_dataset = create_ragas_dataset(ensemble_retriever_qa_chain, eval_dataset) + +ensemble_qa_ragas_dataset.to_csv("ensemble_qa_ragas_dataset.csv") + +ensemble_qa_result = evaluate_ragas_dataset(ensemble_qa_ragas_dataset) + +ensemble_qa_result + + +from rouge_score import rouge_scorer + +def calculate_rouge_scores(references, predictions): + scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True) + scores = [] + for ref, pred in zip(references, predictions): + score = scorer.score(ref, pred) + scores.append(score) + return scores + +# Example usage with dummy data +references = ["subject_id hadm_id timestamp observations 0 12 112213 2104-08-05 C0392747 C0684224 C3273238 C3812171 C0700287 C... 1 12 112213 2104-08-07 C0392747 C0684224 C3273238 C1523018 C0700287 12 112213 2104-08-08 C0181904 C1552822 C0015392 C0450429 C0150369 C..." ] + + +predictions = ["2 12 112213 2104-08-08 C0181904 C1552822 C0015392 C0450429 C0150369 C...3 12 112213 2104-08-08 C0392747 C0684224 C3273238 C0202059 C4050465 C.."] + +rouge_scores = calculate_rouge_scores(references, predictions) +for score in rouge_scores: + print(score)