# eval_rag.py

# RAG evaluation is quite hard, so I followed some online documentation for this section.
from datasets import Dataset

eval_dataset = Dataset.from_csv("haa_develAdmittimes.csv")
eval_dataset
!pip install llama-index ragas -qU
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas.metrics.critique import harmfulness
from ragas import evaluate
# eval_dataset columns: subject_id, hadm_id, timestamp, observations
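"""Each row of `eval_dataset` is a dict keyed by the CSV columns above; a quick, illustrative
check of the first row confirms the field names the pipeline below relies on:"""
eval_dataset[0]  # e.g. {"subject_id": ..., "hadm_id": ..., "timestamp": ..., "observations": ...}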
def create_ragas_dataset(rag_pipeline, eval_dataset):
    rag_dataset = []
    for row in tqdm(eval_dataset):
        answer = rag_pipeline({"query": row["timestamp"]})
        rag_dataset.append(
            {
                "subject_id": row["subject_id"],
                "answer": answer["result"],
                "contexts": [context.page_content for context in answer["source_documents"]],
                "observations": [row["observations"]],
            }
        )
    rag_df = pd.DataFrame(rag_dataset)
    rag_eval_dataset = Dataset.from_pandas(rag_df)
    return rag_eval_dataset

# Combine each admission id with its admit time into a single reference string
# (haa_develAdmittimes is the admissions dataframe built earlier in the project).
haa_develAdmittimes['combined'] = haa_develAdmittimes['hadm_id'].astype(str) + " at " + haa_develAdmittimes['admittime'].astype(str)
def evaluate_ragas_dataset(ragas_dataset):
    result = evaluate(
        ragas_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )
    return result
"""Lets create our dataset first:"""
from tqdm import tqdm
import pandas as pd
basic_qa_ragas_dataset = create_ragas_dataset(qa_chain, eval_dataset)  # qa_chain is the baseline RetrievalQA chain built earlier
"""Save it for later:"""
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")
"""And finally - evaluate how it did!"""
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)
basic_qa_result
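"""`ragas.evaluate` returns a dict-like Result object (metric name -> aggregate score); recent ragas
versions also expose `to_pandas()` for per-sample scores. A minimal sketch for inspecting it,
hedged because the exact API depends on the installed ragas version:"""
def summarize_ragas_result(result):
    # Print each aggregate metric, then try to pull per-sample scores if this ragas version supports it.
    for metric_name, value in result.items():
        print(f"{metric_name}: {value:.3f}")
    try:
        return result.to_pandas()
    except AttributeError:
        return None

summarize_ragas_result(basic_qa_result)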
"""### Testing Other Retrievers
Now we can test our how changing our Retriever impacts our RAGAS evaluation!
"""
from langchain.chains import RetrievalQA

def create_qa_chain(medical_retriever):
    primary_qa_llm = llm  # the chat model defined earlier in the notebook
    created_qa_chain = RetrievalQA.from_chain_type(
        primary_qa_llm,
        retriever=medical_retriever,
        return_source_documents=True
    )
    return created_qa_chain
"""#### Parent Document Retriever
One of the easier ways we can imagine improving a retriever is to embed our documents into small chunks, and then retrieve a significant amount of additional context that "surrounds" the found context.
You can read more about this method [here](https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever)!
"""
!pip install chromadb -qU
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=750)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
vectorstore = Chroma(collection_name="split_parents", embedding_function=embeddings_model)  # embeddings_model comes from the main notebook
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
vectorstore=vectorstore,
docstore=store,
child_splitter=child_splitter,
parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(base_docs)
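"""A quick way to see what the Parent Document Retriever does: the vector store only holds the small
(200-character) child chunks, while the retriever returns the larger (750-character) parent chunks
that contain them. The query string below is just an illustrative placeholder:"""
sample_query = "admission time for subject 12"  # hypothetical query, for illustration only
child_hits = vectorstore.similarity_search(sample_query)                       # small child chunks
parent_hits = parent_document_retriever.get_relevant_documents(sample_query)   # larger parent chunks
print(len(child_hits[0].page_content), len(parent_hits[0].page_content))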
"""Let's create, test, and then evaluate our new chain!"""
parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever)
parent_document_retriever_qa_chain({"query" : "What is RAG?"})["result"]
pdr_qa_ragas_dataset = create_ragas_dataset(parent_document_retriever_qa_chain, eval_dataset)
pdr_qa_ragas_dataset.to_csv("pdr_qa_ragas_dataset.csv")
pdr_qa_result = evaluate_ragas_dataset(pdr_qa_ragas_dataset)
pdr_qa_result
!pip install -q -U rank_bm25
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.embeddings import OpenAIEmbeddings

text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(base_docs)
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 1
embedding = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(docs, embedding)
chroma_retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5])
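"""The EnsembleRetriever merges the ranked lists from BM25 (lexical) and Chroma (dense) with weighted
reciprocal rank fusion, here weighted 0.5/0.5. A quick sanity check with a hypothetical query:"""
fused_docs = ensemble_retriever.get_relevant_documents("observations for hadm_id 112213")  # hypothetical query
for doc in fused_docs:
    print(doc.page_content[:80])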
ensemble_retriever_qa_chain = create_qa_chain(ensemble_retriever)
ensemble_retriever_qa_chain({"query": "What subject_id is here?"})["result"]
ensemble_qa_ragas_dataset = create_ragas_dataset(ensemble_retriever_qa_chain, eval_dataset)
ensemble_qa_ragas_dataset.to_csv("ensemble_qa_ragas_dataset.csv")
ensemble_qa_result = evaluate_ragas_dataset(ensemble_qa_ragas_dataset)
ensemble_qa_result
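"""To compare the three retrievers side by side, the three result objects can be collected into one
DataFrame (this assumes each Result behaves like a dict of metric -> aggregate score):"""
comparison = pd.DataFrame(
    {
        "basic": dict(basic_qa_result),
        "parent_document": dict(pdr_qa_result),
        "ensemble": dict(ensemble_qa_result),
    }
)
comparison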
from rouge_score import rouge_scorer

def calculate_rouge_scores(references, predictions):
    # Compute ROUGE-1/2/L between each reference/prediction pair.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for ref, pred in zip(references, predictions):
        score = scorer.score(ref, pred)
        scores.append(score)
    return scores
# Example usage with dummy data
references = ["subject_id hadm_id timestamp observations 0 12 112213 2104-08-05 C0392747 C0684224 C3273238 C3812171 C0700287 C... 1 12 112213 2104-08-07 C0392747 C0684224 C3273238 C1523018 C0700287 12 112213 2104-08-08 C0181904 C1552822 C0015392 C0450429 C0150369 C..." ]
predictions = ["2 12 112213 2104-08-08 C0181904 C1552822 C0015392 C0450429 C0150369 C...3 12 112213 2104-08-08 C0392747 C0684224 C3273238 C0202059 C4050465 C.."]
rouge_scores = calculate_rouge_scores(references, predictions)
for score in rouge_scores:
    print(score)
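"""Each entry returned by `RougeScorer.score` maps a metric name to a Score tuple with precision,
recall, and fmeasure, so an average F1 per metric across all pairs can be computed like this:"""
def mean_rouge_f1(scores):
    # Average the F-measure of each ROUGE variant over all reference/prediction pairs.
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return {m: sum(s[m].fmeasure for s in scores) / len(scores) for m in metrics}

print(mean_rouge_f1(rouge_scores))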