# -*- coding: utf-8 -*-
"""rag.ipynb"""

"""### RAG on this dataset using GPT-3.5"""
!pip install openai
#!pip install --upgrade pip
!pip install transformers torch
!pip install llama-index llama-index-experimental
import logging
import os  # needed below for os.environ and os.path
import sys
from IPython.display import Markdown, display

import pandas as pd
from llama_index.experimental.query_engine import PandasQueryEngine

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.llms.openai import OpenAI

# Never hard-code an API key in source; read it from the environment instead.
llm = OpenAI(model="gpt-3.5-turbo", api_key=os.environ["OPENAI_API_KEY"])

import openai

openai.api_key = os.environ["OPENAI_API_KEY"]
# haa_trainChronologies is a DataFrame assumed to be loaded earlier in the notebook.
query_engine = PandasQueryEngine(df=haa_trainChronologies, llm=llm, verbose=True)
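# The engine above is created but never queried in the original notebook.
# A minimal usage sketch (the question text is illustrative only):
# PandasQueryEngine translates a natural-language question into pandas code,
# runs it against the DataFrame, and returns the result.
example_response = query_engine.query("How many rows does the dataframe have?")
print(example_response)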
file_formats = {
    "csv": pd.read_csv,
    "xls": pd.read_excel,
    "xlsx": pd.read_excel,
    "xlsm": pd.read_excel,
    "xlsb": pd.read_excel,
}
import streamlit as st  # load_data reports unsupported formats via Streamlit

def load_data(uploaded_file):
    # Accept either a Streamlit UploadedFile (which has .name) or a plain path string.
    try:
        ext = os.path.splitext(uploaded_file.name)[1][1:].lower()
    except AttributeError:
        ext = uploaded_file.split(".")[-1].lower()
    if ext in file_formats:
        return file_formats[ext](uploaded_file)
    else:
        st.error(f"Unsupported file format: {ext}")
        return None
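# Hypothetical usage sketch; "admissions.csv" is a placeholder filename, not a
# file from the original notebook.
# df_uploaded = load_data("admissions.csv")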
# Read the Pandas DataFrame (haa_develChronologies is assumed loaded earlier).
df = haa_develChronologies
!pip install langchain
!pip install langchain_openai
from langchain_openai.chat_models import ChatOpenAI

# Set your API key (from the environment) and model choice.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MODEL_NAME = "gpt-3.5-turbo"  # Specify the model you want to use

# Initialize the ChatOpenAI model with a specific temperature
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL_NAME, temperature=0.5)

# Example usage
response = model.invoke("What MLB team won the World Series during the COVID-19 pandemic?")
print(response)
!pip show langchain  # Check the installed version
!pip install --upgrade langchain  # Update langchain

!pip install langchain_experimental
df = haa_develChronologies
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents import create_pandas_dataframe_agent  # the pandas agent lives in langchain_experimental

from langchain.callbacks import StreamlitCallbackHandler
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", openai_api_key=OPENAI_API_KEY, streaming=True)
pandas_df_agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    handle_parsing_errors=True,
)
response = pandas_df_agent.run("Provide the timestamps for observations of C0392747")

dataFrameResponse = pandas_df_agent.run("Provide the dataframe as a string")

type(dataFrameResponse)
!pip install langchain_core

# Note: ChatOpenAI is a class in langchain_openai, not a pip package, so no
# separate "pip install ChatOpenAI" is needed.
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()
model = llm
chain = model | parser
try:
    # A bare model | parser chain expects a string (or messages) rather than a
    # dict, so fold the context and question into a single prompt string.
    chain.invoke(
        f"Context: {dataFrameResponse}\n"
        "Question: which observations are more prone to hospital-acquired pressure injury?"
    )
except Exception as e:
    print(e)
# Feed the stringified DataFrame response to a LangChain text splitter.

# Note: the right chunk size depends on the task you want to achieve.

from langchain.text_splitter import RecursiveCharacterTextSplitter

df_responseSplitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=15)
# split_text works on a raw string; split_documents expects Document objects.
df_responseSplitter.split_text(dataFrameResponse)[:3]
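# Quick illustration (not from the original) of the chunking parameters: with
# chunk_size=50 and chunk_overlap=15, a 120-character string with no natural
# separators yields ~50-character chunks whose edges overlap by 15 characters.
demo_chunks = df_responseSplitter.split_text("x" * 120)
print([len(chunk) for chunk in demo_chunks])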
# OpenAI embeddings:

from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()  # named `embeddings` because later cells refer to it by that name
ai_embed_query = embeddings.embed_query("Provide the timestamps for observations of C0392747")

print(f"length: {len(ai_embed_query)}")
medical_sentence1 = embeddings.embed_query("The timestamps for observations containing C0392747 are as follows - 2104-08-05 - 2104-08-07 - 2104-08-08")
medical_sentence2 = embeddings.embed_query("Subject id : 75")
# Now examine how well the embedded query matches these sentences.

from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(embedded_query, embedded_sentences):
    # Score one embedded query against each embedded sentence in the list.
    similarities = [cosine_similarity([embedded_query], [sentence])[0][0] for sentence in embedded_sentences]
    return similarities
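# Toy sanity check (illustrative, not from the original): identical vectors
# score 1.0, orthogonal vectors score 0.0.
print(calculate_cosine_similarity([1.0, 0.0], [[1.0, 0.0], [0.0, 1.0]]))  # [1.0, 0.0]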
# Only one query embedding exists above, so compare it against each sentence.
similarities_query1 = calculate_cosine_similarity(ai_embed_query, [medical_sentence1])
similarities_query2 = calculate_cosine_similarity(ai_embed_query, [medical_sentence2])

print('Similarities for Query 1:', similarities_query1)
print('Similarities for Query 2:', similarities_query2)
# Now add a vector store.

# Concatenate the columns with a separator for clarity
# (haa_develAdmittimes is another DataFrame assumed loaded earlier).
haa_develAdmittimes['combined'] = haa_develAdmittimes['hadm_id'].astype(str) + " at " + haa_develAdmittimes['admittime'].astype(str)

# Now extract the combined text data
text_data = haa_develAdmittimes['combined'].tolist()
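# Illustrative peek (not from the original) at the strings fed to the vector
# store; each looks like "<hadm_id> at <admittime>".
print(text_data[:3])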
# The vector store backend can be swapped out to suit the application.

from langchain_community.vectorstores import DocArrayInMemorySearch

# Create the vector store from the combined text data
admittime_vectorstore = DocArrayInMemorySearch.from_texts(text_data, embedding=embeddings)

admittime_vectorstore.similarity_search_with_score(query="give me the hadm_id of patients associated with 3rd January", k=2)
# Retriever

med_retriever = admittime_vectorstore.as_retriever()

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=med_retriever, question=RunnablePassthrough())
setup.invoke("give me the hadm_id of patients associated with 3rd January?")
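# The chain below references `prompt`, which the original notebook never
# defines. A minimal sketch, assuming a standard retrieval-QA chat template;
# the template wording is illustrative, not from the original.
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    "Answer the question using only the context below.\n\n"
    "Context: {context}\n\nQuestion: {question}"
)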
chain = setup | prompt | model | parser
chain.invoke("give me the hadm_id of patients associated with 2nd January")