Diff of /rag (1).py [000000] .. [0037e2]

# -*- coding: utf-8 -*-
"""rag.ipynb"""

"""### RAG on this dataset using GPT-3.5"""
!pip install openai

#!pip install --upgrade pip

!pip install transformers torch

!pip install llama-index llama-index-experimental
import logging
import sys

from IPython.display import Markdown, display

import pandas as pd
from llama_index.experimental.query_engine import PandasQueryEngine


logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.llms.openai import OpenAI

# Use a placeholder (or an environment variable); never hardcode a real API key
llm = OpenAI(model="gpt-3.5-turbo", api_key="YOUR_OPENAI_API_KEY")

import openai
openai.api_key = "YOUR_OPENAI_API_KEY"

# haa_trainChronologies is assumed to be a DataFrame loaded earlier in the notebook
query_engine = PandasQueryEngine(df=haa_trainChronologies, llm=llm, verbose=True)
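
# The query engine above is never exercised in this notebook; a minimal sketch
# of a call, assuming haa_trainChronologies is already loaded:
response = query_engine.query("How many rows does the dataframe have?")
print(response)
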
import os
import streamlit as st  # st.error below is Streamlit; assumed to be the intended UI layer

# Map file extensions to the pandas reader that handles them
file_formats = {
    "csv": pd.read_csv,
    "xls": pd.read_excel,
    "xlsx": pd.read_excel,
    "xlsm": pd.read_excel,
    "xlsb": pd.read_excel,
}

def load_data(uploaded_file):
    try:
        # Streamlit UploadedFile objects expose a .name attribute
        ext = os.path.splitext(uploaded_file.name)[1][1:].lower()
    except AttributeError:
        # Fall back to treating the input as a plain path string
        ext = uploaded_file.split(".")[-1]
    if ext in file_formats:
        return file_formats[ext](uploaded_file)
    else:
        st.error(f"Unsupported file format: {ext}")
        return None
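
# Example usage (the filename here is hypothetical; load_data also accepts a
# Streamlit UploadedFile):
df_loaded = load_data("admissions.csv")
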
# Read the Pandas DataFrame (haa_develChronologies is assumed to be loaded earlier)

df = haa_develChronologies
!pip install langchain

!pip install langchain_openai
from langchain_openai.chat_models import ChatOpenAI

# Set your API key and model choice (use a placeholder or an environment
# variable, never a hardcoded production key)
OPENAI_API_KEY = "YOUR_OPENAI_API_KEY"
MODEL_NAME = "gpt-3.5-turbo"  # Specify the model you want to use

# Initialize the ChatOpenAI model with a specific temperature
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL_NAME, temperature=0.5)

# Example usage
response = model.invoke("Which MLB team won the World Series during the COVID-19 pandemic?")
print(response)
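
# model.invoke returns an AIMessage; the reply text itself is on .content
print(response.content)
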
!pip show langchain  # Check the installed version
!pip install --upgrade langchain  # Update langchain
!pip install langchain_experimental
df = haa_develChronologies

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents import create_pandas_dataframe_agent  # The DataFrame agent now lives in langchain_experimental

from langchain.callbacks import StreamlitCallbackHandler
# ChatOpenAI is already imported from langchain_openai above

# Use a placeholder instead of a hardcoded API key
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", openai_api_key="YOUR_OPENAI_API_KEY", streaming=True)
pandas_df_agent = create_pandas_dataframe_agent(llm, df, verbose=True, agent_type=AgentType.OPENAI_FUNCTIONS, handle_parsing_errors=True)
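
# StreamlitCallbackHandler is imported but unused here; inside a Streamlit app it
# would stream the agent's intermediate steps into the page (a sketch, assuming
# streamlit is imported as st):
# st_cb = StreamlitCallbackHandler(st.container())
# response = pandas_df_agent.run("your question", callbacks=[st_cb])
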
response = pandas_df_agent.run("Provide the timestamps for observations containing C0392747")

dataFrameResponse = pandas_df_agent.run("Return the dataframe as a string")

type(dataFrameResponse)
!pip install langchain_core

# Note: ChatOpenAI is not a separate package; it ships with langchain_openai,
# which is already installed above

from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()
model = llm
chain = model | parser
# Without a prompt template in the chain, passing a dict straight to the model
# fails; the try/except surfaces that error
try:
    chain.invoke({
        "context": dataFrameResponse,
        "question": "Which observations are more prone to have a hospital-acquired pressure injury?"
    })
except Exception as e:
    print(e)
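
# A sketch of the working pattern: a prompt template formats the context and
# question before they reach the model (the template wording is illustrative):
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    "Answer the question based on the context below.\n\n"
    "Context: {context}\n\nQuestion: {question}"
)
prompt_chain = prompt | model | parser
answer = prompt_chain.invoke({
    "context": dataFrameResponse,
    "question": "Which observations are more prone to have a hospital-acquired pressure injury?"
})
print(answer)
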
# Feed the string DataFrame response to a LangChain text splitter

# Note: the right chunk size depends on the task you want to achieve with it

from langchain.text_splitter import RecursiveCharacterTextSplitter

df_responseSplitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=15)
# split_text (not split_documents) is the method that accepts a raw string
df_responseSplitter.split_text(dataFrameResponse)[:3]
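
# If Document objects are needed downstream, wrap the string first (a sketch):
from langchain_core.documents import Document

doc_chunks = df_responseSplitter.split_documents([Document(page_content=dataFrameResponse)])
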
# OpenAI embeddings:

from langchain_openai.embeddings import OpenAIEmbeddings

# Named "embeddings" because the rest of the notebook refers to it by that name
embeddings = OpenAIEmbeddings()
ai_embed_query = embeddings.embed_query("Provide the timestamps for observations containing C0392747")

print(f"length: {len(ai_embed_query)}")
medical_sentence1 = embeddings.embed_query("The timestamps for observations containing C0392747 are as follows - 2104-08-05 - 2104-08-07 - 2104-08-08")
medical_sentence2 = embeddings.embed_query("Subject id : 75")

# Now let's examine how well the embedded medical query matches each sentence

from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(embedded_query, embedded_sentence):
    # Both arguments are single embedding vectors; sklearn expects 2-D arrays,
    # so wrap each in a list and unwrap the 1x1 result
    return cosine_similarity([embedded_query], [embedded_sentence])[0][0]


similarity_sentence1 = calculate_cosine_similarity(ai_embed_query, medical_sentence1)
similarity_sentence2 = calculate_cosine_similarity(ai_embed_query, medical_sentence2)

print('Similarity for sentence 1:', similarity_sentence1)
print('Similarity for sentence 2:', similarity_sentence2)
# Now add a vector store

# Concatenate the columns with a separator for clarity
haa_develAdmittimes['combined'] = haa_develAdmittimes['hadm_id'].astype(str) + " at " + haa_develAdmittimes['admittime'].astype(str)

# Now extract the combined text data
text_data = haa_develAdmittimes['combined'].tolist()

# The vector store is interchangeable; swap in whichever backend suits the application

from langchain_community.vectorstores import DocArrayInMemorySearch

# Create vector store from the combined text data
admittime_vectorstore = DocArrayInMemorySearch.from_texts(text_data, embedding=embeddings)

admittime_vectorstore.similarity_search_with_score(query="Give me the hadm_id of patients associated with 3rd January", k=2)
# Retriever

med_retriever = admittime_vectorstore.as_retriever()

from langchain_core.runnables import RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=med_retriever, question=RunnablePassthrough())
setup.invoke("Give me the hadm_id of patients associated with 3rd January?")
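
# setup.invoke returns a dict of {"context": [retrieved Documents], "question": str};
# a quick peek at what the retriever pulled back (a sketch):
result = setup.invoke("Give me the hadm_id of patients associated with 3rd January?")
for doc in result["context"]:
    print(doc.page_content)
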
# Reuses the prompt template, model, and parser defined earlier
chain = setup | prompt | model | parser
chain.invoke("Give me the hadm_id of patients associated with 2nd January")