In [None]:
import os
import sys

# Resolve the directory two levels above the current working directory and
# append it to the import search path (presumably so the `src` package imported
# in the next cell can be found — this depends on where the kernel was started).
src_path = os.path.abspath("../..")
print(src_path)
sys.path.append(src_path)

In [None]:
from src.utils import processed_data_path, set_seed, remote_project_path

In [None]:
# Call the project's set_seed helper with a fixed seed (presumably seeds the
# random number generators used downstream — confirm in src.utils).
set_seed(seed=42)

In [None]:
import pandas as pd

In [None]:
# Name of the sub-directory under model_path whose answers are evaluated and
# into which the evaluation file is written.
answer_filename = "llemr_vicuna"

In [None]:
# Directory holding the per-model output folders read and written below.
model_path = os.path.join(remote_project_path, "output")

In [None]:
# NOTE(review): output_path is not referenced by any other cell shown here.
output_path = os.path.join(processed_data_path, "mimic4")

In [None]:
# Load the answers stored under gpt4/ (used as the baseline below), turn empty
# predictions into NaN, and drop every row that has a missing value in any column.
b_answer = (
    pd.read_json(os.path.join(model_path, "gpt4/qa_output/answer.jsonl"), lines=True)
    .assign(a_hat=lambda frame: frame.a_hat.replace("", float("nan")))
    .dropna()
)
b_answer

In [None]:
# Load the answers of the model being evaluated (selected by answer_filename).
answer = pd.read_json(os.path.join(model_path, f"{answer_filename}/qa_output/answer.jsonl"), lines=True)
answer

In [None]:
# Join the baseline answers (left) with the evaluated model's answers (right) on
# the columns identifying a question; merge defaults to an inner join. The
# overlapping `a_hat` column comes out as `a_hat_x` (baseline) and `a_hat_y`
# (evaluated model), which is how later cells read them.
# NOTE(review): `answer` is rebound here, so re-running only this cell merges
# against the already-merged frame.
answer = b_answer.merge(answer, on=["hadm_id", "q", "a", "source"])
answer

In [None]:
# System prompt for the judge model. It requests the two scores on the first
# line, separated by a space, followed by an explanation — the layout that
# split_responase parses.
system_content = """You are a helpful and precise assistant for evaluating the quality of responses.

Please assess the performance of two clinical AI assistants based on the question and the ground-truth answer provided below.

Your evaluation should consider helpfulness, relevance, accuracy, and level of detail.

Rate each AI assistant's response with a single score on a scale of 1 to 10, where 10 represents excellent performance.

Please first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.

In the subsequent line, provide a concise explanation of your evaluation.

Avoid any potential bias and ensure that the order in which the responses were presented does not affect your judgment."""

In [None]:
def generate_user_content(q, a, a_hat_1, a_hat_2):
    """Build the user message shown to the judge model.

    Args:
        q: Question text.
        a: Ground-truth answer.
        a_hat_1: Answer produced by assistant 1.
        a_hat_2: Answer produced by assistant 2.

    Returns:
        A single string with four bracket-delimited sections, one per argument.
    """
    lines = [
        "[Question]",
        f"{q}",
        "[End of Question]",
        "    ",  # whitespace-only separator line, kept exactly as in the prompt
        "[Ground-truth Answer]",
        f"{a}",
        "[End of Ground-truth Answer]",
        "",
        "[Assistant 1 Answer]",
        f"{a_hat_1}",
        "[End of Assistant 1 Answer]",
        "",
        "[Assistant 2 Answer]",
        f"{a_hat_2}",
        "[End of Assistant 2 Answer]",
    ]
    return "\n".join(lines)

In [None]:
# One judge conversation (system + user message) per merged row, keyed by
# (source, hadm_id).
# NOTE(review): rows sharing the same (source, hadm_id) overwrite each other
# here — confirm that this pair is unique in `answer`.
prompts = {
    (row.source, row.hadm_id): [
        {"role": "system", "content": system_content},
        {"role": "user", "content": generate_user_content(row.q, row.a, row.a_hat_x, row.a_hat_y)},
    ]
    for _, row in answer.iterrows()
}
len(prompts)

In [None]:
import asyncio
from openai import AsyncAzureOpenAI

# Credentials are read from the environment so that secrets are never saved in
# the notebook. Export AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY and
# OPENAI_API_VERSION before starting the kernel; a missing variable raises a
# KeyError naming it instead of building a client that fails on the first call.
async_client = AsyncAzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["OPENAI_API_VERSION"],
)

In [None]:
async def generate_chat_response(async_client, prompt, retry_delay=10):
    """Request one chat completion and return the text of its first choice.

    Args:
        async_client: Client exposing an awaitable ``chat.completions.create``.
        prompt: List of chat messages, sent as the ``messages`` parameter.
        retry_delay: Seconds to pause after a failed request before returning,
            giving the service time to recover before the caller retries.

    Returns:
        The content of the first choice, or ``-1`` when the request raised.
    """
    chat_params = {
        "model": "gpt-3.5-turbo",
        "messages": prompt,
        "max_tokens": 512,
        "temperature": 0.0,
    }
    try:
        response = await async_client.chat.completions.create(**chat_params)
    except Exception as e:
        print(f"Error in call_async: {e}")
        print(f"Sleep for {retry_delay}s...")
        # asyncio.sleep yields control to the event loop; a blocking time.sleep
        # here would stall every other request running concurrently.
        await asyncio.sleep(retry_delay)
        return -1
    return response.choices[0].message.content

In [None]:
import time


async def process_prompts(prompts):
    """Send all prompts concurrently through the module-level client.

    Returns the list produced by ``asyncio.gather``: one entry per prompt, in
    the same order as ``prompts``.
    """
    pending = [generate_chat_response(async_client, prompt) for prompt in prompts]
    # Wait until every request has finished before handing the results back
    return await asyncio.gather(*pending)

In [None]:
def chunk_list(lst, chunk_size):
    """Lazily split ``lst`` into consecutive slices of at most ``chunk_size`` items."""
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

In [None]:
from tqdm.asyncio import tqdm


async def process_prompts_in_batches(prompts, batch_size, repeat=3):
    """Query all prompts in batches, retrying the ones that failed.

    Args:
        prompts: Mapping from a key to the chat messages for that key.
        batch_size: Number of prompts sent concurrently per batch.
        repeat: Maximum number of rounds; each round only re-sends prompts
            that have no string response yet.

    Returns:
        Dict mapping each key to its string response. Keys whose requests
        never produced a string are absent.
    """
    all_responses = {}

    for i in range(repeat):
        prompts_k = [k for k in prompts if k not in all_responses]
        if not prompts_k:
            # Everything is answered; skip the remaining retry rounds.
            break

        print(f"round {i}")
        prev_n_responses = len(all_responses)

        # Chunk the outstanding keys into batches
        prompt_k_batches = list(chunk_list(prompts_k, batch_size))

        for batch_k in tqdm(prompt_k_batches, desc="Processing Batches"):
            batch_v = [prompts[k] for k in batch_k]
            responses = await process_prompts(batch_v)
            # Failed calls come back as non-string values; leaving them out
            # keeps their keys eligible for the next round.
            all_responses.update(
                (k, v) for k, v in zip(batch_k, responses) if isinstance(v, str)
            )
        print(f"get {len(all_responses) - prev_n_responses} new responses")

    return all_responses

In [None]:
# Choose an appropriate batch size
batch_size = 10  # Adjust based on your system and API limits

# Top-level await: this cell assumes an environment with a running event loop
# (such as a notebook kernel). With the default repeat=3, prompts that did not
# yield a string response are re-sent in later rounds.
responses = await process_prompts_in_batches(prompts, batch_size)
print(f"Processed {len(responses)} responses")

In [None]:
def split_responase(r, verbose=False):
    """Parse a judge reply into ``(base_score, score, comment)``.

    The reply is expected to start with a line holding two numbers (the score
    of assistant 1, then assistant 2) followed by free-text explanation.

    Args:
        r: Raw reply text.
        verbose: When true, print the raw reply and the parsed pieces.

    Returns:
        Tuple ``(base_score, score, comment)`` with both scores as floats and
        the comment stripped of surrounding whitespace ("" if absent).

    Raises:
        ValueError: If the first non-blank line holds fewer than two values or
            a value is not numeric.
    """
    if verbose:
        print(r)
    # Strip first so a leading blank line or indentation does not hide the scores.
    header, _, remainder = r.strip().partition("\n")
    # Splitting on any whitespace tolerates tabs and repeated spaces.
    scores = header.split()
    if len(scores) < 2:
        raise ValueError(f"Expected two scores on the first line, got: {header!r}")
    base_score = float(scores[0])
    score = float(scores[1])
    comment = remainder.strip()
    if verbose:
        print("scores:", scores)
        print("comment:", comment)
    return base_score, score, comment

In [None]:
# Parse every raw judge reply into (base_score, score, comment), keeping its key.
responses_split = {key: split_responase(reply) for key, reply in responses.items()}

In [None]:
import json

# Write one JSON line per merged row that has a parsed evaluation; `c` counts
# the lines written and is displayed at the end of the cell.
with open(os.path.join(model_path, f"{answer_filename}/qa_output/answer_eval.jsonl"), "w") as file:
    c = 0
    for _, data in answer.iterrows():
        key = (data.source, data.hadm_id)
        if key not in responses_split:
            continue  # no parsed judge reply for this row
        base_score, score, comment = responses_split[key]
        record = {
            "hadm_id": data.hadm_id,
            "q": data.q,
            "a": data.a,
            "a_hat": data.a_hat_y,
            "score": score,
            "base_a_hat": data.a_hat_x,
            "base_score": base_score,
            "comment": comment,
            "source": data.source
        }
        file.write(json.dumps(record) + '\n')
        c += 1
c