"""Score clinical model responses with GPT-4.

Reads a JSONL file of discharge summaries, instructions, and per-model
answers, asks GPT-4 (gpt-4-0314) to grade each answer on a 1-4 rubric,
and appends the graded rows to a JSONL output file.
"""
import argparse
import json
import random
import time

import openai
import pandas as pd
from tqdm import tqdm

openai.api_key = "YOUR_API_KEY"
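# Assumption: in practice you would load the key from the environment instead,
# e.g. openai.api_key = os.environ["OPENAI_API_KEY"] (requires `import os`).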
prompt = """You are an intelligent clinical language model.
[Discharge Summary Begin]
{note}
[Discharge Summary End]
[Instruction Begin]
{question}
[Instruction End]
{answers}
Above, we provide you a discharge summary and the instruction that the healthcare professional gave about the discharge summary.
You are also provided with {num_samples} corresponding responses from {num_samples} different clinical models.
Your task is to read the discharge summary and the instruction carefully then find the answer to the instruction.
Then, compare your answer with each model's response and evaluate the response based on the following criteria.
Criteria :
1. Unacceptable (1 point): The model's response includes any incorrect or irrelevant contents. If the instruction was unanswerable, the model did not acknowledge this and outputs wrong answer.
2. Poor (2 points): The model's response does not contain any incorrect or irrelevant contents, but omits significant or crucial contents that the instruction is requiring for.
3. Satisfactory (3 points): The model's response does not contain any incorrect or irrelevant contents, but omits minor or insignificant contents that the instruction is requiring for.
4. Excellent (4 points): The model's response contains all necesarry information that the instruction is requiring for. If the instruction was unanswerable, the model correctly acknowledged this and says that it is unanswerable.
When evaluating each score based on above criteria, ensure that each judgement is not affected by other model's response.
First line must contain only {num_samples} values, which indicate the score for each model, respectively.
The {num_samples} scores are separated by a space.
Output scores without explanation.
"""
def generate_inst_prompt(note, question, samples):
    """Build the chat message that asks GPT-4 to score all samples at once."""
    answers = ""
    for i, sample in enumerate(samples):
        sample_name = chr(65 + i)  # Label agents alphabetically: A, B, C, ...
        answers += f"[Agent {sample_name}'s Answer Begin]\n{sample}\n[Agent {sample_name}'s Answer End]\n\n"
    return [
        {
            "role": "user",
            "content": prompt.format(
                note=note, question=question, answers=answers, num_samples=len(samples)
            ),
        }
    ]
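# Example call (hypothetical inputs):
#   generate_inst_prompt(
#       note="Patient admitted with ...",
#       question="What was the discharge diagnosis?",
#       samples=["Pneumonia.", "The summary does not say."],
#   )
# returns a one-element chat message list whose content interleaves the note,
# the instruction, and one "[Agent X's Answer ...]" block per sample.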
def make_answer_gpt(message):
    """Call GPT-4 with up to 10 retries; return the reply text."""
    response = None  # Avoid a NameError in the fallback if every attempt fails.
    for _ in range(10):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4-0314", messages=message, max_tokens=2048, temperature=0
            )
        except Exception as e:
            print(e)
            time.sleep(5)  # Back off briefly before retrying.
            continue
        return response["choices"][0]["message"]["content"]
    # All retries failed; return a string so the caller can record the failure.
    return str(response)
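# Note: openai.ChatCompletion is the legacy (pre-1.0) openai SDK interface.
# With openai>=1.0 the equivalent call is roughly (a sketch, not a drop-in):
#   client = openai.OpenAI(api_key="YOUR_API_KEY")
#   response = client.chat.completions.create(
#       model="gpt-4-0314", messages=message, max_tokens=2048, temperature=0
#   )
#   text = response.choices[0].message.content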
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_path", type=str, required=True)
    parser.add_argument("--save_path", type=str, required=True)
    return parser.parse_args()
def main():
    args = parse_args()
    data = pd.read_json(args.input_path, orient="records", lines=True)
    answer_cols = [col for col in data.columns if "answer" in col]
    for _, row in tqdm(data.iterrows(), total=len(data)):
        # Shuffle the presentation order of the answers to avoid position bias.
        order = list(range(len(answer_cols)))
        random.shuffle(order)
        note = row["note"]
        question = row["question"]
        samples = row[answer_cols].values[order]
        # Renamed from `prompt` to avoid shadowing the module-level template.
        messages = generate_inst_prompt(note, question, samples)
        answer = make_answer_gpt(messages)
        answer = answer.strip('"').strip("'")
        splitted_answer = answer.split()
        try:
            # Verify that GPT returned one score per model before assigning them.
            [splitted_answer[order.index(idx)] for idx in range(len(answer_cols))]
        except IndexError:
            # Malformed response: record zero for every model.
            for col in answer_cols:
                model_name = "_".join(col.split("_")[:-1])
                row[f"{model_name}_score"] = 0
        else:
            for idx, col in enumerate(answer_cols):
                model_name = "_".join(col.split("_")[:-1])
                # Undo the shuffle: model idx's score sits at position order.index(idx).
                row[f"{model_name}_score"] = splitted_answer[order.index(idx)]
        row["gpt_response"] = answer
        # Append each scored row immediately so partial progress is preserved.
        with open(args.save_path, "a") as f:
            f.write(json.dumps(row.to_dict()) + "\n")
if __name__ == "__main__":
    main()
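# Usage (file names are illustrative):
#   python evaluate_gpt4.py --input_path data/answers.jsonl --save_path data/scored.jsonl
# The input JSONL needs "note" and "question" fields plus one "<model>_answer"
# column per model; each output row gains "<model>_score" and "gpt_response".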