Asclepius / Git / [4988ef] /preprocessing/postproc

Models:
cathy-stones/
Asclepius
Downloads: 0
[4988ef]: / preprocessing / postproc_questions.py
History
Download this file
78 lines (62 with data), 2.1 kB

import argparse
import re

import pandas as pd
import tiktoken

# Template for the answer-generation request built in main().
# It has two str.format placeholders:
#   {note}     - the discharge-summary text extracted from the earlier request
#   {question} - the instruction text taken from the earlier model response
# The text is sent verbatim as the user message, so any edit (including
# whitespace) changes what the model receives.
prompt = """You are an intelligent clinical language model.

[Discharge Summary Begin]
{note}
[Discharge Summary End]

[Instruction Begin]
{question}
[Instruction End]

Above, we provide you with a part of the discharge summary and the instruction that the healthcare professional gave about it.
Generate a response to the healthcare professional's instruction using the given discharge summary.

Here are requirements:
- Your response must be accurate and concise to the instruction.
- If the instruction is not fully answerable within the given discharge summary, explain the reason why it is unanswerable using the given information. 
- Do not say that you cannot respond as an AI model.
- Do not ask back nor rephrase the instruction.

Response:"""


def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace with two string attributes:
            input_path: path of the JSON-lines file to read.
            save_path: path of the JSON-lines file to write.

    Both options are mandatory. argparse exits with a usage message when
    either is missing, instead of letting a ``None`` path reach pandas and
    fail there with an unrelated error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_path",
        type=str,
        required=True,
        help="JSON-lines file to read.",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        required=True,
        help="JSON-lines file to write the generated requests to.",
    )
    return parser.parse_args()


def main():
    """Build answer-generation requests from question-generation results.

    Reads the JSON-lines file at ``--input_path``. Each row is accessed as
    column 0 (an object whose ``messages[0].content`` holds the discharge
    summary between ``[Discharge Summary Begin]`` / ``[Discharge Summary End]``
    markers) and column 1 (an object whose ``choices[0].message.content`` is
    the generated instruction). Presumably these are a chat-completion request
    and its response; confirm against the producer of the input file.

    For every row the note and instruction are substituted into ``prompt`` and
    wrapped in a chat-completion request dict. The requests are written to
    ``--save_path`` as JSON lines, one request per line.

    Raises:
        ValueError: if a row's first message does not contain the discharge
            summary markers.
    """
    args = parse_args()
    df = pd.read_json(args.input_path, lines=True)

    # Compiled once instead of per row. ``.*`` is greedy and DOTALL lets it
    # span newlines, so the capture runs to the LAST end marker in the text.
    note_pattern = re.compile(
        r"\[Discharge Summary Begin\]\n(.*)\n\[Discharge Summary End\]",
        flags=re.DOTALL | re.MULTILINE,
    )

    def _extract_note(content):
        # Return the text between the discharge-summary markers.
        match = note_pattern.search(content)
        if match is None:
            raise ValueError(
                "Discharge summary markers not found in request content: "
                f"{content[:80]!r}"
            )
        return match.group(1)

    df["note"] = df[0].map(lambda x: _extract_note(x["messages"][0]["content"]))
    df["question"] = df[1].map(lambda x: x["choices"][0]["message"]["content"])

    # str.format only interprets braces in the template, not in the
    # substituted values, so notes containing "{" or "}" are safe.
    df["input"] = [
        prompt.format(note=note, question=question)
        for note, question in zip(df["note"], df["question"])
    ]

    # Look the tokenizer up once rather than once per row.
    encoder = tiktoken.get_encoding("cl100k_base")

    def _build_request(text):
        # Wrap one formatted prompt in a chat-completion request dict.
        return {
            "model": "gpt-3.5-turbo",
            "messages": [
                {
                    "role": "user",
                    "content": text,
                }
            ],
            # NOTE(review): the original comment here read "Max of
            # GPT-3.5-Turbo", but the value is the prompt's token count plus
            # 400, not a model limit. Kept as-is to preserve the output;
            # confirm the intended completion budget.
            "max_tokens": len(encoder.encode(text)) + 400,
            "temperature": 1,
        }

    df["input"].map(_build_request).to_json(
        args.save_path, orient="records", lines=True
    )


if __name__ == "__main__":
    main()