[4988ef]: / preprocessing / pmc_postprocessing.py

Download this file

41 lines (31 with data), 955 Bytes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import argparse
import random
import re
import pandas as pd
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of this post-processing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            ``parse_args()`` calls behave as before.

    Returns:
        Namespace with the string attributes ``input_path`` and
        ``save_path``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when a required
            option is missing or an unknown option is supplied.
    """
    parser = argparse.ArgumentParser(
        description="Post-process a JSON Lines file into a JSON record list."
    )
    # Both paths are required. Left optional they default to None, and a
    # None save path makes DataFrame.to_json return the JSON text instead of
    # writing it, so the script would finish "successfully" with no output.
    parser.add_argument(
        "--input_path",
        type=str,
        required=True,
        help="Path of the JSON Lines file to read.",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        required=True,
        help="Path of the JSON file to write.",
    )
    return parser.parse_args(argv)
# Captures the text between the case-report markers. DOTALL lets "." span the
# newlines of a multi-line report. The group is greedy, so if the markers
# occur more than once it runs from the first start marker to the last end
# marker.
_CASE_REPORT_PATTERN = re.compile(
    r"\[The start of case report\]\n(.*)\n\[The end of case report\]",
    re.DOTALL,
)


def _extract_case_report(prompt):
    """Return the text enclosed by the case-report markers in *prompt*."""
    match = _CASE_REPORT_PATTERN.search(prompt)
    if match is None:
        # IndexError is what indexing an empty findall() result raises; the
        # type is kept for compatibility, but the message now names the cause.
        raise IndexError(
            "case report markers not found in prompt starting with %r"
            % prompt[:80]
        )
    return match.group(1)


def main():
    """Convert the input JSON Lines file into a JSON list of records.

    Each input line is loaded into columns ``0`` and ``1`` (i.e. every line
    is expected to be a two-element JSON array -- presumably a chat
    request/response pair; confirm against the producer of the file).

    The written records have three fields:

    * ``case_report``: text between the ``[The start of case report]`` and
      ``[The end of case report]`` markers found in
      ``row[0]["messages"][0]["content"]``.
    * ``note``: ``row[1]["choices"][0]["message"]["content"]``.
    * ``idx``: a random integer drawn uniformly from 0..7 inclusive.

    Paths come from ``parse_args()`` (``--input_path`` / ``--save_path``).

    Raises:
        IndexError: If a prompt does not contain the case-report markers.
    """
    args = parse_args()
    df = pd.read_json(args.input_path, lines=True)

    df["case_report"] = df[0].map(
        lambda request: _extract_case_report(request["messages"][0]["content"])
    )
    df["note"] = df[1].map(
        lambda response: response["choices"][0]["message"]["content"]
    )
    # One draw per row, in row order; no need for a row-wise DataFrame.apply.
    df["idx"] = [random.randint(0, 7) for _ in range(len(df))]

    df[["case_report", "note", "idx"]].to_json(
        args.save_path, orient="records", indent=4
    )
# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()