[5b4ecd]: / evaluation / finetune_preprocessing.py

Download this file

102 lines (87 with data), 4.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
"""
This script implements the main pipeline of data preprocessing for fine-tuning.
"""
import argparse
import os
import sys
import sys
sys.path.append('../')
from benchmarks import benchmark_factory
from benchmarks import load_instruction
# Parent of the directory that contains this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(_SCRIPT_DIR)

# Per-benchmark settings, keyed by benchmark name:
#   'partition'    - partition name associated with the benchmark
#   'instructions' - base name of the instruction prompt to load
#   'cot_col'      - (optional) column passed as `cot_column` when
#                    chain-of-thought is enabled
INSTRUCTIONS = {
    'medmcqa': {
        'partition': 'validation',
        'instructions': 'medmcqa',
        'cot_col': 'exp',
    },
    'pubmedqa': {
        'partition': 'validation',
        'instructions': 'pubmedqa',
        'cot_col': 'LONG_ANSWER',
    },
    'medqa': {
        'partition': 'test',
        'instructions': 'medqa',
    },
    'medicationqa': {
        'partition': '',
        'instructions': 'open_question',
    },
    'mmlu_medical': {
        'partition': 'test',
        'instructions': 'medmcqa',
    },
    'mmlu_general': {
        'partition': 'test',
        'instructions': 'medmcqa',
    },
    'gsm8k': {
        'partition': 'test',
        'instructions': 'gsm8k',
        'cot_col': 'steps',
    },
}
def benchmark_preparation(data_obj, partition, args):
    """
    Loads, preprocesses and attaches an instruction prompt to one partition of a benchmark.

    :param data_obj: benchmark.Benchmark, the benchmark to run the preparation pipeline on
    :param partition: str, the partition to run the preparation pipeline on
    :param args: argparse.Namespace, the arguments to run the preparation pipeline
        (reads `benchmark`, `tags` and `cot`)
    :return: str, the name of the prompt that was loaded and applied
    """
    # Load & preprocess [partition] data
    data_obj.load_data(partition=partition)
    data_obj.preprocessing(partition=partition)

    # Build the prompt name: base name, then the optional '_tags' and '_cot' suffixes
    benchmark_config = INSTRUCTIONS[args.benchmark]
    tags_suffix = '_tags' if args.tags else ''
    cot_suffix = '_cot' if args.cot else ''
    prompt_name = benchmark_config['instructions'] + tags_suffix + cot_suffix
    print(f'Prompt used for evaluation: {prompt_name}')

    instruction = load_instruction(prompt_name)
    print(f'Instruction used for evaluation: \n\t{instruction["system"]}\n\t{instruction["user"]}\n')

    # Adding instruction; the chain-of-thought column is only forwarded when
    # chain-of-thought is requested (it may be None for benchmarks without one)
    instruction_kwargs = {'instruction': instruction, 'partition': partition}
    if args.cot:
        instruction_kwargs['cot_column'] = benchmark_config.get('cot_col', None)
    data_obj.add_instruction(**instruction_kwargs)

    return prompt_name
def main(args):
    """
    Preprocesses one split of a benchmark and stores it as a JSON-lines file for fine-tuning.

    The output is written to
    <ROOT_DIR>/benchmarks/ft_preprocessed/<benchmark>_<prompt_name>_<split>.jsonl.

    :param args: argparse.Namespace, the parsed command-line arguments. `benchmark` and
        `split` are read here; the whole namespace is forwarded to benchmark_preparation.
    :raises ValueError: if `args.split` is not one of 'train', 'validation' or 'test'.
    """
    # Attribute of the benchmark object that holds the data of each supported split.
    split_attributes = {
        'train': 'train_data',
        'validation': 'validation_data',
        'test': 'test_data',
    }
    # Fail fast: an unsupported split would otherwise run the whole pipeline,
    # write nothing, and still report success.
    if args.split not in split_attributes:
        raise ValueError(
            'Unsupported split {!r}; expected one of: {}'.format(
                args.split, ', '.join(split_attributes)))

    # Instantiate benchmark and load data
    data_obj = benchmark_factory(args.benchmark)
    prompt_name = benchmark_preparation(data_obj, args.split, args)

    # Store preprocessed data
    file_name = '{}_{}_{}.jsonl'.format(args.benchmark, prompt_name, args.split)
    destination_file = os.path.join(ROOT_DIR, 'benchmarks', 'ft_preprocessed', file_name)
    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(destination_file), exist_ok=True)
    split_data = getattr(data_obj, split_attributes[args.split])
    split_data.to_json(destination_file, orient='records', lines=True)
    print('Preprocessed Benchmark {} ready for fine-tuning. \nPath: {}'.format(file_name, destination_file))
# Command-line entry point: parse the options and run the preprocessing pipeline.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Restricted to the splits that main() knows how to store.
    parser.add_argument('--split',
                        type=str,
                        default="train",
                        choices=['train', 'validation', 'test'],
                        help="The name of the split to preprocess for fine-tuning.")
    parser.add_argument('--benchmark',
                        type=str,
                        default="medmcqa",
                        help="The name of the dataset used for the fine-tuning.")
    parser.add_argument('--cot',
                        action='store_true',
                        help="Whether to use the chain-of-thought in fine-tuning.")
    # '--tags' matches the other long options; the original single-dash
    # spelling '-tags' is kept as an alias so existing invocations still work.
    parser.add_argument('--tags', '-tags',
                        action='store_true',
                        help="Whether to use tags for the fine-tuning.")
    args = parser.parse_args()
    main(args)