#!/bin/bash
# Tokenize GAP-Replay (Train + Test)
# You will need to clone the Megatron-LLM repository (https://github.com/epfLLM/Megatron-LLM)
# and point TRAINER_PATH below at your local checkout.
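# For example (the destination directory is an assumption; match it to TRAINER_PATH below):
#   git clone https://github.com/epfLLM/Megatron-LLM.git /model-parallel-trainer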
echo "Tokenize GAP-Replay (Train + Test)"
TRAINER_PATH=/model-parallel-trainer
echo "Trainer path: $TRAINER_PATH"
DATA_PATH=/data/gap-replay
echo "GAP-Replay path: $DATA_PATH"
python $TRAINER_PATH/tools/preprocess_data.py \
    --input $DATA_PATH/gap_replay_train.jsonl \
    --output_prefix /path/to/output/dir \
    --vocab_file /path/to/tokenizer.model \
    --dataset_impl mmap \
    --tokenizer_type SentencePieceTokenizer \
    --vocab_extra_ids_list "[bib_ref],[/bib_ref],[fig_ref],[/fig_ref],[bib],[/bib],[fig],[/fig],[table],[/table],[formula],[/formula]" \
    --json_keys "text" \
    --workers 8 \
    --chunk_size 2048 \
    --append_eod
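# The step above should write an indexed binary pair for the "text" key
# (typically <output_prefix>_text_document.bin/.idx; exact naming may vary
# with the Megatron-LLM version).

# Tokenize the test split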
python $TRAINER_PATH/tools/preprocess_data.py \
    --input $DATA_PATH/gap_replay_test.jsonl \
    --output_prefix /path/to/output/dir \
    --vocab_file /path/to/tokenizer.model \
    --dataset_impl mmap \
    --tokenizer_type SentencePieceTokenizer \
    --vocab_extra_ids_list "[bib_ref],[/bib_ref],[fig_ref],[/fig_ref],[bib],[/bib],[fig],[/fig],[table],[/table],[formula],[/formula]" \
    --json_keys "text" \
    --workers 8 \
    --chunk_size 2048 \
    --append_eod