gap-replay/pubmed/download.sh

#!/bin/bash
# PubMed Abstracts + Papers downloader
# Load, pre-process, and split the PubMed data for the GAP-Replay dataset.
# Comment out the steps you don't need and adjust the paths as desired.
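# Fail fast: abort on the first failing step and treat unset variables as errors.
set -euo pipefail
# Require GAP_DATA_DIR to be set before any step runs.
: "${GAP_DATA_DIR:?Set GAP_DATA_DIR to your data directory}"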
# 1. Load data: fetch S2ORC and Abstracts, and join both with metadata from Papers. After this step,
# - Abstracts with metadata are stored in /data/abstracts-PubMed_metadata.jsonl
# - Full-text articles with metadata are stored in /data/s2orc-PubMed_metadata.jsonl
# NOTE: You will need to provide your Semantic Scholar API key.
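# keys.json holds the Semantic Scholar API key; the exact schema is defined by
# load.py. A plausible layout (field name hypothetical):
#   { "semantic_scholar_api_key": "YOUR_KEY" }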
python pubmed/load.py \
--dataset all \
--data_path "$GAP_DATA_DIR" \
--key_path pubmed/keys.json
# 2. Deduplication: remove abstracts for which we already have full-text articles
python pubmed/process.py \
--deduplicate \
--source_path /data/abstracts-PubMed_metadata.jsonl \
--save_path /data/abstracts-PubMed_dedup.jsonl
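# Optional sanity check: the deduplicated file should contain no more records
# (one JSON object per line) than the raw abstracts dump.
# wc -l /data/abstracts-PubMed_metadata.jsonl /data/abstracts-PubMed_dedup.jsonl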
# 3. Clean PubMed abstracts and full-text articles
python pubmed/process.py \
--dataset s2orc \
--source_path /data/s2orc-PubMed_metadata.jsonl \
--save_path /data/s2orc-PubMed_processed.jsonl
python pubmed/process.py \
--dataset abstracts \
--source_path /data/abstracts-PubMed_dedup.jsonl \
--save_path /data/abstracts-PubMed_processed.jsonl
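# Optional: inspect one cleaned record to confirm the expected JSON shape.
# head -n 1 /data/s2orc-PubMed_processed.jsonl | python -m json.tool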
# 4. Train-test split for each dataset
python pubmed/process.py \
--train_test_split \
--source_path /data/s2orc-PubMed_processed.jsonl \
--split_ratio 0.03
python pubmed/process.py \
--train_test_split \
--source_path /data/abstracts-PubMed_processed.jsonl \
--split_ratio 0.03
python pubmed/process.py \
--train_test_split \
--source_path /data/guidelines.jsonl \
--split_ratio 0.05
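# Verify every stage produced output (split file naming is determined by
# process.py; check its --help or source for the exact suffixes):
# ls -lh /data/*PubMed*.jsonl /data/guidelines*.jsonl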