utils/aws/resume.py

# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
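
# Assumed layout (standard YOLOv5 convention, stated here for clarity): each run
# writes opt.yaml and weights/last.pt under its own directory, e.g.
# runs/train/exp*/weights/last.pt; the search starts from the current directory.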

import os
import sys
from pathlib import Path

import torch
import yaml

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH

port = 29400  # base --master_port, bumped per DDP job (ports below 1024 are privileged)
path = Path.cwd()  # search below the current working directory
for last in path.rglob('*/**/last.pt'):
    ckpt = torch.load(last, map_location='cpu')  # inspect on CPU; no GPU needed here
    if ckpt['optimizer'] is None:
        continue
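
    # A None optimizer marks a finished run: YOLOv5's strip_optimizer() nulls the
    # optimizer state on completion, so only interrupted runs get past the check above.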

    # Load the run's opt.yaml (two levels above weights/last.pt) to recover its options
    with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
        opt = yaml.safe_load(f)

    # Get device count; drop empty strings so an unset device ('') yields nd == 0
    devices = [x for x in opt['device'].split(',') if x]  # e.g. '0,1' -> ['0', '1']
    nd = len(devices)  # number of requested devices
    ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1)  # resume under DDP?
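
    # Example: device '0,1' gives nd == 2, a two-GPU DDP resume; device '' on a
    # multi-GPU machine gives nd == 0, a DDP resume across all visible GPUs.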
    if ddp:  # multi-GPU
        port += 1
        nproc = nd or torch.cuda.device_count()  # nd == 0 means 'use every visible GPU'
        cmd = f'python -m torch.distributed.run --nproc_per_node {nproc} --master_port {port} train.py --resume {last}'
    else:  # single-GPU
        cmd = f'python train.py --resume {last}'

    cmd += ' > /dev/null 2>&1 &'  # discard output and run in the background
    print(cmd)
    os.system(cmd)
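
# Illustrative output (run names assumed), one backgrounded command per resumable run:
#   python train.py --resume runs/train/exp/weights/last.pt > /dev/null 2>&1 &
#   python -m torch.distributed.run --nproc_per_node 2 --master_port 29401 train.py --resume runs/train/exp2/weights/last.pt > /dev/null 2>&1 &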