Diff of /utils/aws/resume.py [000000] .. [190ca4]

Switch to unified view

a b/utils/aws/resume.py
1
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
2
# Usage: $ python utils/aws/resume.py
3
4
import os
5
import sys
6
from pathlib import Path
7
8
import torch
9
import yaml
10
11
# Resolve this script's absolute location, then step two directory levels above the
# folder that contains it to reach the repository root.
FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]  # YOLOv5 root directory
if all(entry != str(ROOT) for entry in sys.path):
    sys.path.append(str(ROOT))  # make modules under ROOT importable
15
16
def _parse_devices(device):
    """Return the device ids listed in a comma-separated ``--device`` value.

    Blank entries are dropped, so an empty value yields ``[]``. A bare
    ``''.split(',')`` returns ``['']``, which would be miscounted as one device.
    """
    text = '' if device is None else str(device)
    return [entry.strip() for entry in text.split(',') if entry.strip()]


def _resume_command(last, nproc=1, port=None):
    """Build the shell command that resumes training from checkpoint ``last``.

    Args:
        last: Path to the ``last.pt`` checkpoint to resume from.
        nproc: Worker processes per node; only used for the distributed launch.
        port: Master port for ``torch.distributed.run``; ``None`` selects the plain
            single-process ``train.py`` launch.

    Returns:
        The command string, with output discarded and a trailing ``&`` so the shell
        starts it in the background.
    """
    import shlex  # local import: only this helper needs it

    target = shlex.quote(str(last))  # keep paths with spaces/metacharacters as one argument
    if port is None:
        cmd = f'python train.py --resume {target}'
    else:
        cmd = f'python -m torch.distributed.run --nproc_per_node {nproc} --master_port {port} train.py --resume {target}'
    return cmd + ' > /dev/null 2>&1 &'


# Walk the current working directory for last.pt checkpoints and relaunch every run
# whose checkpoint still carries optimizer state.
port = 0  # --master_port
# NOTE(review): ports start at 1 here; low ports may need elevated privileges - confirm.
path = Path('').resolve()
for last in path.rglob('*/**/last.pt'):
    # The checkpoint is only inspected here, so keep its tensors on the CPU.
    ckpt = torch.load(last, map_location='cpu')
    if ckpt['optimizer'] is None:
        continue  # no optimizer state saved, nothing to resume

    # Load opt.yaml from the directory above the one holding last.pt
    with open(last.parent.parent / 'opt.yaml', errors='ignore') as f:
        opt = yaml.safe_load(f)

    # Get device count
    d = _parse_devices(opt['device'])  # devices
    nd = len(d)  # number of devices
    if nd == 0:
        # No explicit device list: fall back to every visible CUDA device.
        nd = torch.cuda.device_count()
    ddp = nd > 1  # distributed data parallel

    if ddp:  # multi-GPU
        port += 1  # give each distributed job its own master port
        cmd = _resume_command(last, nproc=nd, port=port)
    else:  # single-GPU
        cmd = _resume_command(last)

    # The command redirects output to /dev/null and runs as a background shell job.
    print(cmd)
    os.system(cmd)