--- a +++ b/yolov5/utils/aws/resume.py @@ -0,0 +1,40 @@ +# Resume all interrupted trainings in yolov5/ dir including DDP trainings +# Usage: $ python utils/aws/resume.py + +import os +import sys +from pathlib import Path + +import torch +import yaml + +FILE = Path(__file__).resolve() +ROOT = FILE.parents[2] # YOLOv5 root directory +if str(ROOT) not in sys.path: + sys.path.append(str(ROOT)) # add ROOT to PATH + +port = 0 # --master_port +path = Path('').resolve() +for last in path.rglob('*/**/last.pt'): + ckpt = torch.load(last) + if ckpt['optimizer'] is None: + continue + + # Load opt.yaml + with open(last.parent.parent / 'opt.yaml', errors='ignore') as f: + opt = yaml.safe_load(f) + + # Get device count + d = opt['device'].split(',') # devices + nd = len(d) # number of devices + ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel + + if ddp: # multi-GPU + port += 1 + cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}' + else: # single-GPU + cmd = f'python train.py --resume {last}' + + cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread + print(cmd) + os.system(cmd)