diff --git a/trainer/app_code/yolov5_trainer.py b/trainer/app_code/yolov5_trainer.py index 505b7a0..1e530fb 100644 --- a/trainer/app_code/yolov5_trainer.py +++ b/trainer/app_code/yolov5_trainer.py @@ -17,6 +17,7 @@ PointDetection, PretrainedModel, TrainingStateData) from learning_loop_node.trainer import trainer_logic +from learning_loop_node.trainer.exceptions import NodeNeedsRestartError from learning_loop_node.trainer.executor import Executor from . import batch_size_calculation, model_files, yolov5_format @@ -156,7 +157,11 @@ async def _detect( await executor.start(cmd) if await executor.wait() != 0: - logging.error(f'Error during detecting: \n {executor.get_log()}') + executor_log = executor.get_log() + logging.error(f'Error during detecting: \n {executor_log}') + if 'CUDA out of memory' in executor_log or 'No CUDA GPUs are available' in executor_log: + raise NodeNeedsRestartError() + raise Exception('Error during detecting') logging.info('Start parsing detections') diff --git a/trainer/pred_det.py b/trainer/pred_det.py index d65404c..34539c0 100644 --- a/trainer/pred_det.py +++ b/trainer/pred_det.py @@ -182,7 +182,12 @@ def main(opt): if __name__ == "__main__": - torch.cuda.init() + try: + torch.cuda.init() + except RuntimeError as e: + print(e) + sys.exit(1) + torch.cuda.empty_cache() opt = parse_opt() main(opt)