Skip to content

Commit 23c6d07

Browse files
npow and Nissan Pow authored
Terminate early on ENOSPC (#2392)
Running out of disk space should be treated as a fatal error. It is currently treated as a transient error, which leads to retries that can never succeed; the task only dies after all the retries are exhausted. This change makes `ENOSPC` a fatal error so that we exit immediately.

Co-authored-by: Nissan Pow <npow@netflix.com>
1 parent 76bbec9 commit 23c6d07

File tree

2 files changed

+25
-1
lines changed

2 files changed

+25
-1
lines changed

metaflow/plugins/datatools/s3/s3.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import errno
12
import json
23
import os
34
import re
@@ -137,6 +138,10 @@ class MetaflowS3InvalidRange(MetaflowException):
137138
headline = "S3 invalid range"
138139

139140

141+
class MetaflowS3InsufficientDiskSpace(MetaflowException):
142+
headline = "Insufficient disk space"
143+
144+
140145
class S3Object(object):
141146
"""
142147
This object represents a path or an object in S3,
@@ -1377,8 +1382,10 @@ def _one_boto_op(self, op, url, create_tmp_file=True):
13771382
elif error_code == "NoSuchBucket":
13781383
raise MetaflowS3URLException("Specified S3 bucket doesn't exist.")
13791384
error = str(err)
1385+
except OSError as e:
1386+
if e.errno == errno.ENOSPC:
1387+
raise MetaflowS3InsufficientDiskSpace(str(e))
13801388
except Exception as ex:
1381-
# TODO specific error message for out of disk space
13821389
error = str(ex)
13831390
if tmp:
13841391
os.unlink(tmp.name)

metaflow/plugins/datatools/s3/s3op.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import print_function
22

3+
import errno
34
import json
45
import time
56
import math
@@ -108,6 +109,7 @@ def __str__(self):
108109
ERROR_LOCAL_FILE_NOT_FOUND = 10
109110
ERROR_INVALID_RANGE = 11
110111
ERROR_TRANSIENT = 12
112+
ERROR_OUT_OF_DISK_SPACE = 13
111113

112114

113115
def format_result_line(idx, prefix, url="", local=""):
@@ -277,6 +279,17 @@ def op_info(url):
277279
err = convert_to_client_error(e)
278280
handle_client_error(err, idx, result_file)
279281
continue
282+
except OSError as e:
283+
tmp.close()
284+
os.unlink(tmp.name)
285+
if e.errno == errno.ENOSPC:
286+
result_file.write(
287+
"%d %d\n" % (idx, -ERROR_OUT_OF_DISK_SPACE)
288+
)
289+
else:
290+
result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
291+
result_file.flush()
292+
continue
280293
except (SSLError, Exception) as e:
281294
tmp.close()
282295
os.unlink(tmp.name)
@@ -643,6 +656,8 @@ def exit(exit_code, url):
643656
msg = "Local file not found: %s" % url
644657
elif exit_code == ERROR_TRANSIENT:
645658
msg = "Transient error for url: %s" % url
659+
elif exit_code == ERROR_OUT_OF_DISK_SPACE:
660+
msg = "Out of disk space when downloading URL: %s" % url
646661
else:
647662
msg = "Unknown error"
648663
print("s3op failed:\n%s" % msg, file=sys.stderr)
@@ -1173,6 +1188,8 @@ def get(
11731188
)
11741189
if verify:
11751190
verify_info.append((url, sz))
1191+
elif sz == -ERROR_OUT_OF_DISK_SPACE:
1192+
exit(ERROR_OUT_OF_DISK_SPACE, url)
11761193
elif sz == -ERROR_URL_ACCESS_DENIED:
11771194
denied_url = url
11781195
break

0 commit comments

Comments
 (0)