Skip to content

Commit

Permalink
Get OOM errors to stderr and the UI
Browse files Browse the repository at this point in the history
There is a race between Metaflow detecting that a pod failed execution
and the reason for pod failure being set on the pod. As a result,
at times, the failure reason doesn't posted to stderr.

This change makes Metaflow try a little harder to get the reason for
the failures.

For pods that get OOM killed, this change worked just fine.

Without this change, the OOM killed pod would simply die and the user
would have no idea why. With this change, the error on stderr shows:

Task ran out of memory. Increase the available memory by specifying @resource(memory=...) for the step.
  • Loading branch information
shrinandj committed Jan 25, 2024
1 parent a46f4a3 commit 8c750f6
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions metaflow/plugins/kubernetes/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,8 @@ def update_delay(secs_since_start):
sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
return 0.5 + sigmoid * 30.0

start_time = time.time()

def wait_for_launch(job):
status = job.status
echo(
Expand All @@ -349,7 +351,6 @@ def wait_for_launch(job):
job_id=job.id,
)
t = time.time()
start_time = time.time()
while job.is_waiting:
new_status = job.status
if status != new_status or (time.time() - t) > 30:
Expand Down Expand Up @@ -389,7 +390,7 @@ def wait_for_launch(job):
# truncated logs if it doesn't.
# TODO : For hard crashes, we can fetch logs from the pod.

if self._job.has_failed:
def _handle_exit_code():
exit_code, reason = self._job.reason
msg = next(
msg
Expand All @@ -405,8 +406,7 @@ def wait_for_launch(job):
if int(exit_code) == 137:
raise KubernetesException(
"Task ran out of memory. "
"Increase the available memory by specifying "
"@resource(memory=...) for the step. "
"Increase the available memory for the step."
)
if int(exit_code) == 134:
raise KubernetesException("%s (exit code %s)" % (msg, exit_code))
Expand All @@ -416,7 +416,14 @@ def wait_for_launch(job):
"%s. This could be a transient error. Use @retry to retry." % msg
)

if self._job.has_failed:
_handle_exit_code()

exit_code, _ = self._job.reason
if exit_code != 0:
# One more attempt to get the reason; after a small delay
time.sleep(time.time() - start_time)
_handle_exit_code()
echo(
"Task finished with exit code %s." % exit_code,
"stderr",
Expand Down

0 comments on commit 8c750f6

Please sign in to comment.