Skip to content

Commit c388af1

Browse files
committed
feat: add sigterm handling for graceful termination
1 parent d4f9e38 commit c388af1

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

axlearn/common/trainer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ def __init__(
352352
model=self.model,
353353
model_param_partition_specs=model_param_partition_specs,
354354
)
355+
register_sigterm_handler()
355356
self._maybe_record_event(measurement.Event.END_ACCELERATOR_INIT)
356357

357358
@property
@@ -1450,3 +1451,12 @@ def m_or_g(x, suffix=""):
14501451
logging.warning("Attempt to parse cost_stats=%s but failed.", cost_stats)
14511452

14521453
return analysis_results
1454+
1455+
def register_sigterm_handler():
    """Installs a SIGTERM handler that exits immediately when the host is shutting down.

    The handler first chains to whatever SIGTERM disposition was in effect when this
    function was called, then checks for a `nologin` marker file. If the marker exists,
    it raises `SystemExit` so the process does not wait for checkpoint saving.

    Registration is best-effort: Python only allows installing signal handlers from the
    main thread of the main interpreter, so when called from elsewhere this logs a warning
    and leaves the existing disposition untouched.
    """
    original_sigterm_handler = signal.getsignal(signal.SIGTERM)

    def sigterm_handler(signum, frame):
        # `signal.getsignal` may return SIG_DFL, SIG_IGN or None, none of which is callable.
        if callable(original_sigterm_handler):
            original_sigterm_handler(signum, frame)
        elif original_sigterm_handler == signal.SIG_DFL:
            # Emulate the default disposition (terminate) instead of swallowing the signal:
            # restore the default action and re-deliver the signal to ourselves.
            signal.signal(signum, signal.SIG_DFL)
            os.kill(os.getpid(), signum)
        # SIG_IGN, or None (handler not installed from Python): nothing to chain to.

        # System is being shut down.
        # NOTE(review): presumably the nologin file is created by the init system shortly
        # before shutdown -- confirm for the target platform.
        if os.path.exists("/var/run/nologin") or os.path.exists("/run/nologin"):
            raise SystemExit(
                "Exiting without waiting checkpoint saving after system shutdown is detected."
            )

    try:
        # Without this call the handler above is never invoked.
        signal.signal(signal.SIGTERM, sigterm_handler)
    except ValueError:
        # Raised when not called from the main thread of the main interpreter.
        logging.warning("Unable to register SIGTERM handler outside of the main thread.")

0 commit comments

Comments
 (0)