Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
b877fc9
Add parameter to change the preemption signal
Queuecumber Sep 9, 2022
25c0c3f
Make the signal connector use the custom signal from SLURMEnvironment
Queuecumber Sep 9, 2022
85402b0
Add a unit test
Queuecumber Sep 9, 2022
84ac410
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 9, 2022
f09bb21
Fix mypy errors
Queuecumber Sep 9, 2022
606b829
Fix error in test
Queuecumber Sep 9, 2022
9ea7e5b
Add documentation for the new signal flag
Queuecumber Sep 9, 2022
8885420
Update the changelog
Queuecumber Sep 9, 2022
94bedc3
Fix malformed example code
Queuecumber Sep 9, 2022
2d43ac7
Signal set by the user could be garbage
Queuecumber Sep 9, 2022
ecd046f
Fix whitespace in changelog
Queuecumber Sep 9, 2022
747baf6
Add test case for a garbage signal set by the user
Queuecumber Sep 9, 2022
b2ab4dc
One more fix fort test case
Queuecumber Sep 9, 2022
3bf0d25
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 9, 2022
a1d3984
Fix typo in update docs
Queuecumber Sep 11, 2022
1b4f5ac
Change environment signal to be a real signal
Queuecumber Sep 11, 2022
e71a113
Simplified logic by using a real signal object in the environment class
Queuecumber Sep 11, 2022
06b6ef3
Updated docs
Queuecumber Sep 11, 2022
2460055
Update tests with new format
Queuecumber Sep 11, 2022
2583017
Remove mostly duplicated test
Queuecumber Sep 11, 2022
a0eca33
Fix incorrect tuple in test case
Queuecumber Sep 12, 2022
91e69a9
Move documentation of new flag
Queuecumber Sep 12, 2022
da5cb59
Merge branch 'master' into master
Borda Sep 12, 2022
3f77bde
handle windows not supporting sigusr1
awaelchli Sep 12, 2022
cac6893
Update src/pytorch_lightning/trainer/connectors/signal_connector.py
awaelchli Sep 12, 2022
d9dd583
fix broken GH suggestion
awaelchli Sep 12, 2022
2abe4a8
skip access to SIGUSR1 on windows in test
awaelchli Sep 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/source-pytorch/clouds/cluster_advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,18 @@ To get this behavior make sure to add the correct signal to your SLURM script
# 90 seconds before training ends
#SBATCH --signal=SIGUSR1@90

You can change this signal if your environment requires the use of a different one, for example:

.. code-block:: bash

#SBATCH --signal=SIGHUP@90

Then, when you create your trainer, pass the ``requeue_signal`` option to the :class:`~pytorch_lightning.plugins.environments.slurm_environment.SLURMEnvironment` plugin:

.. code-block:: python

trainer = Trainer(plugins=[SLURMEnvironment(requeue_signal=signal.SIGHUP)])

If auto-resubmit is not desired, it can be turned off in the :class:`~pytorch_lightning.plugins.environments.slurm_environment.SLURMEnvironment` plugin:

.. code-block:: python
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@
import logging
import os
import re
import signal
from typing import Optional

from lightning_lite.plugins.environments.cluster_environment import ClusterEnvironment
from lightning_lite.utilities.imports import _IS_WINDOWS

log = logging.getLogger(__name__)

Expand All @@ -28,11 +30,16 @@ class SLURMEnvironment(ClusterEnvironment):
Args:
auto_requeue: Whether automatic job resubmission is enabled or not. How and under which conditions a job gets
rescheduled gets determined by the owner of this plugin.
requeue_signal: The signal that SLURM will send to indicate that the job should be requeued. Defaults to
``SIGUSR1`` on Unix. On Windows no default is applied and the value remains ``None``.
"""

def __init__(self, auto_requeue: bool = True) -> None:
def __init__(self, auto_requeue: bool = True, requeue_signal: Optional[signal.Signals] = None) -> None:
    """Store the requeue settings for this environment.

    Args:
        auto_requeue: Flag stored on the instance as ``auto_requeue``.
        requeue_signal: Signal stored on the instance as ``requeue_signal``. When omitted (``None``),
            ``signal.SIGUSR1`` is used unless ``_IS_WINDOWS`` is set, in which case it stays ``None``.
    """
    super().__init__()
    self.auto_requeue = auto_requeue
    # Only substitute the default when the caller gave no signal and we are not on Windows;
    # the conditional expression keeps ``signal.SIGUSR1`` from being evaluated otherwise.
    needs_default = requeue_signal is None and not _IS_WINDOWS
    self.requeue_signal = signal.SIGUSR1 if needs_default else requeue_signal

@property
def creates_processes_externally(self) -> bool:
Expand Down
4 changes: 4 additions & 0 deletions src/pytorch_lightning/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

## [unReleased] - 2022-MM-DD


- Added an option to configure the signal SLURM sends when a job is preempted or requeued ([#14610](https://github.com/Lightning-AI/lightning/issues/14610))


### Added


Expand Down
18 changes: 11 additions & 7 deletions src/pytorch_lightning/trainer/connectors/signal_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, trainer: "pl.Trainer") -> None:
def register_signal_handlers(self) -> None:
self._original_handlers = self._get_current_signal_handlers()

sigusr1_handlers: List[_HANDLER] = []
sigusr_handlers: List[_HANDLER] = []
sigterm_handlers: List[_HANDLER] = []

if _fault_tolerant_training():
Expand All @@ -51,19 +51,23 @@ def register_signal_handlers(self) -> None:
environment = self.trainer._accelerator_connector.cluster_environment
if isinstance(environment, SLURMEnvironment) and environment.auto_requeue:
log.info("SLURM auto-requeueing enabled. Setting signal handlers.")
sigusr1_handlers.append(self.slurm_sigusr1_handler_fn)
sigusr_handlers.append(self.slurm_sigusr_handler_fn)
sigterm_handlers.append(self.sigterm_handler_fn)

# signal.SIGUSR1 doesn't seem available on windows
# Windows seems to have signal incompatibilities
if not self._is_on_windows():
if sigusr1_handlers and not self._has_already_handler(signal.SIGUSR1):
self._register_signal(signal.SIGUSR1, HandlersCompose(sigusr1_handlers))
sigusr = environment.requeue_signal if isinstance(environment, SLURMEnvironment) else signal.SIGUSR1

assert sigusr is not None

if sigusr_handlers and not self._has_already_handler(sigusr):
self._register_signal(sigusr, HandlersCompose(sigusr_handlers))

if sigterm_handlers and not self._has_already_handler(signal.SIGTERM):
self._register_signal(signal.SIGTERM, HandlersCompose(sigterm_handlers))

def slurm_sigusr1_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None:
rank_zero_info("handling SIGUSR1")
def slurm_sigusr_handler_fn(self, signum: _SIGNUM, frame: FrameType) -> None:
rank_zero_info("handling auto-requeue signal")

# save logger to make sure we get all the metrics
for logger in self.trainer.loggers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pytest

from lightning_lite.plugins.environments import SLURMEnvironment
from lightning_lite.utilities.imports import _IS_WINDOWS
from pytorch_lightning import Trainer
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.trainer.connectors.signal_connector import SignalConnector
Expand Down Expand Up @@ -78,8 +79,9 @@ def training_step(self, batch, batch_idx):

@RunIf(skip_windows=True)
@pytest.mark.parametrize("auto_requeue", (True, False))
def test_auto_requeue_flag(auto_requeue):
trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue)])
@pytest.mark.parametrize("requeue_signal", [signal.SIGUSR1, signal.SIGUSR2, signal.SIGHUP] if not _IS_WINDOWS else [])
def test_auto_requeue_custom_signal_flag(auto_requeue, requeue_signal):
trainer = Trainer(plugins=[SLURMEnvironment(auto_requeue=auto_requeue, requeue_signal=requeue_signal)])
connector = SignalConnector(trainer)
connector.register_signal_handlers()

Expand All @@ -88,12 +90,12 @@ def test_auto_requeue_flag(auto_requeue):
assert len(sigterm_handlers) == 1
assert sigterm_handlers[0].__qualname__ == "SignalConnector.sigterm_handler_fn"

sigusr1_handlers = signal.getsignal(signal.SIGUSR1).signal_handlers
assert len(sigusr1_handlers) == 1
assert sigusr1_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr1_handler_fn"
sigusr_handlers = signal.getsignal(requeue_signal).signal_handlers
assert len(sigusr_handlers) == 1
assert sigusr_handlers[0].__qualname__ == "SignalConnector.slurm_sigusr_handler_fn"
else:
assert signal.getsignal(signal.SIGTERM) is signal.SIG_DFL
assert signal.getsignal(signal.SIGUSR1) is signal.SIG_DFL
assert signal.getsignal(requeue_signal) is signal.SIG_DFL

connector.teardown()

Expand Down