Skip to content
Merged
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed an issue where DDP subprocesses that used Hydra would set Hydra's working directory to the current directory ([#18145](https://github.com/Lightning-AI/lightning/pull/18145))


- Fixed an issue where running on TPUs would select the wrong device index ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:

def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
import __main__ # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
from hydra.core.hydra_config import HydraConfig
from hydra.utils import get_original_cwd, to_absolute_path

# when user is using hydra find the absolute path
Expand All @@ -154,6 +155,8 @@ def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
command += sys.argv[1:]

cwd = get_original_cwd()
os_cwd = f'"{os.getcwd()}"'
command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
run_dir = f'"{HydraConfig.get().run.dir}"'
command += [f"hydra.run.dir={run_dir}", f"hydra.job.name=train_ddp_process_{local_rank}"]
subdir = "null" if HydraConfig.get().output_subdir is None else f".pl_ddp_hydra_{local_rank}"
command += [f"hydra.output_subdir={subdir}"]
return command, cwd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import subprocess
import sys
from pathlib import Path
from unittest.mock import Mock

import pytest
Expand All @@ -13,6 +14,7 @@

if _HYDRA_WITH_RUN_PROCESS:
from hydra.test_utils.test_utils import run_process
from omegaconf import OmegaConf


# Script to run from command line
Expand Down Expand Up @@ -48,7 +50,7 @@ def task_fn(cfg):

@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
@pytest.mark.parametrize("subdir", [None, "null", "dksa", ".hello"])
def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
monkeypatch.chdir(tmpdir)

Expand All @@ -58,11 +60,24 @@ def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):

# Run CLI
devices = 2
cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"']
run_dir = Path(tmpdir) / "hydra_output"
cmd = [sys.executable, "temp.py", f"+devices={devices}", '+strategy="ddp"', f"hydra.run.dir={run_dir}"]
if subdir is not None:
cmd += [f"hydra.output_subdir={subdir}"]
run_process(cmd)

# Make sure config.yaml was created for additional processes iff subdir is present.
saved_confs = list(run_dir.glob("**/config.yaml"))
assert len(saved_confs) == (0 if subdir == "null" else devices)

if saved_confs: # Make sure the parameter was set and used
cfg = OmegaConf.load(saved_confs[0])
assert cfg.devices == devices

# Make sure PL spawned jobs that are logged by Hydra
logs = list(run_dir.glob("**/*.log"))
assert len(logs) == devices


def test_kill():
launcher = _SubprocessScriptLauncher(Mock(), 1, 1)
Expand Down