
Commit 7d8d21b

awaelchli authored and Borda committed
Revert new Hydra launch behavior (#15737)
* revert new hydra cwd behavior
* remove debug statements
* changelog

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
(cherry picked from commit 88b2e5a)
1 parent: 23ec3c4 · commit: 7d8d21b
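In effect, the revert drops the pickled-config / `--experimental-rerun` relaunch path and returns to re-running each child rank with the original CLI arguments from the pre-Hydra working directory. Below is a minimal sketch of the restored flow, condensed from the diffs that follow; the standalone function framing is illustrative, not part of the codebase:

```python
# Condensed sketch of the restored Hydra child-launch flow (illustrative;
# the real logic lives in _call_children_scripts / _hydra_subprocess_cmd below).
import os
import subprocess
import sys


def launch_hydra_child(local_rank: int, env_copy: dict) -> None:
    from hydra.utils import get_original_cwd

    # replay the parent's script and CLI arguments
    command = [sys.executable, sys.argv[0], *sys.argv[1:]]
    # pin the child's Hydra run dir to the parent's current (output) directory
    # and give each rank a distinguishable job name
    command += [f'hydra.run.dir="{os.getcwd()}"', f"hydra.job.name=train_ddp_process_{local_rank}"]
    # spawn from the directory the user originally launched from
    subprocess.Popen(command, env=env_copy, cwd=get_original_cwd())
```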

5 files changed (+22 / -142 lines)

src/lightning_lite/strategies/launchers/subprocess_script.py

Lines changed: 11 additions & 28 deletions
```diff
@@ -14,8 +14,7 @@
 import os
 import subprocess
 import sys
-from time import sleep
-from typing import Any, Callable, Sequence
+from typing import Any, Callable, Optional, Sequence, Tuple
 
 import numpy as np
 from lightning_utilities.core.imports import RequirementCache
@@ -116,15 +115,16 @@ def _call_children_scripts(self) -> None:
             # start process
             # if hydra is available and initialized, make sure to set the cwd correctly
             hydra_in_use = False
+            cwd: Optional[str] = None
             if _HYDRA_AVAILABLE:
                 from hydra.core.hydra_config import HydraConfig
 
                 hydra_in_use = HydraConfig.initialized()
             if hydra_in_use:
-                command = _hydra_subprocess_cmd(local_rank=local_rank)
+                command, cwd = _hydra_subprocess_cmd(local_rank=local_rank)
             else:
                 command = _basic_subprocess_cmd()
-            subprocess.Popen(command, env=env_copy)
+            subprocess.Popen(command, env=env_copy, cwd=cwd)
 
             # starting all processes at once can cause issues
             # with dataloaders delay between 1-10 seconds
@@ -149,36 +149,19 @@ def _basic_subprocess_cmd() -> Sequence[str]:
     return [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]
 
 
-def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]:
+def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
-    from hydra.core.hydra_config import HydraConfig
-    from hydra.utils import to_absolute_path
+    from hydra.utils import get_original_cwd, to_absolute_path
 
     # when user is using hydra find the absolute path
     if __main__.__spec__ is None:  # pragma: no-cover
         command = [sys.executable, to_absolute_path(sys.argv[0])]
     else:
         command = [sys.executable, "-m", __main__.__spec__.name]
 
-    # extract the hydra configuration
-    hydra_cfg = HydraConfig.get()
+    command += sys.argv[1:]
 
-    # the location of the hydra configuration files saved for the current job
-    hydra_output = hydra_cfg.runtime.output_dir
-    if hydra_cfg.output_subdir is not None:
-        hydra_output = os.path.join(hydra_output, hydra_cfg.output_subdir)
-
-    # check if experimental re-run capability exists
-    # otherwise use existing config.yaml which may have issues
-    pickled_config = os.path.join(hydra_output, "config.pickle")
-    if os.path.exists(pickled_config):
-        command += ["--experimental-rerun", pickled_config]
-
-    else:
-        command += ["-cp", hydra_output, "-cn", "config.yaml"]
-        command += [
-            f"hydra.output_subdir=.pl_ddp_hydra_{local_rank}",
-            f"hydra.run.dir={hydra_cfg.runtime.output_dir}",
-        ]
-
-    return command
+    cwd = get_original_cwd()
+    os_cwd = f'"{os.getcwd()}"'
+    command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
+    return command, cwd
```
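For illustration, with a hypothetical Hydra app started as `python train.py +devices=2` from `/home/user/project`, the restored helper would return roughly the following; all concrete paths, including the timestamped run dir, are made up for the example:

```python
# Hypothetical return value of _hydra_subprocess_cmd(local_rank=1); every
# concrete path below is an assumption for illustration only.
command = [
    "/usr/bin/python",                 # sys.executable
    "/home/user/project/train.py",     # to_absolute_path(sys.argv[0])
    "+devices=2",                      # original CLI args, replayed verbatim
    'hydra.run.dir="/home/user/project/outputs/2022-11-21/12-00-00"',
    "hydra.job.name=train_ddp_process_1",
]
cwd = "/home/user/project"             # get_original_cwd()
```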

src/pytorch_lightning/CHANGELOG.md

Lines changed: 1 addition & 5 deletions
```diff
@@ -13,7 +13,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
--
+- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737))
 
 
 ### Fixed
@@ -79,7 +79,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added a sanity check that scripts are executed with the `srun` command in SLURM and that environment variables are not conflicting ([#15011](https://github.com/Lightning-AI/lightning/pull/15011))
 - Added an error message when attempting to launch processes with `python -i` and an interactive-incompatible strategy ([#15293](https://github.com/Lightning-AI/lightning/pull/15293))
 
-
 ### Changed
 
 - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892))
@@ -107,7 +106,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - To avoid issues with forking processes, from PyTorch 1.13 and higher, Lightning will directly use the PyTorch NVML-based check for `torch.cuda.device_count` and from PyTorch 1.14 and higher, Lightning will configure PyTorch to use an NVML-based check for `torch.cuda.is_available`. ([#15110](https://github.com/Lightning-AI/lightning/pull/15110), [#15133](https://github.com/Lightning-AI/lightning/pull/15133))
 - The `NeptuneLogger` now uses `neptune.init_run` instead of the deprecated `neptune.init` to initialize a run ([#15393](https://github.com/Lightning-AI/lightning/pull/15393))
 
-
 ### Deprecated
 
 - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
@@ -137,7 +135,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `TrainerFn.TUNING`, `RunningStage.TUNING` and `trainer.tuning` property ([#15100](https://github.com/Lightning-AI/lightning/pull/15100))
 - Deprecated custom `pl.utilities.distributed.AllGatherGrad` implementation in favor of PyTorch's ([#15364](https://github.com/Lightning-AI/lightning/pull/15364))
 
-
 ### Removed
 
 - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011))
@@ -195,7 +192,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed the deprecated `LightningDataModule.on_save/load_checkpoint` hooks ([#14909](https://github.com/Lightning-AI/lightning/pull/14909))
 - Removed support for returning a value in `Callback.on_save_checkpoint` in favor of implementing `Callback.state_dict` ([#14835](https://github.com/Lightning-AI/lightning/pull/14835))
 
-
 ### Fixed
 
 - Fixed an issue with `LightningLite.setup()` not setting the `.device` attribute correctly on the returned wrapper ([#14822](https://github.com/Lightning-AI/lightning/pull/14822))
```

src/pytorch_lightning/strategies/launchers/subprocess_script.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -111,17 +111,18 @@ def _call_children_scripts(self) -> None:
             del env_copy["PL_GLOBAL_SEED"]
 
             hydra_in_use = False
+            cwd: Optional[str] = None
             if _HYDRA_AVAILABLE:
                 from hydra.core.hydra_config import HydraConfig
 
                 hydra_in_use = HydraConfig.initialized()
 
             if hydra_in_use:
-                command = _hydra_subprocess_cmd(local_rank)
+                command, cwd = _hydra_subprocess_cmd(local_rank)
             else:
                 command = _basic_subprocess_cmd()
 
-            subprocess.Popen(command, env=env_copy)
+            subprocess.Popen(command, env=env_copy, cwd=cwd)
 
             # starting all processes at once can cause issues
             # with dataloaders delay between 1-10 seconds
```
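Passing the working directory through `Popen` is what keeps the non-Hydra path unchanged: `cwd=None` simply means "inherit the parent's current directory". A quick standalone check, not taken from the diff:

```python
# cwd=None inherits the parent's working directory; a string overrides it.
import subprocess
import sys

probe = [sys.executable, "-c", "import os; print(os.getcwd())"]
subprocess.Popen(probe, cwd=None).wait()    # prints the parent's cwd
subprocess.Popen(probe, cwd="/tmp").wait()  # prints /tmp (POSIX example path)
```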

tests/tests_lite/strategies/launchers/test_subprocess_script.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -84,7 +84,7 @@ def test_subprocess_script_launcher_launch_processes(popen_mock, _):
 @mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen")
 def test_subprocess_script_launcher_hydra_in_use(popen_mock, _, monkeypatch):
     basic_command = Mock(return_value="basic_command")
-    hydra_command = Mock(return_value="hydra_command")
+    hydra_command = Mock(return_value=("hydra_command", "hydra_cwd"))
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_basic_subprocess_cmd", basic_command)
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_hydra_subprocess_cmd", hydra_command)
 
@@ -101,7 +101,7 @@ def simulate_launch():
     # when hydra not available
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_HYDRA_AVAILABLE", False)
     simulate_launch()
-    popen_mock.assert_called_with("basic_command", env=ANY)
+    popen_mock.assert_called_with("basic_command", env=ANY, cwd=None)
     popen_mock.reset_mock()
 
     import hydra
@@ -112,7 +112,7 @@ def simulate_launch():
     HydraConfigMock.initialized.return_value = False
     monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock)
     simulate_launch()
-    popen_mock.assert_called_with("basic_command", env=ANY)
+    popen_mock.assert_called_with("basic_command", env=ANY, cwd=None)
     popen_mock.reset_mock()
 
     # when hydra available and initialized
@@ -121,5 +121,5 @@ def simulate_launch():
     HydraConfigMock.initialized.return_value = True
     monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock)
     simulate_launch()
-    popen_mock.assert_called_with("hydra_command", env=ANY)
+    popen_mock.assert_called_with("hydra_command", env=ANY, cwd="hydra_cwd")
     popen_mock.reset_mock()
```
tests/tests_pytorch/strategies/launchers/test_subprocess_script.py

Lines changed: 3 additions & 103 deletions
```diff
@@ -1,35 +1,17 @@
-import logging
-import os
 import sys
-from pathlib import Path
 
 import pytest
 from lightning_utilities.core.imports import RequirementCache
 
-from pytorch_lightning.strategies.launchers.subprocess_script import _HYDRA_AVAILABLE
 from tests_pytorch.helpers.runif import RunIf
 
 _HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2")
 _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7")
 
-if _HYDRA_AVAILABLE:
-    from omegaconf import OmegaConf
 if _HYDRA_WITH_RUN_PROCESS:
     from hydra.test_utils.test_utils import run_process
 
 
-# fixture to run hydra jobs in a clean temporary directory
-# Hydra creates its own output directories and logs
-@pytest.fixture
-def cleandir(tmp_path):
-    """Run function in a temporary directory."""
-    old_dir = os.getcwd()  # get current working directory (cwd)
-    os.chdir(tmp_path)  # change cwd to the temp-directory
-    yield tmp_path  # yields control to the test to be run
-    os.chdir(old_dir)
-    logging.shutdown()
-
-
 # Script to run from command line
 script = """
 import hydra
@@ -64,7 +46,9 @@ def task_fn(cfg):
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
 @pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
-def test_ddp_with_hydra_runjob(cleandir, subdir):
+def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
+    monkeypatch.chdir(tmpdir)
+
     # Save script locally
     with open("temp.py", "w") as fn:
         fn.write(script)
@@ -75,87 +59,3 @@ def test_ddp_with_hydra_runjob(cleandir, subdir):
     if subdir is not None:
         cmd += [f"hydra.output_subdir={subdir}"]
     run_process(cmd)
-
-    # Make sure config.yaml was created for additional
-    # processes.
-    logs = list(Path.cwd().glob("**/config.yaml"))
-    assert len(logs) == devices
-
-    # Make sure the parameter was set and used
-    cfg = OmegaConf.load(logs[0])
-    assert cfg.devices == devices
-
-    # Make sure PL spawned a job that is logged by Hydra
-    logs = list(Path.cwd().glob("**/*.log"))
-    assert len(logs) == 1
-
-
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
-@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
-@pytest.mark.parametrize("num_jobs", [1, 2])
-def test_ddp_with_hydra_multirunjob(cleandir, num_jobs):
-    # Save script locally
-    with open("temp.py", "w") as fn:
-        fn.write(script)
-
-    # create fake multirun params based on `num_jobs`
-    fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs))
-
-    # Run CLI
-    run_process([sys.executable, "temp.py", "+devices=2", '+strategy="ddp"', fake_param, "--multirun"])
-
-    # Make sure config.yaml was created for each job
-    configs = sorted(Path.cwd().glob("**/.pl_ddp_hydra_*/config.yaml"))
-    assert len(configs) == num_jobs
-
-    # Make sure the parameter was set and used for each job
-    for i, config in enumerate(configs):
-        cfg = OmegaConf.load(config)
-        local_rank = int(config.parent.parent.parts[-1])
-        assert cfg.devices == 2
-        assert cfg.foo == local_rank
-
-    logs = list(Path.cwd().glob("**/*.log"))
-    assert len(logs) == num_jobs
-
-
-yaml_file = """
-hydra:
-  callbacks:
-    save_job_info:
-      _target_: hydra.experimental.callbacks.PickleJobInfoCallback
-"""
-
-
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
-@pytest.mark.skipif(not _HYDRA_WITH_RERUN, reason=str(_HYDRA_WITH_RERUN))
-@pytest.mark.parametrize("num_jobs", [1, 2])
-def test_ddp_with_hydra_multirunjob_rerun(cleandir, num_jobs):
-    # Save script locally
-    with open("temp.py", "w") as fn:
-        fn.write(script)
-
-    with open("config.yaml", "w") as fn:
-        fn.write(yaml_file)
-
-    # create fake multirun params based on `num_jobs`
-    fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs))
-
-    # Run CLI
-    run_process(
-        [
-            sys.executable,
-            "temp.py",
-            "-cp",
-            ".",
-            "-cn",
-            "config.yaml",
-            "+devices=2",
-            '+strategy="ddp"',
-            fake_param,
-            "--multirun",
-        ]
-    )
-
-    pickles = sorted(Path.cwd().glob("**/.hydra/config.pickle"))
-    assert len(pickles) == num_jobs
```
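The removed `cleandir` fixture is replaced by pytest's built-in `monkeypatch.chdir`, which restores the previous working directory automatically on teardown. A minimal standalone sketch of the pattern, with an illustrative test body:

```python
# Minimal sketch of the monkeypatch.chdir pattern used above; the written
# script and its contents are placeholders for illustration.
import subprocess
import sys


def test_runs_in_tmpdir(tmpdir, monkeypatch):
    monkeypatch.chdir(tmpdir)  # cwd reverts automatically after the test
    with open("temp.py", "w") as fn:
        fn.write("print('running in', __import__('os').getcwd())\n")
    subprocess.run([sys.executable, "temp.py"], check=True)
```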
