
Commit 7d8d21b

awaelchli authored and Borda committed
Revert new Hydra launch behavior (#15737)
* revert new hydra cwd behavior
* remove debug statements
* changelog

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
(cherry picked from commit 88b2e5a)
1 parent: 23ec3c4 · commit: 7d8d21b
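In effect, the revert drops the pickled-config / `--experimental-rerun` relaunch path and returns to re-running each child rank with the original CLI arguments from the pre-Hydra working directory. Below is a minimal sketch of the restored flow, condensed from the diffs that follow; the standalone function framing is illustrative, not part of the codebase:

```python
# Condensed sketch of the restored Hydra child-launch flow (illustrative;
# the real logic lives in _call_children_scripts / _hydra_subprocess_cmd below).
import os
import subprocess
import sys


def launch_hydra_child(local_rank: int, env_copy: dict) -> None:
    from hydra.utils import get_original_cwd

    # replay the parent's script and CLI arguments
    command = [sys.executable, sys.argv[0], *sys.argv[1:]]
    # pin the child's Hydra run dir to the parent's current (output) directory
    # and give each rank a distinguishable job name
    command += [f'hydra.run.dir="{os.getcwd()}"', f"hydra.job.name=train_ddp_process_{local_rank}"]
    # spawn from the directory the user originally launched from
    subprocess.Popen(command, env=env_copy, cwd=get_original_cwd())
```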

5 files changed (+22 / -142 lines)

src/lightning_lite/strategies/launchers/subprocess_script.py

Lines changed: 11 additions & 28 deletions
```diff
@@ -14,8 +14,7 @@
 import os
 import subprocess
 import sys
-from time import sleep
-from typing import Any, Callable, Sequence
+from typing import Any, Callable, Optional, Sequence, Tuple
 
 import numpy as np
 from lightning_utilities.core.imports import RequirementCache
@@ -116,15 +115,16 @@ def _call_children_scripts(self) -> None:
             # start process
             # if hydra is available and initialized, make sure to set the cwd correctly
             hydra_in_use = False
+            cwd: Optional[str] = None
             if _HYDRA_AVAILABLE:
                 from hydra.core.hydra_config import HydraConfig
 
                 hydra_in_use = HydraConfig.initialized()
             if hydra_in_use:
-                command = _hydra_subprocess_cmd(local_rank=local_rank)
+                command, cwd = _hydra_subprocess_cmd(local_rank=local_rank)
             else:
                 command = _basic_subprocess_cmd()
-            subprocess.Popen(command, env=env_copy)
+            subprocess.Popen(command, env=env_copy, cwd=cwd)
 
             # starting all processes at once can cause issues
             # with dataloaders delay between 1-10 seconds
@@ -149,36 +149,19 @@ def _basic_subprocess_cmd() -> Sequence[str]:
     return [sys.executable, "-m", __main__.__spec__.name] + sys.argv[1:]
 
 
-def _hydra_subprocess_cmd(local_rank: int) -> Sequence[str]:
+def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
     import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
-    from hydra.core.hydra_config import HydraConfig
-    from hydra.utils import to_absolute_path
+    from hydra.utils import get_original_cwd, to_absolute_path
 
     # when user is using hydra find the absolute path
     if __main__.__spec__ is None:  # pragma: no-cover
         command = [sys.executable, to_absolute_path(sys.argv[0])]
     else:
         command = [sys.executable, "-m", __main__.__spec__.name]
 
-    # extract the hydra configuration
-    hydra_cfg = HydraConfig.get()
+    command += sys.argv[1:]
 
-    # the location of the hydra configuration files saved for the current job
-    hydra_output = hydra_cfg.runtime.output_dir
-    if hydra_cfg.output_subdir is not None:
-        hydra_output = os.path.join(hydra_output, hydra_cfg.output_subdir)
-
-    # check if experimental re-run capability exists
-    # otherwise use existing config.yaml which may have issues
-    pickled_config = os.path.join(hydra_output, "config.pickle")
-    if os.path.exists(pickled_config):
-        command += ["--experimental-rerun", pickled_config]
-
-    else:
-        command += ["-cp", hydra_output, "-cn", "config.yaml"]
-        command += [
-            f"hydra.output_subdir=.pl_ddp_hydra_{local_rank}",
-            f"hydra.run.dir={hydra_cfg.runtime.output_dir}",
-        ]
-
-    return command
+    cwd = get_original_cwd()
+    os_cwd = f'"{os.getcwd()}"'
+    command += [f"hydra.run.dir={os_cwd}", f"hydra.job.name=train_ddp_process_{local_rank}"]
+    return command, cwd
```
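For illustration, with a hypothetical Hydra app started as `python train.py +devices=2` from `/home/user/project`, the restored helper would return roughly the following; all concrete paths, including the timestamped run dir, are made up for the example:

```python
# Hypothetical return value of _hydra_subprocess_cmd(local_rank=1); every
# concrete path below is an assumption for illustration only.
command = [
    "/usr/bin/python",                 # sys.executable
    "/home/user/project/train.py",     # to_absolute_path(sys.argv[0])
    "+devices=2",                      # original CLI args, replayed verbatim
    'hydra.run.dir="/home/user/project/outputs/2022-11-21/12-00-00"',
    "hydra.job.name=train_ddp_process_1",
]
cwd = "/home/user/project"             # get_original_cwd()
```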

src/pytorch_lightning/CHANGELOG.md

Lines changed: 1 addition & 5 deletions
```diff
@@ -13,7 +13,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 
--
+- Temporarily removed support for Hydra multi-run ([#15737](https://github.com/Lightning-AI/lightning/pull/15737))
 
 
 ### Fixed
@@ -79,7 +79,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added a sanity check that scripts are executed with the `srun` command in SLURM and that environment variables are not conflicting ([#15011](https://github.com/Lightning-AI/lightning/pull/15011))
 - Added an error message when attempting to launch processes with `python -i` and an interactive-incompatible strategy ([#15293](https://github.com/Lightning-AI/lightning/pull/15293))
 
-
 ### Changed
 
 - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892))
@@ -107,7 +106,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - To avoid issues with forking processes, from PyTorch 1.13 and higher, Lightning will directly use the PyTorch NVML-based check for `torch.cuda.device_count` and from PyTorch 1.14 and higher, Lightning will configure PyTorch to use an NVML-based check for `torch.cuda.is_available`. ([#15110](https://github.com/Lightning-AI/lightning/pull/15110), [#15133](https://github.com/Lightning-AI/lightning/pull/15133))
 - The `NeptuneLogger` now uses `neptune.init_run` instead of the deprecated `neptune.init` to initialize a run ([#15393](https://github.com/Lightning-AI/lightning/pull/15393))
 
-
 ### Deprecated
 
 - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
@@ -137,7 +135,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Deprecated `TrainerFn.TUNING`, `RunningStage.TUNING` and `trainer.tuning` property ([#15100](https://github.com/Lightning-AI/lightning/pull/15100))
 - Deprecated custom `pl.utilities.distributed.AllGatherGrad` implementation in favor of PyTorch's ([#15364](https://github.com/Lightning-AI/lightning/pull/15364))
 
-
 ### Removed
 
 - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011))
@@ -195,7 +192,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed the deprecated `LightningDataModule.on_save/load_checkpoint` hooks ([#14909](https://github.com/Lightning-AI/lightning/pull/14909))
 - Removed support for returning a value in `Callback.on_save_checkpoint` in favor of implementing `Callback.state_dict` ([#14835](https://github.com/Lightning-AI/lightning/pull/14835))
 
-
 ### Fixed
 
 - Fixed an issue with `LightningLite.setup()` not setting the `.device` attribute correctly on the returned wrapper ([#14822](https://github.com/Lightning-AI/lightning/pull/14822))
```

src/pytorch_lightning/strategies/launchers/subprocess_script.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -111,17 +111,18 @@ def _call_children_scripts(self) -> None:
             del env_copy["PL_GLOBAL_SEED"]
 
             hydra_in_use = False
+            cwd: Optional[str] = None
             if _HYDRA_AVAILABLE:
                 from hydra.core.hydra_config import HydraConfig
 
                 hydra_in_use = HydraConfig.initialized()
 
             if hydra_in_use:
-                command = _hydra_subprocess_cmd(local_rank)
+                command, cwd = _hydra_subprocess_cmd(local_rank)
             else:
                 command = _basic_subprocess_cmd()
 
-            subprocess.Popen(command, env=env_copy)
+            subprocess.Popen(command, env=env_copy, cwd=cwd)
 
             # starting all processes at once can cause issues
             # with dataloaders delay between 1-10 seconds
```
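Passing the working directory through `Popen` is what keeps the non-Hydra path unchanged: `cwd=None` simply means "inherit the parent's current directory". A quick standalone check, not taken from the diff:

```python
# cwd=None inherits the parent's working directory; a string overrides it.
import subprocess
import sys

probe = [sys.executable, "-c", "import os; print(os.getcwd())"]
subprocess.Popen(probe, cwd=None).wait()    # prints the parent's cwd
subprocess.Popen(probe, cwd="/tmp").wait()  # prints /tmp (POSIX example path)
```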

tests/tests_lite/strategies/launchers/test_subprocess_script.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -84,7 +84,7 @@ def test_subprocess_script_launcher_launch_processes(popen_mock, _):
 @mock.patch("lightning_lite.strategies.launchers.subprocess_script.subprocess.Popen")
 def test_subprocess_script_launcher_hydra_in_use(popen_mock, _, monkeypatch):
     basic_command = Mock(return_value="basic_command")
-    hydra_command = Mock(return_value="hydra_command")
+    hydra_command = Mock(return_value=("hydra_command", "hydra_cwd"))
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_basic_subprocess_cmd", basic_command)
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_hydra_subprocess_cmd", hydra_command)
 
@@ -101,7 +101,7 @@ def simulate_launch():
     # when hydra not available
     monkeypatch.setattr(lightning_lite.strategies.launchers.subprocess_script, "_HYDRA_AVAILABLE", False)
     simulate_launch()
-    popen_mock.assert_called_with("basic_command", env=ANY)
+    popen_mock.assert_called_with("basic_command", env=ANY, cwd=None)
     popen_mock.reset_mock()
 
     import hydra
@@ -112,7 +112,7 @@ def simulate_launch():
     HydraConfigMock.initialized.return_value = False
     monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock)
     simulate_launch()
-    popen_mock.assert_called_with("basic_command", env=ANY)
+    popen_mock.assert_called_with("basic_command", env=ANY, cwd=None)
     popen_mock.reset_mock()
 
     # when hydra available and initialized
@@ -121,5 +121,5 @@ def simulate_launch():
     HydraConfigMock.initialized.return_value = True
     monkeypatch.setattr(hydra.core.hydra_config, "HydraConfig", HydraConfigMock)
     simulate_launch()
-    popen_mock.assert_called_with("hydra_command", env=ANY)
+    popen_mock.assert_called_with("hydra_command", env=ANY, cwd="hydra_cwd")
     popen_mock.reset_mock()
```
tests/tests_pytorch/strategies/launchers/test_subprocess_script.py

Lines changed: 3 additions & 103 deletions
```diff
@@ -1,35 +1,17 @@
-import logging
-import os
 import sys
-from pathlib import Path
 
 import pytest
 from lightning_utilities.core.imports import RequirementCache
 
-from pytorch_lightning.strategies.launchers.subprocess_script import _HYDRA_AVAILABLE
 from tests_pytorch.helpers.runif import RunIf
 
 _HYDRA_WITH_RERUN = RequirementCache("hydra-core>=1.2")
 _HYDRA_WITH_RUN_PROCESS = RequirementCache("hydra-core>=1.0.7")
 
-if _HYDRA_AVAILABLE:
-    from omegaconf import OmegaConf
 if _HYDRA_WITH_RUN_PROCESS:
     from hydra.test_utils.test_utils import run_process
 
 
-# fixture to run hydra jobs in a clean temporary directory
-# Hydra creates its own output directories and logs
-@pytest.fixture
-def cleandir(tmp_path):
-    """Run function in a temporary directory."""
-    old_dir = os.getcwd()  # get current working directory (cwd)
-    os.chdir(tmp_path)  # change cwd to the temp-directory
-    yield tmp_path  # yields control to the test to be run
-    os.chdir(old_dir)
-    logging.shutdown()
-
-
 # Script to run from command line
 script = """
 import hydra
@@ -64,7 +46,9 @@ def task_fn(cfg):
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
 @pytest.mark.parametrize("subdir", [None, "dksa", ".hello"])
-def test_ddp_with_hydra_runjob(cleandir, subdir):
+def test_ddp_with_hydra_runjob(subdir, tmpdir, monkeypatch):
+    monkeypatch.chdir(tmpdir)
+
     # Save script locally
     with open("temp.py", "w") as fn:
         fn.write(script)
@@ -75,87 +59,3 @@ def test_ddp_with_hydra_runjob(cleandir, subdir):
     if subdir is not None:
         cmd += [f"hydra.output_subdir={subdir}"]
     run_process(cmd)
-
-    # Make sure config.yaml was created for additional
-    # processes.
-    logs = list(Path.cwd().glob("**/config.yaml"))
-    assert len(logs) == devices
-
-    # Make sure the parameter was set and used
-    cfg = OmegaConf.load(logs[0])
-    assert cfg.devices == devices
-
-    # Make sure PL spawned a job that is logged by Hydra
-    logs = list(Path.cwd().glob("**/*.log"))
-    assert len(logs) == 1
-
-
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
-@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
-@pytest.mark.parametrize("num_jobs", [1, 2])
-def test_ddp_with_hydra_multirunjob(cleandir, num_jobs):
-    # Save script locally
-    with open("temp.py", "w") as fn:
-        fn.write(script)
-
-    # create fake multirun params based on `num_jobs`
-    fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs))
-
-    # Run CLI
-    run_process([sys.executable, "temp.py", "+devices=2", '+strategy="ddp"', fake_param, "--multirun"])
-
-    # Make sure config.yaml was created for each job
-    configs = sorted(Path.cwd().glob("**/.pl_ddp_hydra_*/config.yaml"))
-    assert len(configs) == num_jobs
-
-    # Make sure the parameter was set and used for each job
-    for i, config in enumerate(configs):
-        cfg = OmegaConf.load(config)
-        local_rank = int(config.parent.parent.parts[-1])
-        assert cfg.devices == 2
-        assert cfg.foo == local_rank
-
-    logs = list(Path.cwd().glob("**/*.log"))
-    assert len(logs) == num_jobs
-
-
-yaml_file = """
-hydra:
-  callbacks:
-    save_job_info:
-      _target_: hydra.experimental.callbacks.PickleJobInfoCallback
-"""
-
-
-@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
-@pytest.mark.skipif(not _HYDRA_WITH_RERUN, reason=str(_HYDRA_WITH_RERUN))
-@pytest.mark.parametrize("num_jobs", [1, 2])
-def test_ddp_with_hydra_multirunjob_rerun(cleandir, num_jobs):
-    # Save script locally
-    with open("temp.py", "w") as fn:
-        fn.write(script)
-
-    with open("config.yaml", "w") as fn:
-        fn.write(yaml_file)
-
-    # create fake multirun params based on `num_jobs`
-    fake_param = "+foo=" + ",".join(str(i) for i in range(num_jobs))
-
-    # Run CLI
-    run_process(
-        [
-            sys.executable,
-            "temp.py",
-            "-cp",
-            ".",
-            "-cn",
-            "config.yaml",
-            "+devices=2",
-            '+strategy="ddp"',
-            fake_param,
-            "--multirun",
-        ]
-    )
-
-    pickles = sorted(Path.cwd().glob("**/.hydra/config.pickle"))
-    assert len(pickles) == num_jobs
```
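The removed `cleandir` fixture is replaced by pytest's built-in `monkeypatch.chdir`, which restores the previous working directory automatically on teardown. A minimal standalone sketch of the pattern, with an illustrative test body:

```python
# Minimal sketch of the monkeypatch.chdir pattern used above; the written
# script and its contents are placeholders for illustration.
import subprocess
import sys


def test_runs_in_tmpdir(tmpdir, monkeypatch):
    monkeypatch.chdir(tmpdir)  # cwd reverts automatically after the test
    with open("temp.py", "w") as fn:
        fn.write("print('running in', __import__('os').getcwd())\n")
    subprocess.run([sys.executable, "temp.py"], check=True)
```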
