Fix TPU test CI (#14926)

carmocca · awaelchli · pre-commit-ci[bot] · web-flow · commit 3028fd287dd1 · 2022-10-03T09:13:33.000-04:00
* Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107e. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813a. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -14,6 +14,9 @@ parameters:
   GHA_Event:
     type: string
     default: ""
+  GHA_Meta:
+    type: string
+    default: ""
 
 references:
 
@@ -49,9 +52,10 @@ references:
   update_jsonnet: &update_jsonnet
     run:
       name: Update jsonnet
+      environment:
+        PR_NUMBER: << pipeline.parameters.GHA_Meta >>
       command: |
        export SHA=$(git rev-parse --short HEAD)
-       export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f $SHA | awk -F'/' '{print $3}')
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
        data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
        cat dockers/tpu-tests/tpu_test_cases.jsonnet
diff --git a/.github/workflows/ci-circleci.yml b/.github/workflows/ci-circleci.yml
@@ -30,3 +30,5 @@ jobs:
       - uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5
         env:
           CCI_TOKEN: ${{ secrets.CCI_TOKEN }}
+        with:
+          GHA_Meta: ${{ github.event.pull_request.number }}
diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet
@@ -20,29 +20,40 @@ local tputests = base.BaseTest {
 
   command: utils.scriptCommand(
     |||
+      set +x  # turn off tracing, spammy
+      set -e  # exit on error
+
       source ~/.bashrc
-      set -e
       conda activate lightning
-      mkdir -p /home/runner/work/lightning && cd /home/runner/work/lightning
-      git clone https://github.com/Lightning-AI/lightning.git
-      cd lightning
-      echo $PWD
-      git ls-remote --refs origin
-      git fetch origin "refs/pull/{PR_NUMBER}/head"
-      git checkout {SHA}
-      export PACKAGE_NAME=pytorch
-      export FREEZE_REQUIREMENTS=1
-      pip install -e .[test]
+
+      echo "--- Fetch the SHA's changes ---"
+      git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git /home/runner/work/lightning
+      cd home/runner/work/lightning
+      git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
+      git -c advice.detachedHead=false checkout {SHA}
+
+      echo "--- Install PL ---"
+      PACKAGE_NAME=pytorch FREEZE_REQUIREMENTS=1 pip install -e .[test]
+      pip list
+
       echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
       export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
-      export PL_RUN_TPU_TESTS=1
-      cd tests/tests_pytorch
-      coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
-      echo "\n||| Running standalone tests |||\n"
-      export PL_STANDALONE_TESTS_SOURCE=pytorch_lightning
-      export PL_STANDALONE_TESTS_BATCH_SIZE=1
-      bash run_standalone_tests.sh
-      echo "\n||| END PYTEST LOGS |||\n"
+
+      echo "--- Running Lite tests ---"
+      cd tests/tests_lite
+      PL_RUN_TPU_TESTS=1 coverage run --source=lightning_lite -m pytest -vv --durations=0 ./
+
+      echo "--- Running standalone Lite tests ---"
+      PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
+
+      echo "--- Running PL tests ---"
+      cd ../tests_pytorch
+      PL_RUN_TPU_TESTS=1 coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
+
+      echo "--- Running standalone PL tests ---"
+      PL_STANDALONE_TESTS_SOURCE=pytorch_lightning PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
+
+      echo "--- Generating coverage ---"
       coverage xml
       cat coverage.xml | tr -d '\t'
     |||
diff --git a/src/lightning_lite/strategies/launchers/xla.py b/src/lightning_lite/strategies/launchers/xla.py
@@ -12,11 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import time
-from functools import wraps
 from multiprocessing.queues import SimpleQueue
-from typing import Any, Callable, Optional, Tuple, TYPE_CHECKING
+from typing import Any, Callable, Optional, TYPE_CHECKING
 
-from torch.multiprocessing import get_context, ProcessContext
+from torch.multiprocessing import get_context
 
 from lightning_lite.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher
 from lightning_lite.utilities import _TPU_AVAILABLE
@@ -67,7 +66,7 @@ def launch(self, function: Callable, *args: Any, **kwargs: Any) -> Any:
         """
         context = get_context(self._start_method)
         return_queue = context.SimpleQueue()
-        _save_spawn(
+        xmp.spawn(
             self._wrapping_function,
             args=(function, args, kwargs, return_queue),
             nprocs=self._strategy.num_processes,
@@ -90,30 +89,16 @@ def _wrapping_function(
         if process_idx == 0:
             return_queue.put(move_data_to_device(results, "cpu"))
 
+        _rank_teardown(process_idx)
 
-def _save_spawn(
-    fn: Callable,
-    args: Tuple = (),
-    nprocs: Optional[int] = None,
-    join: bool = True,
-    daemon: bool = False,
-    start_method: str = "spawn",
-) -> Optional[ProcessContext]:
-    """Wraps the :func:`torch_xla.distributed.xla_multiprocessing.spawn` with added teardown logic for the worker
-    processes."""
-
-    @wraps(fn)
-    def wrapped(rank: int, *_args: Any) -> None:
-        fn(rank, *_args)
-
-        import torch_xla.core.xla_model as xm
-
-        # Make all processes wait for each other before joining
-        # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
-        xm.rendezvous("end-process")
-        # Ensure that the rank 0 process is the one exiting last
-        # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
-        if rank == 0:
-            time.sleep(1)
-
-    return xmp.spawn(wrapped, args=args, nprocs=nprocs, join=join, daemon=daemon, start_method=start_method)
+
+def _rank_teardown(rank: int) -> None:
+    import torch_xla.core.xla_model as xm
+
+    # Make all processes wait for each other before joining
+    # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
+    xm.rendezvous("end-process")
+    # Ensure that the rank 0 process is the one exiting last
+    # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
+    if rank == 0:
+        time.sleep(1)
diff --git a/src/pytorch_lightning/strategies/launchers/xla.py b/src/pytorch_lightning/strategies/launchers/xla.py
@@ -18,7 +18,7 @@
 import torch.multiprocessing as mp
 
 import pytorch_lightning as pl
-from lightning_lite.strategies.launchers.xla import _save_spawn
+from lightning_lite.strategies.launchers.xla import _rank_teardown
 from lightning_lite.utilities import move_data_to_device
 from pytorch_lightning.strategies.launchers.multiprocessing import (
     _FakeQueue,
@@ -74,7 +74,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"]
         """
         context = mp.get_context(self._start_method)
         return_queue = context.SimpleQueue()
-        _save_spawn(
+        xmp.spawn(
             self._wrapping_function,
             args=(trainer, function, args, kwargs, return_queue),
             nprocs=self._strategy.num_processes,
@@ -106,6 +106,8 @@ def _wrapping_function(
         if process_idx == 0:
             return_queue.put(move_data_to_device(results, "cpu"))
 
+        _rank_teardown(process_idx)
+
     def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Optional["_WorkerOutput"]:
         rank_zero_debug("Collecting results from rank 0 process.")
         checkpoint_callback = trainer.checkpoint_callback
diff --git a/tests/tests_lite/strategies/launchers/test_xla.py b/tests/tests_lite/strategies/launchers/test_xla.py
@@ -1,5 +1,5 @@
 from unittest import mock
-from unittest.mock import ANY, Mock
+from unittest.mock import Mock
 
 from tests_lite.helpers.runif import RunIf
 
@@ -29,11 +29,9 @@ def test_xla_launcher_xmp_spawn(get_context_mock, xmp_mock):
     queue = get_context_mock.return_value.SimpleQueue.return_value
     get_context_mock.assert_called_with("fork")
     xmp_mock.spawn.assert_called_with(
-        ANY,
+        launcher._wrapping_function,
         args=(function, ("positional-arg",), {"keyword_arg": 0}, queue),
         nprocs=strategy.num_processes,
-        join=True,
-        daemon=False,
         start_method="fork",
     )
     queue.get.assert_called_once_with()
diff --git a/tests/tests_pytorch/accelerators/test_tpu.py b/tests/tests_pytorch/accelerators/test_tpu.py
@@ -20,6 +20,7 @@
 import pytest
 import torch
 from torch import nn
+from torch.multiprocessing import ProcessExitedException
 from torch.utils.data import DataLoader
 
 from pytorch_lightning import Trainer
@@ -69,6 +70,7 @@ def test_resume_training_on_cpu(tmpdir):
 
 @RunIf(tpu=True)
 @mock.patch.dict(os.environ, {}, clear=True)
+@pytest.mark.xfail(raises=ProcessExitedException, reason="https://github.com/pytorch/xla/issues/1666")
 def test_if_test_works_after_train(tmpdir):
     """Ensure that .test() works after .fit()"""
     model = BoringModel()

-Original file line number
+Diff line change
       - uses: CircleCI-Public/[email protected]
         env:
           CCI_TOKEN: ${{ secrets.CCI_TOKEN }}
 +        with:
 +          GHA_Meta: ${{ github.event.pull_request.number }}