Merge branch 'master' into 19223_num_workers_warning

awaelchli · web-flow · commit 75f83c71e289 · 2024-01-08T12:43:21.000+01:00
diff --git a/requirements/docs.txt b/requirements/docs.txt
@@ -1,6 +1,7 @@
 sphinx >5.0, <6.0
 myst-parser >=0.18.1, <3.0.0
 nbsphinx >=0.8.5, <=0.9.2
+nbconvert <7.14  # temporary fix for https://github.com/jupyter/nbconvert/issues/2092
 pandoc >=1.0, <=2.3
 docutils >=0.16, <0.21
 sphinxcontrib-fulltoc >=1.0, <=1.2.0
diff --git a/src/lightning/data/streaming/reader.py b/src/lightning/data/streaming/reader.py
@@ -68,6 +68,7 @@ def __init__(
 
         # FIXME: This should be divided by the number of nodes to provide more granular support when scaling out
         self._delete_chunks_when_processed = self._config.num_bytes > max_cache_size if max_cache_size else False
+        self._has_exited = False
 
     def download(self, chunk_indexes: List[int]) -> None:
         """Receive the list of the chunk indices to download for the current epoch."""
@@ -111,7 +112,7 @@ def _maybe_delete_chunks(self) -> None:
 
     def _can_delete_chunk(self) -> bool:
         if self._delete_chunks_when_processed:
-            return self._pre_download_counter == self._max_pre_download - 1
+            return self._pre_download_counter >= self._max_pre_download - 1
         return self._max_cache_size is not None and _get_folder_size(self._parent_cache_dir) >= self._max_cache_size
 
     def _pre_load_chunk(self, chunk_index: int) -> None:
@@ -120,9 +121,10 @@ def _pre_load_chunk(self, chunk_index: int) -> None:
 
     def run(self) -> None:
         while True:
-            if self._pre_download_counter <= self._max_pre_download:
+            if self._pre_download_counter < self._max_pre_download:
                 chunk_index = _get_from_queue(self._to_download_queue)
                 if chunk_index == _END_TOKEN:
+                    self._has_exited = True
                     return
 
                 if chunk_index is not None:
diff --git a/src/lightning/fabric/utilities/rank_zero.py b/src/lightning/fabric/utilities/rank_zero.py
@@ -30,17 +30,12 @@
 )
 from typing_extensions import ParamSpec
 
-import lightning.fabric
 from lightning.fabric.utilities.imports import _UTILITIES_GREATER_EQUAL_0_10
 
 rank_zero_module.log = logging.getLogger(__name__)
 
 
-def _get_rank(
-    strategy: Optional["lightning.fabric.strategies.Strategy"] = None,
-) -> Optional[int]:
-    if strategy is not None:
-        return strategy.global_rank
+def _get_rank() -> Optional[int]:
     # SLURM_PROCID can be set even if SLURM is not managing the multiprocessing,
     # therefore LOCAL_RANK needs to be checked first
     rank_keys = ("RANK", "LOCAL_RANK", "SLURM_PROCID", "JSM_NAMESPACE_RANK")
diff --git a/src/lightning/pytorch/callbacks/early_stopping.py b/src/lightning/pytorch/callbacks/early_stopping.py
@@ -26,7 +26,6 @@
 from typing_extensions import override
 
 import lightning.pytorch as pl
-from lightning.fabric.utilities.rank_zero import _get_rank
 from lightning.pytorch.callbacks.callback import Callback
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
 from lightning.pytorch.utilities.rank_zero import rank_prefixed_message, rank_zero_warn
@@ -265,12 +264,8 @@ def _improvement_message(self, current: Tensor) -> str:
         return msg
 
     @staticmethod
-    def _log_info(trainer: Optional["pl.Trainer"], message: str, log_rank_zero_only: bool) -> None:
-        rank = _get_rank(
-            strategy=(trainer.strategy if trainer is not None else None),  # type: ignore[arg-type]
-        )
-        if trainer is not None and trainer.world_size <= 1:
-            rank = None
+    def _log_info(trainer: "pl.Trainer", message: str, log_rank_zero_only: bool) -> None:
+        rank = trainer.global_rank if trainer.world_size > 1 else None
         message = rank_prefixed_message(message, rank)
         if rank is None or not log_rank_zero_only or rank == 0:
             log.info(message)
diff --git a/tests/tests_data/streaming/test_reader.py b/tests/tests_data/streaming/test_reader.py
@@ -1,11 +1,13 @@
 import os
 import shutil
+from time import sleep
 
 import numpy as np
+from lightning.data.streaming import reader
 from lightning.data.streaming.cache import Cache
 from lightning.data.streaming.config import ChunkedIndex
 from lightning.data.streaming.item_loader import PyTreeLoader
-from lightning.data.streaming.reader import PrepareChunksThread, _get_folder_size
+from lightning.data.streaming.reader import _END_TOKEN, PrepareChunksThread, _get_folder_size
 from lightning_cloud.resolver import Dir
 
 
@@ -36,40 +38,11 @@ def test_reader_chunk_removal(tmpdir):
     shutil.rmtree(cache_dir)
     os.makedirs(cache_dir, exist_ok=True)
 
-    generated = []
     for i in range(25):
-        generated.append([i, len(os.listdir(cache_dir))])
+        assert len(os.listdir(cache_dir)) <= 3
         index = ChunkedIndex(i, cache._get_chunk_index_from_index(i), is_last_index=i == 24)
         assert cache[index] == i
 
-    assert generated == [
-        [0, 0],
-        [1, 2],
-        [2, 2],
-        [3, 3],
-        [4, 3],
-        [5, 3],
-        [6, 3],
-        [7, 3],
-        [8, 3],
-        [9, 3],
-        [10, 3],
-        [11, 3],
-        [12, 3],
-        [13, 3],
-        [14, 3],
-        [15, 3],
-        [16, 3],
-        [17, 3],
-        [18, 3],
-        [19, 3],
-        [20, 3],
-        [21, 3],
-        [22, 3],
-        [23, 3],
-        [24, 3],
-    ]
-
     assert len(os.listdir(cache_dir)) == 3
 
 
@@ -82,7 +55,9 @@ def test_get_folder_size(tmpdir):
     assert _get_folder_size(tmpdir) == 928 * 2
 
 
-def test_prepare_chunks_thread(tmpdir):
+def test_prepare_chunks_thread_eviction(tmpdir, monkeypatch):
+    monkeypatch.setattr(reader, "_LONG_DEFAULT_TIMEOUT", 0.1)
+
     cache_dir = os.path.join(tmpdir, "cache_dir")
     os.makedirs(cache_dir, exist_ok=True)
     cache = Cache(input_dir=cache_dir, chunk_size=2, max_cache_size=28020)
@@ -95,8 +70,32 @@ def test_prepare_chunks_thread(tmpdir):
 
     cache._reader._try_load_config()
 
-    thread = PrepareChunksThread(cache._reader.config, item_loader=PyTreeLoader(), max_cache_size=1)
-    assert thread._delete_chunks_when_processed
+    assert len(os.listdir(cache_dir)) == 14
 
     thread = PrepareChunksThread(cache._reader.config, item_loader=PyTreeLoader(), max_cache_size=10000)
     assert not thread._delete_chunks_when_processed
+
+    thread = PrepareChunksThread(cache._reader.config, item_loader=PyTreeLoader(), max_cache_size=1)
+    assert thread._delete_chunks_when_processed
+
+    thread.start()
+
+    assert thread._pre_download_counter == 0
+
+    thread.download([0, 1, 2, 3, 4, 5, _END_TOKEN])
+
+    while thread._pre_download_counter == 0:
+        sleep(0.01)
+
+    assert not thread._has_exited
+
+    for i in range(5):
+        thread.delete([i])
+        while len(os.listdir(cache_dir)) != 14 - (i + 1):
+            sleep(0.01)
+
+    assert thread._pre_download_counter <= 2
+
+    assert len(os.listdir(cache_dir)) == 9
+    assert thread._has_exited
+    thread.join()
diff --git a/tests/tests_pytorch/callbacks/test_early_stopping.py b/tests/tests_pytorch/callbacks/test_early_stopping.py
@@ -480,7 +480,6 @@ def test_early_stopping_squeezes():
     es_mock.assert_called_once_with(torch.tensor(0))
 
 
-@pytest.mark.parametrize("trainer", [Trainer(), None])
 @pytest.mark.parametrize(
     ("log_rank_zero_only", "world_size", "global_rank", "expected_log"),
     [
@@ -492,15 +491,11 @@ def test_early_stopping_squeezes():
         (True, 2, 1, None),
     ],
 )
-def test_early_stopping_log_info(trainer, log_rank_zero_only, world_size, global_rank, expected_log):
+def test_early_stopping_log_info(log_rank_zero_only, world_size, global_rank, expected_log):
     """Checks if log.info() gets called with expected message when used within EarlyStopping."""
     # use a mocked trainer exposing only the global_rank and world_size
     # attributes that EarlyStopping._log_info reads
-    if trainer:
-        trainer.strategy.global_rank = global_rank
-        trainer.strategy.world_size = world_size
-    else:
-        expected_log = "bar"
+    trainer = Mock(global_rank=global_rank, world_size=world_size)
 
     with mock.patch("lightning.pytorch.callbacks.early_stopping.log.info") as log_mock:
         EarlyStopping._log_info(trainer, "bar", log_rank_zero_only)