
Commit 38c3c63

thomas authored and committed
update
1 parent e524356 commit 38c3c63

8 files changed, +101 −56 lines changed


src/lightning/data/streaming/__init__.py

Lines changed: 0 additions & 3 deletions

@@ -11,8 +11,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from lightning_cloud.resolver import Dir as InputDir
-
 from lightning.data.streaming.cache import Cache
 from lightning.data.streaming.data_processor import DataChunkRecipe, DataProcessor, DataTransformRecipe
 from lightning.data.streaming.dataset import StreamingDataset
@@ -25,5 +23,4 @@
     "DataTransformRecipe",
     "DataChunkRecipe",
     "TokensLoader",
-    "InputDir",
 ]

src/lightning/data/streaming/cache.py

Lines changed: 5 additions & 5 deletions

@@ -107,11 +107,11 @@ def filled(self) -> bool:
         return self._is_done
 
     @property
-    def resume_folder(self) -> str:
-        resume_folder = os.path.join(self._cache_dir, "checkpoints", str(self._reader.rank))
-        if not os.path.exists(resume_folder):
-            os.makedirs(resume_folder, exist_ok=True)
-        return resume_folder
+    def checkpoint_dir(self) -> str:
+        checkpoint_dir = os.path.join(self._cache_dir, "checkpoints", str(self.rank))
+        if not os.path.exists(checkpoint_dir):
+            os.makedirs(checkpoint_dir, exist_ok=True)
+        return checkpoint_dir
 
     def __setitem__(self, index: int, data: Any) -> None:
         """Store an item in the writer."""

src/lightning/data/streaming/constants.py

Lines changed: 2 additions & 0 deletions

@@ -51,3 +51,5 @@
     18: torch.long,
     19: torch.bool,
 }
+
+_TIME_FORMAT = "%Y-%m-%d_%H-%M-%S.%fZ"
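
_TIME_FORMAT centralizes the timestamp layout shared by the writer (naming checkpoint files) and the reader (parsing them back for sorting). A quick round-trip sketch using only the format string from this diff:

from datetime import datetime

_TIME_FORMAT = "%Y-%m-%d_%H-%M-%S.%fZ"

# Writing side: the checkpoint file name embeds the current time.
now = datetime.now().strftime(_TIME_FORMAT)
filename = f"checkpoint-{now}.json"

# Reading side: the timestamp is recovered so files can be sorted chronologically.
recovered = datetime.strptime(filename.split("checkpoint-")[1].split(".json")[0], _TIME_FORMAT)
print(filename, recovered)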

src/lightning/data/streaming/dataset.py

Lines changed: 77 additions & 28 deletions

@@ -15,7 +15,6 @@
 import json
 import os
 import shutil
-import uuid
 from dataclasses import dataclass
 from datetime import datetime
 from time import time
@@ -25,7 +24,12 @@
 from torch.utils.data import IterableDataset
 
 from lightning.data.streaming import Cache
-from lightning.data.streaming.constants import _DEFAULT_CACHE_DIR, _INDEX_FILENAME, _LIGHTNING_CLOUD_LATEST
+from lightning.data.streaming.constants import (
+    _DEFAULT_CACHE_DIR,
+    _INDEX_FILENAME,
+    _LIGHTNING_CLOUD_LATEST,
+    _TIME_FORMAT,
+)
 from lightning.data.streaming.item_loader import BaseItemLoader
 from lightning.data.streaming.sampler import ChunkedIndex
 from lightning.data.streaming.serializers import Serializer
@@ -93,7 +97,6 @@ def __init__(
         self.random_state = None
         self.shuffler: Optional[Shuffle] = None
         self.serializers = serializers
-        self.resume_id = uuid.uuid4()
         self.checkpoint_interval = checkpoint_interval or 60 * 5
         self._state_dict: Optional[Dict] = None
 
@@ -154,13 +157,17 @@ def __iter__(self) -> "StreamingDataset":
 
         # Handle restart
         if self._state_dict:
+            self._validate_state_dict()
             state = self._state_dict[str(self.cache.rank)]
+
             self.chunk_index = state["chunk_index"]
             self.global_index = state["global_index"]
             self.index = state["index"]
+            self.current_epoch = state["current_epoch"]
+
             interval = self.worker_intervals[self.chunk_index]
             current_indexes = np.arange(interval[0], interval[1])
-            current_indexes = self.shuffler(current_indexes)
+            current_indexes = self.shuffler(current_indexes, self.current_epoch, self.chunk_index)
             self.current_indexes = current_indexes[state["index"] :]
         self.has_triggered_download = False
         self.last_time = time()
@@ -200,17 +207,15 @@ def __next__(self) -> Any:
             self.index = 0
 
             # Checkpoint when reaching a new chunk
-            self.checkpoint()
+            self.checkpoint(self.chunk_index)
 
             interval = self.worker_intervals[self.chunk_index]
             current_indexes = np.arange(interval[0], interval[1])
 
             assert self.shuffler is not None
-            self.current_indexes = self.shuffler(current_indexes)
+            self.current_indexes = self.shuffler(current_indexes, self.current_epoch, self.chunk_index)
             self.chunk_index += 1
 
-            last_index = self.chunk_index == len(self.worker_intervals) and len(self.current_indexes) == 1
-
         # Get the first index
         index = self.current_indexes.pop(0)
 
@@ -221,7 +226,7 @@ def __next__(self) -> Any:
                 chunk_index=self.worker_chunks[self.chunk_index - 1],
                 # We provide the chunks indexes only one the first
                 chunk_indexes=None if self.has_triggered_download else self.worker_chunks,
-                last_index=last_index,
+                last_index=(self.chunk_index - 1) == len(self.worker_intervals) and len(self.current_indexes) == 1,
             )
         )
 
@@ -231,37 +236,38 @@ def __next__(self) -> Any:
 
         # Checkpoint based on time
         if (self.last_time - time()) > self.checkpoint_interval:
-            self.checkpoint()
+            self.checkpoint(self.chunk_index - 1)
 
         return data
 
-    def checkpoint(self) -> None:
+    def checkpoint(self, chunk_index: int) -> None:
         import tempfile
 
         with tempfile.NamedTemporaryFile(mode="w+") as tmp:
+            # 1. Write the state to a tempfile
             json.dump(
                 {
                     "rank": self.cache._reader.rank,
                     "current_epoch": self.current_epoch,
                     "input_dir_path": self.input_dir.path,
                     "input_dir_url": self.input_dir.url,
-                    "item_loader": self.item_loader.state_dict(),
+                    "item_loader": self.item_loader.state_dict() if self.item_loader else None,
                     "drop_last": self.drop_last,
                     "seed": self.seed,
                     "checkpoint_interval": self.checkpoint_interval,
-                    "chunk_index": self.chunk_index,
+                    "chunk_index": chunk_index,
                     "global_index": self.global_index,
                     "index": self.index,
                 },
                 tmp,
            )
 
+            # 2. Flush to make sure it is written
             tmp.flush()
 
-            now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S.%fZ")
-            checkpoint_path = os.path.join(self.cache.resume_folder, f"checkpoint-{now}.json")
-
-            # Should avoid corrupted read from the main thread.
+            # 3. Move the file to avoid corrupted read from the main thread.
+            now = datetime.now().strftime(_TIME_FORMAT)
+            checkpoint_path = os.path.join(self.cache.checkpoint_dir, f"checkpoint-{now}.json")
             shutil.copyfile(tmp.name, checkpoint_path)
 
         self.last_time = time()
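
The checkpoint method writes the JSON state to a temporary file, flushes it, and only then copies it into the checkpoint directory, so a reader in the main thread never sees a half-written file. A minimal standalone sketch of that write-then-copy pattern (the function and file names below are illustrative):

import json
import os
import shutil
import tempfile


def write_checkpoint_safely(state: dict, checkpoint_dir: str, filename: str) -> str:
    """Dump state to a temp file, flush, then copy the finished file into place."""
    os.makedirs(checkpoint_dir, exist_ok=True)
    with tempfile.NamedTemporaryFile(mode="w+") as tmp:
        json.dump(state, tmp)
        tmp.flush()  # ensure the bytes are on disk before copying
        destination = os.path.join(checkpoint_dir, filename)
        shutil.copyfile(tmp.name, destination)
    return destination


path = write_checkpoint_safely({"chunk_index": 3, "index": 0}, tempfile.mkdtemp(), "checkpoint-demo.json")
print(path)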
@@ -274,18 +280,17 @@ def state_dict(self) -> Dict[_DictKey, Any]:
         state_dict = {}
         worker_env = _WorkerEnv.detect()
         if worker_env.world_size == 1:
-            checkpoint_dir = os.path.join(self.cache._cache_dir, "checkpoints")
-            if not os.path.exists(checkpoint_dir):
+            # 1. Check whether the checkpoint_dir exists
+            if not os.path.exists(self.cache.checkpoint_dir):
                 return state_dict
-            for worker_idx in os.listdir(checkpoint_dir):
-                checkpoints = os.listdir(os.path.join(checkpoint_dir, str(worker_idx)))
-                checkpoints = sorted(
-                    checkpoints,
-                    key=lambda item: datetime.strptime(
-                        item.split("checkpoint-")[1].split(".json")[0], "%Y-%m-%d_%H-%M-%S.%fZ"
-                    ),
-                )
-                checkpoint_path = os.path.join(checkpoint_dir, str(worker_idx), checkpoints[-1])
+
+            # 2. Iterate through the workers and read the latest checkpoint
+            for worker_idx in os.listdir(self.cache.checkpoint_dir):
+                checkpoints = os.listdir(os.path.join(self.cache.checkpoint_dir, str(worker_idx)))
+                checkpoints = sorted(checkpoints, key=_string_to_datetime)
+
+                # Load the latest checkpoint for this worker
+                checkpoint_path = os.path.join(self.cache.checkpoint_dir, str(worker_idx), checkpoints[-1])
                 with open(checkpoint_path) as f:
                     state_dict[worker_idx] = json.load(f)
         else:
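
state_dict now walks one sub-folder per worker and keeps only the newest checkpoint in each, using _string_to_datetime (added further down in this file) as the sort key. A small sketch of that selection step, assuming file names follow the checkpoint-<timestamp>.json convention from this diff:

from datetime import datetime

_TIME_FORMAT = "%Y-%m-%d_%H-%M-%S.%fZ"


def _string_to_datetime(item: str) -> datetime:
    return datetime.strptime(item.split("checkpoint-")[1].split(".json")[0], _TIME_FORMAT)


# Example listing for one worker folder; the latest file is picked after sorting.
checkpoints = [
    "checkpoint-2023-11-20_10-00-00.000001Z.json",
    "checkpoint-2023-11-20_10-05-00.000001Z.json",
    "checkpoint-2023-11-20_09-55-00.000001Z.json",
]
latest = sorted(checkpoints, key=_string_to_datetime)[-1]
print(latest)  # checkpoint-2023-11-20_10-05-00.000001Z.json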
@@ -296,6 +301,46 @@ def load_state_dict(self, state_dict: Dict[_DictKey, Any]) -> None:
         if state_dict:
             self._state_dict = state_dict
 
+    def _validate_state_dict(self) -> None:
+        env = Environment(dist_env=self.distributed_env, worker_env=self.worker_env)
+
+        if env.num_shards != len(self._state_dict):
+            raise ValueError(
+                "The provided `state` doesn't match the number workers world size. "
+                f"Found {env.num_shards} instead of {len(self._state_dict)}."
+            )
+
+        state = self._state_dict[str(self.cache.rank)]
+
+        if state["input_dir_path"] != self.input_dir.path:
+            raise ValueError(
+                "The provided `input_dir` path doesn't match the current one. "
+                f"Found {self.input_dir.path} instead of {state['input_dir_path']}."
+            )
+
+        if state["input_dir_url"] != self.input_dir.url:
+            raise ValueError(
+                "The provided `input_dir` URL doesn't match the current one. "
+                f"Found {self.input_dir.url} instead of {state['input_dir_url']}."
+            )
+
+        if state["seed"] != self.seed:
+            raise ValueError(
+                "The provided `seed` doesn't match the current one. " f"Found {self.seed} instead of {state['seed']}."
+            )
+
+        if self.item_loader and state["item_loader"] != self.item_loader.state_dict():
+            raise ValueError(
+                "The provided `item_loader` state doesn't match the current one. "
+                f"Found {self.item_loader.state_dict()} instead of {state['item_loader']}."
+            )
+
+        if state["drop_last"] != self.drop_last:
+            raise ValueError(
+                "The provided `drop_last` state doesn't match the current one. "
+                f"Found {self.drop_last} instead of {state['drop_last']}."
+            )
+
 
 def _try_create_cache_dir(input_dir: str, shard_rank: int = 0) -> Optional[str]:
     hash_object = hashlib.md5(input_dir.encode())
@@ -308,6 +353,10 @@ def _try_create_cache_dir(input_dir: str, shard_rank: int = 0) -> Optional[str]:
     return cache_dir
 
 
+def _string_to_datetime(item: str) -> datetime:
+    return datetime.strptime(item.split("checkpoint-")[1].split(".json")[0], _TIME_FORMAT)
+
+
 @dataclass
 class RemoteDir:
     """Holds a remote URL to a directory and a cache directory where the data will be downloaded."""

src/lightning/data/streaming/shuffle.py

Lines changed: 5 additions & 9 deletions

@@ -28,7 +28,6 @@ def __init__(self, cache: Cache, seed: int, drop_last: bool):
         self.cache = cache
         self.seed = seed
         self.drop_last = drop_last
-        self.random_state = None
 
     @lru_cache(maxsize=10)
     def get_len(self, distributed_env: _DistributedEnv, current_epoch: int) -> int:
@@ -48,7 +47,7 @@ def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, c
         pass
 
     @abstractmethod
-    def __call__(self, array: np.ndarray) -> List[int]:
+    def __call__(self, array: np.ndarray, current_epoch: int) -> List[int]:
         pass
 
 
@@ -68,7 +67,7 @@ def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, c
 
         return chunks_per_ranks, intervals_per_ranks
 
-    def __call__(self, array: np.ndarray) -> List[int]:
+    def __call__(self, array: np.ndarray, current_epoch: int, chunk_index: int) -> List[int]:
         return array.tolist()
 
 
@@ -92,14 +91,12 @@ class FullShuffle(Shuffle):
 
     @lru_cache(maxsize=10)
     def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, current_epoch: int) -> Any:
-        self.random_state = np.random.RandomState(seed=self.seed + current_epoch)  # type: ignore
-
         # 1. Get the intervals
         chunk_intervals = self.cache.get_chunk_intervals()
 
         # 2. Shuffle them
         indexes = range(len(chunk_intervals))
-        shuffled_indexes = self.random_state.permutation(indexes)
+        shuffled_indexes = np.random.RandomState(seed=self.seed + current_epoch).permutation(indexes)
         shuffled_chunk_intervals = np.asarray(chunk_intervals)[shuffled_indexes]
 
         # 3. Compute the items budget of each rank
@@ -147,6 +144,5 @@ def get_chunks_and_intervals_per_ranks(self, distributed_env: _DistributedEnv, c
 
         return chunks_per_ranks, intervals_per_ranks
 
-    def __call__(self, array: np.ndarray) -> List[int]:
-        assert self.random_state
-        return self.random_state.permutation(array).tolist()
+    def __call__(self, array: np.ndarray, current_epoch: int, chunk_index: int) -> List[int]:
+        return np.random.RandomState(seed=self.seed + current_epoch + chunk_index).permutation(array).tolist()
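
The shuffler no longer carries a random_state attribute between calls; every permutation is derived on the fly from seed + current_epoch (+ chunk_index), so a restarted worker reproduces exactly the permutation it was in the middle of. A small standalone illustration of that determinism:

import numpy as np


def shuffle_chunk(array: np.ndarray, seed: int, current_epoch: int, chunk_index: int) -> list:
    # Stateless: the generator is rebuilt from its inputs on every call.
    return np.random.RandomState(seed=seed + current_epoch + chunk_index).permutation(array).tolist()


array = np.arange(10)
first = shuffle_chunk(array, seed=42, current_epoch=1, chunk_index=3)
second = shuffle_chunk(array, seed=42, current_epoch=1, chunk_index=3)
assert first == second  # identical permutation across restarts
print(first)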

tests/tests_data/streaming/test_data_processor.py

Lines changed: 3 additions & 1 deletion

@@ -161,7 +161,7 @@ def fn(*_, **__):
 
 @pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
 @mock.patch("lightning.data.streaming.data_processor._wait_for_disk_usage_higher_than_threshold")
-def test_download_data_target(tmpdir):
+def test_download_data_target(wait_for_disk_usage_higher_than_threshold_mock, tmpdir):
     input_dir = os.path.join(tmpdir, "input_dir")
     os.makedirs(input_dir, exist_ok=True)
 
@@ -194,6 +194,8 @@ def fn(*_, **__):
 
     assert os.listdir(cache_dir) == ["a.txt"]
 
+    wait_for_disk_usage_higher_than_threshold_mock.assert_called()
+
 
 def test_wait_for_disk_usage_higher_than_threshold():
     disk_usage_mock = mock.Mock(side_effect=[mock.Mock(free=10e9), mock.Mock(free=10e9), mock.Mock(free=10e11)])
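
The extra first parameter in test_download_data_target exists because mock.patch, used as a decorator, injects the created mock as the first positional argument, ahead of pytest fixtures such as tmpdir. A tiny illustration of that ordering with a stand-in target:

import os
from unittest import mock


@mock.patch("os.getcwd")  # stand-in target; the test patches _wait_for_disk_usage_higher_than_threshold
def check_patch_ordering(getcwd_mock, label="fixture-like trailing argument"):
    getcwd_mock.return_value = "/patched"
    assert os.getcwd() == "/patched"
    getcwd_mock.assert_called()  # same style of assertion the test now makes
    return label


check_patch_ordering()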

tests/tests_data/streaming/test_dataset.py

Lines changed: 5 additions & 6 deletions

@@ -161,7 +161,7 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir):
     dataset_iter = iter(dataset)
     assert len(dataset_iter) == 548
     process_1_1 = list(dataset_iter)
-    assert process_1_1[:10] == [785, 788, 782, 783, 789, 787, 786, 781, 784, 780]
+    assert process_1_1[:10] == [788, 781, 785, 780, 787, 782, 789, 784, 783, 786]
     assert len(process_1_1) == 548
 
     dataset_2 = StreamingDataset(input_dir=str(tmpdir), shuffle=True, drop_last=drop_last)
@@ -172,7 +172,7 @@ def test_streaming_dataset_distributed_full_shuffle_odd(drop_last, tmpdir):
     dataset_2_iter = iter(dataset_2)
     assert len(dataset_2_iter) == 548 + int(not drop_last)
     process_2_1 = list(dataset_2_iter)
-    assert process_2_1[:10] == [939, 938, 252, 259, 257, 255, 258, 253, 250, 251]
+    assert process_2_1[:10] == [939, 938, 253, 259, 256, 258, 252, 255, 251, 257]
     assert len(process_2_1) == 548 + int(not drop_last)
     assert len([i for i in process_1_1 if i in process_2_1]) == 0
 
@@ -201,7 +201,7 @@ def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir):
     dataset_iter = iter(dataset)
     assert len(dataset_iter) == 611
     process_1_1 = list(dataset_iter)
-    assert process_1_1[:10] == [185, 184, 182, 189, 187, 181, 183, 180, 186, 188]
+    assert process_1_1[:10] == [188, 181, 185, 180, 187, 182, 189, 184, 183, 186]
     assert len(process_1_1) == 611
 
     dataset_2 = StreamingDataset(input_dir=str(tmpdir), shuffle=True, drop_last=drop_last)
@@ -212,9 +212,8 @@ def test_streaming_dataset_distributed_full_shuffle_even(drop_last, tmpdir):
     dataset_2_iter = iter(dataset_2)
     assert len(dataset_2_iter) == 611
     process_2_1 = list(dataset_2_iter)
-    assert process_2_1[:10] == [813, 815, 816, 812, 818, 811, 817, 814, 819, 277]
+    assert process_2_1[:10] == [818, 812, 816, 811, 819, 813, 815, 814, 817, 273]
     assert len(process_2_1) == 611
-
     assert len([i for i in process_1_1 if i in process_2_1]) == 0
 
 
@@ -530,7 +529,7 @@ def test_s3_streaming_dataset():
     assert dataset.input_dir.path is None
 
 
-def test_resumable_dataset(tmpdir):
+def test_resumable_dataset_single_worker(tmpdir):
     seed_everything(42)
 
     block_size = 20

tests/tests_pytorch/checkpointing/test_model_checkpoint.py

Lines changed: 4 additions & 4 deletions

@@ -1474,8 +1474,8 @@ def test_train_epoch_end_ckpt_with_no_validation():
     assert not trainer.checkpoint_callback._should_save_on_train_epoch_end(trainer)
 
 
-@pytest.mark.parametrize("same_resume_folder", [True, False])
-def test_resume_and_old_checkpoint_files_remain(same_resume_folder, tmp_path):
+@pytest.mark.parametrize("same_checkpoint_dir", [True, False])
+def test_resume_and_old_checkpoint_files_remain(same_checkpoint_dir, tmp_path):
     """Test that checkpoints saved in the resume-folder won't be deleted under the save-top-k mechanism."""
     model = BoringModel()
     trainer_kwargs = {
@@ -1488,7 +1488,7 @@ def test_resume_and_old_checkpoint_files_remain(same_resume_folder, tmp_path):
     }
     first = tmp_path / "first"
     second = tmp_path / "second"
-    new_dirpath = first if same_resume_folder else second
+    new_dirpath = first if same_checkpoint_dir else second
 
     # Generate checkpoints in the first folder
     callback = ModelCheckpoint(dirpath=first, monitor="step", mode="max", save_top_k=2, every_n_train_steps=2)
@@ -1500,7 +1500,7 @@ def test_resume_and_old_checkpoint_files_remain(same_resume_folder, tmp_path):
     callback = ModelCheckpoint(dirpath=new_dirpath, monitor="step", mode="max", save_top_k=2, every_n_train_steps=2)
     trainer = Trainer(callbacks=callback, max_steps=8, **trainer_kwargs)
     trainer.fit(model, ckpt_path=str(first / "epoch=0-step=4.ckpt"))
-    if same_resume_folder:
+    if same_checkpoint_dir:
         assert set(os.listdir(first)) == {
             "epoch=0-step=4.ckpt",  # do not delete checkpoint from which we resume from
             "epoch=0-step=6.ckpt",
