
Commit f1e0fda

Rename Strategy.reduce to Strategy.all_reduce in Lite (#16370)

1 parent 596494b commit f1e0fda

File tree

11 files changed (+47 / -44 lines)

src/lightning_fabric/CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -5,6 +5,9 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+- Renamed `Strategy.reduce` to `Strategy.all_reduce` in all strategies ([#16370](https://github.com/Lightning-AI/lightning/issues/16370))
+
+
 ## [1.9.0] - 2023-01-12
 
 ### Added
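
For downstream code, the rename is a single call-site change. A minimal migration sketch, assuming a `strategy` object obtained from a Fabric/Lite setup (the variable and tensor names here are illustrative, not part of the commit):

```python
# Minimal migration sketch (hypothetical call site, not from this commit).
import torch

loss = torch.tensor(1.0)

# Before this commit:
#   avg_loss = strategy.reduce(loss, reduce_op="mean")

# After this commit:
avg_loss = strategy.all_reduce(loss, reduce_op="mean")
```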

src/lightning_fabric/strategies/ddp.py

Lines changed: 1 addition & 1 deletion

@@ -120,7 +120,7 @@ def setup_module(self, module: Module) -> DistributedDataParallel:
     def module_to_device(self, module: Module) -> None:
         module.to(self.root_device)
 
-    def reduce(
+    def all_reduce(
         self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
     ) -> Tensor:
         """Reduces a tensor from several distributed processes to one aggregated tensor.

src/lightning_fabric/strategies/dp.py

Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ def batch_to_device(self, batch: Any, device: Optional[torch.device] = None) ->
         # DataParallel handles the transfer of batch to the device
         return batch
 
-    def reduce(
+    def all_reduce(
         self, collection: TReduce, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
     ) -> TReduce:
         def mean(t: Tensor) -> Tensor:

src/lightning_fabric/strategies/fsdp.py

Lines changed: 1 addition & 1 deletion

@@ -245,7 +245,7 @@ def module_sharded_context(self) -> Generator:
         ):
             yield
 
-    def reduce(
+    def all_reduce(
         self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
     ) -> Tensor:
         if isinstance(tensor, Tensor):

src/lightning_fabric/strategies/parallel.py

Lines changed: 1 addition & 1 deletion

@@ -94,7 +94,7 @@ def reduce_boolean_decision(self, decision: bool, all: bool = True) -> bool:
             bool: The reduced boolean decision.
         """
         decision = torch.tensor(int(decision), device=self.root_device)
-        decision = self.reduce(decision, reduce_op=ReduceOp.SUM)
+        decision = self.all_reduce(decision, reduce_op=ReduceOp.SUM)
         decision = bool(decision == self.world_size) if all else bool(decision)
         return decision
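
The surrounding `reduce_boolean_decision` helper explains the choice of `ReduceOp.SUM`: each rank contributes 0 or 1, so a sum equal to `world_size` means every rank voted True, while any non-zero sum means at least one did. A standalone sketch of the same logic written directly against `torch.distributed` rather than the Strategy API (assumes an initialized process group; names are illustrative):

```python
# Standalone illustration of the boolean-decision reduction above,
# using torch.distributed directly instead of the Strategy API.
import torch
import torch.distributed as dist

def reduce_boolean_decision(decision: bool, require_all: bool = True) -> bool:
    votes = torch.tensor(int(decision))
    dist.all_reduce(votes, op=dist.ReduceOp.SUM)  # votes == number of ranks that said True
    if require_all:
        return bool(votes.item() == dist.get_world_size())
    return bool(votes.item() > 0)
```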

src/lightning_fabric/strategies/single_device.py

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ def is_global_zero(self) -> bool:
     def module_to_device(self, module: Module) -> None:
         module.to(self.root_device)
 
-    def reduce(self, tensor: Any | Tensor, *args: Any, **kwargs: Any) -> Any | Tensor:
+    def all_reduce(self, tensor: Any | Tensor, *args: Any, **kwargs: Any) -> Any | Tensor:
         """Reduces a tensor from several distributed processes to one aggregated tensor. As this plugin only
         operates with a single device, the reduction is simply the identity.
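
Since this strategy runs in a single process, the renamed method keeps its identity semantics. A quick hypothetical usage check (mirroring the unit test further below, and assuming `SingleDeviceStrategy` is importable from `lightning_fabric.strategies` as laid out in this tree):

```python
# Hypothetical usage: on a single device there are no peers, so all_reduce
# returns its input unchanged.
import torch
from lightning_fabric.strategies import SingleDeviceStrategy

strategy = SingleDeviceStrategy()
x = torch.tensor([1.0, 2.0])
assert torch.equal(strategy.all_reduce(x), x)
```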

src/lightning_fabric/strategies/strategy.py

Lines changed: 11 additions & 11 deletions

@@ -169,7 +169,17 @@ def optimizer_step(
         return self.precision.optimizer_step(optimizer, **kwargs)
 
     @abstractmethod
-    def reduce(
+    def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
+        """Perform an all_gather on all processes.
+
+        Args:
+            tensor: the tensor to all_gather
+            group: the process group to gather results from
+            sync_grads: flag that allows users to synchronize gradients for all_gather op
+        """
+
+    @abstractmethod
+    def all_reduce(
         self,
         tensor: Union[Tensor, Any],
         group: Optional[Any] = None,

@@ -201,16 +211,6 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
             src: source rank
         """
 
-    @abstractmethod
-    def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
-        """Perform an all_gather on all processes.
-
-        Args:
-            tensor: the tensor to all_gather
-            group: the process group to gather results from
-            sync_grads: flag that allows users to synchronize gradients for all_gather op
-        """
-
     def reduce_boolean_decision(self, decision: bool, all: bool = True) -> bool:
         """Reduce a boolean decision across all processes."""
         return decision
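
With the rename, a concrete strategy now has to provide both `all_gather` and `all_reduce` with these signatures. A minimal sketch of a hypothetical implementation with single-process semantics (the class name is illustrative and not part of the commit):

```python
# Hypothetical minimal implementation of the two abstract collectives above,
# with single-process (no-op) semantics for illustration.
from typing import Any, Optional, Union
from torch import Tensor

class _NoOpCollectives:
    def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
        # No peers to gather from: return the local tensor as-is.
        return tensor

    def all_reduce(
        self,
        tensor: Union[Tensor, Any],
        group: Optional[Any] = None,
        reduce_op: Optional[Union[str, Any]] = "mean",
    ) -> Union[Tensor, Any]:
        # With a single process the aggregate equals the local value.
        return tensor
```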

src/lightning_fabric/strategies/xla.py

Lines changed: 19 additions & 19 deletions

@@ -118,7 +118,25 @@ def process_dataloader(self, dataloader: DataLoader) -> "MpDeviceLoader":
         dataloader.dataset = dataloader._loader.dataset
         return dataloader
 
-    def reduce(
+    def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
+        """Function to gather a tensor from several distributed processes.
+
+        Args:
+            tensor: tensor of shape (batch, ...)
+            group: not available with TPUs
+            sync_grads: flag that allows users to synchronize gradients for the all_gather operation
+        Return:
+            A tensor of shape (world_size, batch, ...)
+        """
+        if isinstance(tensor, Tensor) and tensor.dim() == 0:
+            tensor = tensor.unsqueeze(0)
+
+        import torch_xla.core.functions as xf
+        import torch_xla.core.xla_model as xm
+
+        return xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
+
+    def all_reduce(
         self, output: Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None
     ) -> Tensor:
         if not isinstance(output, Tensor):

@@ -160,24 +178,6 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
             obj = torch.load(buffer)
         return obj
 
-    def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor:
-        """Function to gather a tensor from several distributed processes.
-
-        Args:
-            tensor: tensor of shape (batch, ...)
-            group: not available with TPUs
-            sync_grads: flag that allows users to synchronize gradients for the all_gather operation
-        Return:
-            A tensor of shape (world_size, batch, ...)
-        """
-        if isinstance(tensor, Tensor) and tensor.dim() == 0:
-            tensor = tensor.unsqueeze(0)
-
-        import torch_xla.core.functions as xf
-        import torch_xla.core.xla_model as xm
-
-        return xf.all_gather(tensor) if sync_grads else xm.all_gather(tensor)
-
     def save_checkpoint(
         self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None
     ) -> None:
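
The `all_gather` body that moved above `all_reduce` first unsqueezes 0-dim tensors so there is a dimension to gather along, then picks `xf.all_gather` (differentiable) or `xm.all_gather` depending on `sync_grads`. A hypothetical call pattern inside a per-core function (names are illustrative; requires a TPU environment with `torch_xla`):

```python
# Hypothetical per-TPU-core usage of XLAStrategy.all_gather / all_reduce.
import torch

def per_core_fn(strategy):
    local = torch.tensor(1.0, device=strategy.root_device)  # 0-dim, gets unsqueezed internally
    gathered = strategy.all_gather(local)   # one contribution per core along dim 0
    summed = strategy.all_reduce(local, reduce_op="sum")
    return gathered, summed
```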

tests/tests_fabric/strategies/launchers/test_xla.py

Lines changed: 4 additions & 4 deletions

@@ -60,14 +60,14 @@ def test_broadcast_on_tpu():
 
 def tpu_reduce_fn(strategy):
     with pytest.raises(ValueError, match="XLAStrategy only supports"):
-        strategy.reduce(1, reduce_op="undefined")
+        strategy.all_reduce(1, reduce_op="undefined")
 
     with pytest.raises(ValueError, match="XLAStrategy only supports"):
-        strategy.reduce(1, reduce_op=ReduceOp.MAX)
+        strategy.all_reduce(1, reduce_op=ReduceOp.MAX)
 
     # it is faster to loop over here than to parameterize the test
     for reduce_op in ("mean", "AVG", "sum", ReduceOp.SUM):
-        result = strategy.reduce(1, reduce_op=reduce_op)
+        result = strategy.all_reduce(1, reduce_op=reduce_op)
         if isinstance(reduce_op, str) and reduce_op.lower() in ("mean", "avg"):
             assert result.item() == 1
         else:

@@ -77,7 +77,7 @@ def tpu_reduce_fn(strategy):
 @RunIf(tpu=True)
 @mock.patch.dict(os.environ, os.environ.copy(), clear=True)
 def test_tpu_reduce():
-    """Test tpu spawn reduce operation."""
+    """Test tpu spawn all_reduce operation."""
     xla_launch(tpu_reduce_fn)

tests/tests_fabric/strategies/test_single_device.py

Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ def test_single_device_collectives():
     strategy = SingleDeviceStrategy()
     tensor = Mock()
     assert strategy.all_gather(tensor) == tensor
-    assert strategy.reduce(tensor) == tensor
+    assert strategy.all_reduce(tensor) == tensor
     assert strategy.broadcast(tensor) == tensor
