From 0060424996114f6b659d5d358bbf29ebfe65a2fd Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 00:22:45 +0100
Subject: [PATCH 01/18] third party docs

---
 dockers/base-cuda/Dockerfile                |  9 -------
 dockers/nvidia/Dockerfile                   |  2 --
 docs/source-pytorch/extensions/strategy.rst | 27 ++++++++++++++++++---
 requirements/pytorch/strategies.txt         |  1 +
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index d2bd534e33776..424b82ce532ce 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -98,18 +98,9 @@ RUN \
     pip install -r requirements/pytorch/base.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
     rm assistant.py
 
-RUN \
-    # install ColossalAI
-    # TODO: 1.13 wheels are not released, remove skip once they are
-    if [[ $PYTORCH_VERSION != "1.13" ]]; then \
-        pip install "colossalai==0.2.4"; \
-        python -c "import colossalai; print(colossalai.__version__)" ; \
-    fi
 
 RUN \
     # install rest of strategies
-    # remove colossalai from requirements since they are installed separately
-    python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
     cat requirements/pytorch/strategies.txt && \
     pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html
 
diff --git a/dockers/nvidia/Dockerfile b/dockers/nvidia/Dockerfile
index 9bb97e92af04e..cb76595f3eac7 100644
--- a/dockers/nvidia/Dockerfile
+++ b/dockers/nvidia/Dockerfile
@@ -43,8 +43,6 @@ RUN \
 
 # Installations \
     pip install "Pillow>=8.2, !=8.3.0" "cryptography>=3.4" "py>=1.10" --no-cache-dir && \
-    # remove colossalai from requirements since they are installed separately
-    python -c "fname = 'lightning/requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'colossalai' not in line] ; open(fname, 'w').writelines(lines)" ; \
     PACKAGE_NAME=pytorch pip install './lightning[extra,loggers,strategies]' --no-cache-dir && \
     rm -rf lightning && \
     pip list
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index c92de855c440a..cbfd559e47e92 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -23,7 +23,7 @@ plugin and other optional plugins such as the :ref:`ClusterEnvironment <extensio
 We expose Strategies mainly for expert users that want to extend Lightning for new hardware support or new distributed backends (e.g. a backend not yet supported by `PyTorch <https://pytorch.org/docs/stable/distributed.html#backends>`_ itself).
 
 
-----------
+----
 
 *****************************
 Selecting a Built-in Strategy
@@ -69,9 +69,6 @@ The below table lists all relevant strategies available in Lightning with their
    * - Name
      - Class
      - Description
-   * - colossalai
-     - :class:`~pytorch_lightning.strategies.ColossalAIStrategy`
-     - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. <https://www.colossalai.org/>`__
    * - fsdp
      - :class:`~pytorch_lightning.strategies.FSDPStrategy`
      - Strategy for Fully Sharded Data Parallel training. :ref:`Learn more. <advanced/model_parallel:Fully Sharded Training>`
@@ -105,6 +102,28 @@ The below table lists all relevant strategies available in Lightning with their
 
 ----
 
+
+**********************
+Third-party Strategies
+**********************
+
+There are powerful third-party strategies that integrate well with Lightning but aren't maintained as part of the ``lightning`` package.
+
+.. list-table:: List of third-party strategy implementations
+   :widths: 20 20 20
+   :header-rows: 1
+
+   * - Name
+     - Package
+     - Description
+   * - colossalai
+     - `Lightning-AI/lightning-colossalai <https://github.com/Lightning-AI/lightning-colossalai>`_
+     - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. <https://www.colossalai.org/>`__
+
+
+----
+
+
 ************************
 Create a Custom Strategy
 ************************
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index c8a5c9531fe3d..6f5d1e431f18c 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,3 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
+lightning_colossalai @ https://github.com/Lightning-AI/lightning-colossalai  # TODO: Update with pypi release once available

From c3d2ad8931cdb52d7aaff5d0cc07ac8d7cb1c004 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 00:35:12 +0100
Subject: [PATCH 02/18] move

---
 .../advanced/model_parallel/colossalai.rst    |  96 ++++++++++++++
 .../{ => model_parallel}/model_parallel.rst   | 125 +-----------------
 2 files changed, 101 insertions(+), 120 deletions(-)
 create mode 100644 docs/source-pytorch/advanced/model_parallel/colossalai.rst
 rename docs/source-pytorch/advanced/{ => model_parallel}/model_parallel.rst (84%)

diff --git a/docs/source-pytorch/advanced/model_parallel/colossalai.rst b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
new file mode 100644
index 0000000000000..e8c8528158e02
--- /dev/null
+++ b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
@@ -0,0 +1,96 @@
+.. _colossalai:
+
+###########
+Colossal-AI
+###########
+
+
+:class:`~pytorch_lightning.strategies.colossalai.ColossalAIStrategy` implements ZeRO-DP with chunk-based memory management.
+With this chunk mechanism, really large models can be trained with a small number of GPUs.
+It supports larger trainable model size and batch size than usual heterogeneous training by reducing CUDA memory fragments and CPU memory consumption.
+Also, it speeds up this kind of heterogeneous training by fully utilizing all kinds of resources.
+
+When enabling chunk mechanism, a set of consecutive parameters are stored in a chunk, and then the chunk is sharded across different processes.
+This can reduce communication and data transmission frequency and fully utilize communication and PCI-E bandwidth, which makes training faster.
+
+Unlike traditional implementations, which adopt static memory partition, we implemented a dynamic heterogeneous memory management system named Gemini.
+During the first training step, the warmup phase will sample the maximum non-model data memory (memory usage except parameters, gradients, and optimizer states).
+In later training, it will use the collected memory usage information to evict chunks dynamically.
+Gemini allows you to fit much larger models with limited GPU memory.
+
+According to our benchmark results, we can train models with up to 24 billion parameters in 1 GPU.
+You can install colossalai by consulting `how to download colossalai <https://colossalai.org/download>`_.
+Then, run this benchmark in `Colossalai-PL/gpt <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning/tree/main/benchmark/gpt>`_.
+
+Here is an example showing how to use ColossalAI:
+
+.. code-block:: python
+
+    from colossalai.nn.optimizer import HybridAdam
+
+
+    class MyBert(LightningModule):
+        ...
+
+        def configure_sharded_model(self) -> None:
+            # create your model here
+            self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+        def configure_optimizers(self):
+            # use the specified optimizer
+            optimizer = HybridAdam(self.model.parameters(), self.lr)
+
+        ...
+
+
+    model = MyBert()
+    trainer = Trainer(accelerator="gpu", devices=1, precision=16, strategy="colossalai")
+    trainer.fit(model)
+
+You can find more examples in the `Colossalai-PL <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning>`_ repository.
+
+.. note::
+
+    *   The only accelerator which ColossalAI supports is ``"gpu"``. But CPU resources will be used when the placement policy is set to "auto" or "cpu".
+
+    *   The only precision which ColossalAI allows is 16 (FP16).
+
+    *   It only supports a single optimizer, which must be ``colossalai.nn.optimizer.CPUAdam`` or ``colossalai.nn.optimizer.HybridAdam`` now.
+        You can set ``adamw_mode`` to False to use normal Adam. Note that ``HybridAdam`` is highly optimized: it uses a fused CUDA kernel and a parallel CPU kernel.
+        It is recommended to use ``HybridAdam``, since it updates parameters on both GPU and CPU.
+
+    *   Your model must be created using the :meth:`~pytorch_lightning.core.module.LightningModule.configure_sharded_model` method.
+
+    *   ``ColossalAIStrategy`` doesn't support gradient accumulation as of now.
+
+.. _colossal_placement_policy:
+
+Placement Policy
+================
+
+Placement policies can help users fully exploit their GPU-CPU heterogeneous memory space for better training efficiency.
+There are three options for the placement policy.
+They are "cpu", "cuda" and "auto" respectively.
+
+When the placement policy is set to "cpu", all participated parameters will be offloaded into CPU memory immediately at the end of every auto-grad operation.
+In this way, "cpu" placement policy uses the least CUDA memory.
+It is the best choice for users who want to exceptionally enlarge their model size or training batch size.
+
+When using "cuda" option, all parameters are placed in the CUDA memory, no CPU resources will be used during the training.
+It is for users who get plenty of CUDA memory.
+
+The third option, "auto", enables Gemini.
+It monitors the consumption of CUDA memory during the warmup phase and collects CUDA memory usage of all auto-grad operations.
+In later training steps, Gemini automatically manages the data transmission between GPU and CPU according to collected CUDA memory usage information.
+It is the fastest option when CUDA memory is enough.
+
+Here's an example of changing the placement policy to "cpu".
+
+.. code-block:: python
+
+    from pytorch_lightning.strategies import ColossalAIStrategy
+
+    model = MyModel()
+    my_strategy = ColossalAIStrategy(placement_policy="cpu")
+    trainer = Trainer(accelerator="gpu", devices=4, precision=16, strategy=my_strategy)
+    trainer.fit(model)
\ No newline at end of file
diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
similarity index 84%
rename from docs/source-pytorch/advanced/model_parallel.rst
rename to docs/source-pytorch/advanced/model_parallel/model_parallel.rst
index 9b3030f02ec8c..5c83a7f1f2a69 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
@@ -52,133 +52,18 @@ Sharding techniques help when model sizes are fairly large; roughly 500M+ parame
 * When your model is small (ResNet50 of around 80M Parameters), unless you are using unusually large batch sizes or inputs.
 * Due to high distributed communication between devices, if running on a slow network/interconnect, the training might be much slower than expected and then it's up to you to determince the tradeoff here.
 
-----------
-
-.. _colossalai:
-
-***********
-Colossal-AI
-***********
-
-:class:`~pytorch_lightning.strategies.colossalai.ColossalAIStrategy` implements ZeRO-DP with chunk-based memory management.
-With this chunk mechanism, really large models can be trained with a small number of GPUs.
-It supports larger trainable model size and batch size than usual heterogeneous training by reducing CUDA memory fragments and CPU memory consumption.
-Also, it speeds up this kind of heterogeneous training by fully utilizing all kinds of resources.
-
-When enabling chunk mechanism, a set of consecutive parameters are stored in a chunk, and then the chunk is sharded across different processes.
-This can reduce communication and data transmission frequency and fully utilize communication and PCI-E bandwidth, which makes training faster.
-
-Unlike traditional implementations, which adopt static memory partition, we implemented a dynamic heterogeneous memory management system named Gemini.
-During the first training step, the warmup phase will sample the maximum non-model data memory (memory usage expect parameters, gradients, and optimizer states).
-In later training, it will use the collected memory usage information to evict chunks dynamically.
-Gemini allows you to fit much larger models with limited GPU memory.
-
-According to our benchmark results, we can train models with up to 24 billion parameters in 1 GPU.
-You can install colossalai by consulting `how to download colossalai <https://colossalai.org/download>`_.
-Then, run this benchmark in `Colossalai-PL/gpt <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning/tree/main/benchmark/gpt>`_.
-
-Here is an example showing how to use ColossalAI:
-
-.. code-block:: python
-
-    from colossalai.nn.optimizer import HybridAdam
-
-
-    class MyBert(LightningModule):
-        ...
-
-        def configure_sharded_model(self) -> None:
-            # create your model here
-            self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
-
-        def configure_optimizers(self):
-            # use the specified optimizer
-            optimizer = HybridAdam(self.model.parameters(), self.lr)
-
-        ...
-
-
-    model = MyBert()
-    trainer = Trainer(accelerator="gpu", devices=1, precision=16, strategy="colossalai")
-    trainer.fit(model)
-
-You can find more examples in the `Colossalai-PL <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning>`_ repository.
-
-.. note::
-
-    *   The only accelerator which ColossalAI supports is ``"gpu"``. But CPU resources will be used when the placement policy is set to "auto" or "cpu".
 
-    *   The only precision which ColossalAI allows is 16 (FP16).
+Cutting-edge and experimental strategies
+========================================
 
-    *   It only supports a single optimizer, which must be ``colossalai.nn.optimizer.CPUAdam`` or ``colossalai.nn.optimizer.
-        HybridAdam`` now. You can set ``adamw_mode`` to False to use normal Adam. Noticing that ``HybridAdam`` is highly optimized, it uses fused CUDA kernel and parallel CPU kernel.
-        It is recomended to use ``HybridAdam``, since it updates parameters in GPU and CPU both.
+TODO
 
-    *   Your model must be created using the :meth:`~pytorch_lightning.core.module.LightningModule.configure_sharded_model` method.
+:doc:`Colossal-AI Strategy <./colossalai>`
 
-    *   ``ColossalaiStrategy`` doesn't support gradient accumulation as of now.
-
-.. _colossal_placement_policy:
-
-Placement Policy
-================
-
-Placement policies can help users fully exploit their GPU-CPU heterogeneous memory space for better training efficiency.
-There are three options for the placement policy.
-They are "cpu", "cuda" and "auto" respectively.
-
-When the placement policy is set to "cpu", all participated parameters will be offloaded into CPU memory immediately at the end of every auto-grad operation.
-In this way, "cpu" placement policy uses the least CUDA memory.
-It is the best choice for users who want to exceptionally enlarge their model size or training batch size.
-
-When using "cuda" option, all parameters are placed in the CUDA memory, no CPU resources will be used during the training.
-It is for users who get plenty of CUDA memory.
-
-The third option, "auto", enables Gemini.
-It monitors the consumption of CUDA memory during the warmup phase and collects CUDA memory usage of all auto-grad operations.
-In later training steps, Gemini automatically manages the data transmission between GPU and CPU according to collected CUDA memory usage information.
-It is the fastest option when CUDA memory is enough.
-
-Here's an example of changing the placement policy to "cpu".
-
-.. code-block:: python
-
-    from pytorch_lightning.strategies import ColossalAIStrategy
-
-    model = MyModel()
-    my_strategy = ColossalAIStrategy(placement_policy="cpu")
-    trainer = Trainer(accelerator="gpu", devices=4, precision=16, strategy=my_strategy)
-    trainer.fit(model)
-
-.. _sharded-training:
-
-****************
-Sharded Training
-****************
-
-The technique can be found within `DeepSpeed ZeRO <https://arxiv.org/abs/1910.02054>`_ and
-`ZeRO-2 <https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/>`_,
-however the implementation is built from the ground up to be PyTorch compatible and standalone.
-Sharded Training allows you to maintain GPU scaling efficiency, whilst reducing memory overhead drastically. In short, expect near-normal linear scaling (if your network allows), and significantly reduced memory usage when training large models.
-
-Sharded Training still utilizes Data Parallel Training under the hood, except optimizer states and gradients are sharded across GPUs.
-This means the memory overhead per GPU is lower, as each GPU only has to maintain a partition of your optimizer state and gradients.
-
-The benefits vary by model and parameter sizes, but we've recorded up to a 63% memory reduction per GPU allowing us to double our model sizes. Because of efficient communication,
-these benefits in multi-GPU setups are almost free and throughput scales well with multi-node setups.
-
-It is highly recommended to use Sharded Training in multi-GPU environments where memory is limited, or where training larger models are beneficial (500M+ parameter models).
-A technical note: as batch size scales, storing activations for the backwards pass becomes the bottleneck in training. As a result, sharding optimizer state and gradients becomes less impactful.
-
-.. code-block:: python
-
-    # train using Sharded DDP
-    trainer = Trainer(strategy="ddp_sharded")
-
-Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required.
 
 ----
 
+
 .. _fully-sharded-training:
 
 **********************

From 33a4a45d530eee51115bc207b60e3dedc7c66029 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 15 Feb 2023 23:38:24 +0000
Subject: [PATCH 03/18] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/source-pytorch/advanced/model_parallel/colossalai.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/advanced/model_parallel/colossalai.rst b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
index e8c8528158e02..d84ad3d03a30d 100644
--- a/docs/source-pytorch/advanced/model_parallel/colossalai.rst
+++ b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
@@ -93,4 +93,4 @@ Here's an example of changing the placement policy to "cpu".
     model = MyModel()
     my_strategy = ColossalAIStrategy(placement_policy="cpu")
     trainer = Trainer(accelerator="gpu", devices=4, precision=16, strategy=my_strategy)
-    trainer.fit(model)
\ No newline at end of file
+    trainer.fit(model)

From 6e285f7be9ce33728dbd2e253ab2b6ff25d0c993 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 00:41:02 +0100
Subject: [PATCH 04/18] .git

---
 requirements/pytorch/strategies.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 6f5d1e431f18c..f2355e814ead7 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,4 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
-lightning_colossalai @ https://github.com/Lightning-AI/lightning-colossalai  # TODO: Update with pypi release once available
+lightning_colossalai @ https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available

From 837431991da77029b0b4481a201c3774156d0383 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 00:49:05 +0100
Subject: [PATCH 05/18] git

---
 requirements/pytorch/strategies.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index f2355e814ead7..0cd5d3764ee5c 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,4 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
-lightning_colossalai @ https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available
+git+https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available

From b20d9397746dc2705d18f7f6da08cca13c75c10f Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 06:04:21 +0100
Subject: [PATCH 06/18] update docs

---
 .../advanced/model_parallel/colossalai.rst    | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/docs/source-pytorch/advanced/model_parallel/colossalai.rst b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
index d84ad3d03a30d..81f9fde08011e 100644
--- a/docs/source-pytorch/advanced/model_parallel/colossalai.rst
+++ b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
@@ -5,7 +5,7 @@ Colossal-AI
 ###########
 
 
-:class:`~pytorch_lightning.strategies.colossalai.ColossalAIStrategy` implements ZeRO-DP with chunk-based memory management.
+The Colossal-AI strategy implements ZeRO-DP with chunk-based memory management.
 With this chunk mechanism, really large models can be trained with a small number of GPUs.
 It supports larger trainable model size and batch size than usual heterogeneous training by reducing CUDA memory fragments and CPU memory consumption.
 Also, it speeds up this kind of heterogeneous training by fully utilizing all kinds of resources.
@@ -19,35 +19,31 @@ In later training, it will use the collected memory usage information to evict c
 Gemini allows you to fit much larger models with limited GPU memory.
 
 According to our benchmark results, we can train models with up to 24 billion parameters in 1 GPU.
-You can install colossalai by consulting `how to download colossalai <https://colossalai.org/download>`_.
-Then, run this benchmark in `Colossalai-PL/gpt <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning/tree/main/benchmark/gpt>`_.
 
-Here is an example showing how to use ColossalAI:
+You can install the Colossal-AI integration by running
 
-.. code-block:: python
+.. code-block:: bash
+
+    pip install lightning-colossalai
+
+This will install both the `colossalai <https://colossalai.org/download>`_ package as well as the ``ColossalAIStrategy`` for the Lightning Trainer:
 
-    from colossalai.nn.optimizer import HybridAdam
+.. code-block:: python
 
+    trainer = Trainer(strategy="colossalai", precision=16, devices=...)
 
-    class MyBert(LightningModule):
-        ...
 
-        def configure_sharded_model(self) -> None:
-            # create your model here
-            self.model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+You can tune several settings by instantiating the strategy object and passing options in:
 
-        def configure_optimizers(self):
-            # use the specified optimizer
-            optimizer = HybridAdam(self.model.parameters(), self.lr)
+.. code-block:: python
 
-        ...
+    from lightning_colossalai import ColossalAIStrategy
 
+    strategy = ColossalAIStrategy(...)
+    trainer = Trainer(strategy=strategy, precision=16, devices=...)
 
-    model = MyBert()
-    trainer = Trainer(accelerator="gpu", devices=1, precision=16, strategy="colossalai")
-    trainer.fit(model)
 
-You can find more examples in the `Colossalai-PL <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning>`_ repository.
+See a full example of a benchmark with a `GPT-2 model <https://github.com/hpcaitech/ColossalAI-Pytorch-lightning/tree/main/benchmark/gpt>`_ of up to 24 billion parameters.
 
 .. note::
 
@@ -88,7 +84,7 @@ Here's an example of changing the placement policy to "cpu".
 
 .. code-block:: python
 
-    from pytorch_lightning.strategies import ColossalAIStrategy
+    from lightning_colossalai import ColossalAIStrategy
 
     model = MyModel()
     my_strategy = ColossalAIStrategy(placement_policy="cpu")

From faacb87318514f19190228d2bb3fdaa239b10f07 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 06:10:03 +0100
Subject: [PATCH 07/18] colossal ai intro text

---
 .../advanced/model_parallel/model_parallel.rst             | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/source-pytorch/advanced/model_parallel/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
index 5c83a7f1f2a69..14ff8c3e3c5ba 100644
--- a/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
@@ -53,12 +53,11 @@ Sharding techniques help when model sizes are fairly large; roughly 500M+ parame
 * Due to high distributed communication between devices, if running on a slow network/interconnect, the training might be much slower than expected and then it's up to you to determince the tradeoff here.
 
 
-Cutting-edge and experimental strategies
+Cutting-edge and Experimental Strategies
 ========================================
 
-TODO
-
-:doc:`Colossal-AI Strategy <./colossalai>`
+Cutting-edge Lightning strategies are being developed by third-parties outside of Lightning.
+If you want to be the first to try the latest and greatest experimental features for model-parallel training, check out the :doc:`Colossal-AI Strategy <./colossalai>` integration.
 
 
 ----

From 08359e4833136ab194b46893a69c57313d442f5e Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 06:11:39 +0100
Subject: [PATCH 08/18] workaround

---
 .azure/gpu-tests-pytorch.yml        | 2 ++
 requirements/pytorch/strategies.txt | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index e11b515899de4..1bbc72724d8df 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -113,6 +113,8 @@ jobs:
     - bash: |
         set -e
         pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
+        # TODO: Update with pypi release once available and move to strategies.txt
+        pip install git+https://github.com/Lightning-AI/lightning-colossalai.git
         python requirements/pytorch/check-avail-strategies.py
       condition: eq(variables['scope'], 'strategies')
       displayName: 'Install strategies'
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 0cd5d3764ee5c..c8a5c9531fe3d 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,4 +2,3 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
-git+https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available

From 634bb5158d7b15ee3baaa07d33c924376573a42b Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 06:25:58 +0100
Subject: [PATCH 09/18] changelog

---
 src/lightning/pytorch/CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
index e0a3c1969e26a..19639fa32362b 100644
--- a/src/lightning/pytorch/CHANGELOG.md
+++ b/src/lightning/pytorch/CHANGELOG.md
@@ -303,6 +303,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Removed the `QuantizationAwareTraining` callback ([#16750](https://github.com/Lightning-AI/lightning/pull/16750))
 
 
+- Removed the `ColossalAIStrategy` and `ColossalAIPrecisionPlugin` in favor of the new [lightning-colossalai](https://github.com/Lightning-AI/lightning-colossalai) package ([#16757](https://github.com/Lightning-AI/lightning/pull/16757), [#16778](https://github.com/Lightning-AI/lightning/pull/16778))
+
+
 ### Fixed
 
 - Fixed an attribute error and improved input validation for invalid strategy types being passed to Trainer ([#16693](https://github.com/Lightning-AI/lightning/pull/16693))

From 48f76b6bdb8e55cde11c0a1803258bc3f9c28745 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 06:11:39 +0100
Subject: [PATCH 10/18] Revert "workaround"

This reverts commit 08359e4833136ab194b46893a69c57313d442f5e.
---
 .azure/gpu-tests-pytorch.yml        | 2 --
 requirements/pytorch/strategies.txt | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index 1bbc72724d8df..e11b515899de4 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -113,8 +113,6 @@ jobs:
     - bash: |
         set -e
         pip install -r requirements/pytorch/strategies.txt --find-links ${TORCH_URL}
-        # TODO: Update with pypi release once available and move to strategies.txt
-        pip install git+https://github.com/Lightning-AI/lightning-colossalai.git
         python requirements/pytorch/check-avail-strategies.py
       condition: eq(variables['scope'], 'strategies')
       displayName: 'Install strategies'
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index c8a5c9531fe3d..0cd5d3764ee5c 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,3 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
+git+https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available

From b567ff53bc5c93ed49c56d247ed0e8c1e400a5c5 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 12:56:48 +0100
Subject: [PATCH 11/18] update

---
 requirements/pytorch/strategies.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 0cd5d3764ee5c..385c8d13ab903 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,4 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
-git+https://github.com/Lightning-AI/lightning-colossalai.git  # TODO: Update with pypi release once available
+lightning-colossalai>=0.0.0, <0.2.0

From 576ff8a1fa608ce349339fe1e6c3411c327b10c5 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 12:59:25 +0100
Subject: [PATCH 12/18] pin lightning-colossalai to 0.1.0dev

---
 requirements/pytorch/strategies.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 385c8d13ab903..4db2eb301121b 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,4 +2,4 @@
 #  in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
 
 deepspeed>=0.6.0, <0.8.0  # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
-lightning-colossalai>=0.0.0, <0.2.0
+lightning-colossalai==0.1.0dev

From 0922829474a5bd7e21c066a643326903fffee995 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 13:12:02 +0100
Subject: [PATCH 13/18] update links to moved document

---
 docs/source-pytorch/accelerators/gpu_advanced.rst | 2 +-
 docs/source-pytorch/advanced/training_tricks.rst  | 2 +-
 docs/source-pytorch/common/trainer.rst            | 2 +-
 docs/source-pytorch/common_usecases.rst           | 2 +-
 docs/source-pytorch/extensions/strategy.rst       | 4 ++--
 docs/source-pytorch/guides/speed.rst              | 2 +-
 docs/source-pytorch/index.rst                     | 4 ++--
 docs/source-pytorch/levels/advanced_level_22.rst  | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source-pytorch/accelerators/gpu_advanced.rst b/docs/source-pytorch/accelerators/gpu_advanced.rst
index d9cea9c5672e6..b3d7c4cc4a2f1 100644
--- a/docs/source-pytorch/accelerators/gpu_advanced.rst
+++ b/docs/source-pytorch/accelerators/gpu_advanced.rst
@@ -22,7 +22,7 @@ For experts pushing the state-of-the-art in model development, Lightning offers
    :header: Train 1 trillion+ parameter models
    :description:
    :col_css: col-md-4
-   :button_link: ../advanced/model_parallel.html
+   :button_link: ../advanced/model_parallel/model_parallel.html
    :height: 150
    :tag: advanced
 
diff --git a/docs/source-pytorch/advanced/training_tricks.rst b/docs/source-pytorch/advanced/training_tricks.rst
index 89b2135752c83..192a4735293d8 100644
--- a/docs/source-pytorch/advanced/training_tricks.rst
+++ b/docs/source-pytorch/advanced/training_tricks.rst
@@ -377,7 +377,7 @@ Advanced GPU Optimizations
 **************************
 
 When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling.
-Refer to :doc:`Advanced GPU Optimized Training <../advanced/model_parallel>` for more details.
+Refer to :doc:`Advanced GPU Optimized Training <../advanced/model_parallel/model_parallel>` for more details.
 
 ----------
 
diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst
index fd8b3af0cf982..255d2821436ac 100644
--- a/docs/source-pytorch/common/trainer.rst
+++ b/docs/source-pytorch/common/trainer.rst
@@ -1084,7 +1084,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc
 
 See Also:
     - :ref:`Multi GPU Training <multi_gpu>`.
-    - :doc:`Model Parallel GPU training guide <../advanced/model_parallel>`.
+    - :doc:`Model Parallel GPU training guide <../advanced/model_parallel/model_parallel>`.
     - :doc:`TPU training guide <../accelerators/tpu>`.
 
 
diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst
index 06165658704ea..4265f951638df 100644
--- a/docs/source-pytorch/common_usecases.rst
+++ b/docs/source-pytorch/common_usecases.rst
@@ -113,7 +113,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
    :header: Train 1 trillion+ parameter models
    :description: Scale GPU training to 1 trillion + parameter models
    :col_css: col-md-12
-   :button_link: advanced/model_parallel.html
+   :button_link: advanced/model_parallel/model_parallel.html
    :height: 100
 
 .. displayitem::
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index 0e96fabb5bb7f..19646f802d477 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -71,7 +71,7 @@ The below table lists all relevant strategies available in Lightning with their
      - Description
    * - fsdp
      - :class:`~pytorch_lightning.strategies.FSDPStrategy`
-     - Strategy for Fully Sharded Data Parallel training. :ref:`Learn more. <advanced/model_parallel:Fully Sharded Training>`
+     - Strategy for Fully Sharded Data Parallel training. :ref:`Learn more. <advanced/model_parallel/model_parallel:Fully Sharded Training>`
    * - ddp_spawn
      - :class:`~pytorch_lightning.strategies.DDPSpawnStrategy`
      - Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes. :ref:`Learn more. <accelerators/gpu_intermediate:Distributed Data Parallel Spawn>`
@@ -80,7 +80,7 @@ The below table lists all relevant strategies available in Lightning with their
      - Strategy for multi-process single-device training on one or multiple nodes. :ref:`Learn more. <accelerators/gpu_intermediate:Distributed Data Parallel>`
    * - deepspeed
      - :class:`~pytorch_lightning.strategies.DeepSpeedStrategy`
-     - Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :ref:`Learn more. <advanced/model_parallel:deepspeed>`
+     - Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :ref:`Learn more. <advanced/model_parallel/model_parallel:deepspeed>`
    * - hpu_parallel
      - :class:`~pytorch_lightning.strategies.HPUParallelStrategy`
      - Strategy for distributed training on multiple HPU devices. :doc:`Learn more. <../accelerators/hpu>`
diff --git a/docs/source-pytorch/guides/speed.rst b/docs/source-pytorch/guides/speed.rst
index b5120f394eabf..f7401a9e5bc45 100644
--- a/docs/source-pytorch/guides/speed.rst
+++ b/docs/source-pytorch/guides/speed.rst
@@ -47,7 +47,7 @@ GPU Training Speedup Tips
 -------------------------
 
 When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling.
-Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/model_parallel>`.
+Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/model_parallel/model_parallel>`.
 
 |
 
diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst
index 3130ee5ca4102..1268bf35ebc9b 100644
--- a/docs/source-pytorch/index.rst
+++ b/docs/source-pytorch/index.rst
@@ -200,7 +200,7 @@ Current Lightning Users
    clouds/cluster
    Save and load model progress <common/checkpointing>
    Save memory with half-precision <common/precision>
-   advanced/model_parallel
+   advanced/model_parallel/model_parallel
    Train on single or multiple GPUs <accelerators/gpu>
    Train on single or multiple HPUs <accelerators/hpu>
    Train on single or multiple IPUs <accelerators/ipu>
@@ -240,7 +240,7 @@ Current Lightning Users
    TPU <accelerators/tpu>
    Metrics <https://torchmetrics.readthedocs.io/en/stable/>
    Model <model/build_model.rst>
-   Model Parallel <advanced/model_parallel>
+   Model Parallel <advanced/model_parallel/model_parallel>
    Plugins <extensions/plugins>
    Progress bar <common/progress_bar>
    Production <deploy/production_advanced>
diff --git a/docs/source-pytorch/levels/advanced_level_22.rst b/docs/source-pytorch/levels/advanced_level_22.rst
index a90a482622cb9..e40d252bc3062 100644
--- a/docs/source-pytorch/levels/advanced_level_22.rst
+++ b/docs/source-pytorch/levels/advanced_level_22.rst
@@ -27,7 +27,7 @@ Scale to 1 trillion+ parameters with multiple distributed strategies.
    :header: Reach 1 trillion parameters on GPUs
    :description: Scale to 1 trillion params on GPUs with FSDP and Deepspeed.
    :col_css: col-md-6
-   :button_link: ../advanced/model_parallel.html
+   :button_link: ../advanced/model_parallel/model_parallel.html
    :height: 150
    :tag: advanced
 

From 2e6e4d25a7601511a1e9c7d7e0573ed9a5cc490e Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 15:10:23 +0100
Subject: [PATCH 14/18] fix docs build errors

---
 docs/source-pytorch/advanced/model_parallel/colossalai.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/advanced/model_parallel/colossalai.rst b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
index 81f9fde08011e..1d49346b79d4a 100644
--- a/docs/source-pytorch/advanced/model_parallel/colossalai.rst
+++ b/docs/source-pytorch/advanced/model_parallel/colossalai.rst
@@ -1,4 +1,4 @@
-.. _colossalai:
+:orphan:
 
 ###########
 Colossal-AI

From 1cdbc771d82ed35042c11daa4939d2cc0647dca5 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 15:18:42 +0100
Subject: [PATCH 15/18] move model_parallel.rst back to avoid index.rst change (codeowner review)

---
 docs/source-pytorch/accelerators/gpu_advanced.rst             | 2 +-
 .../advanced/{model_parallel => }/model_parallel.rst          | 2 +-
 .../advanced/{model_parallel => third_party}/colossalai.rst   | 0
 docs/source-pytorch/advanced/training_tricks.rst              | 2 +-
 docs/source-pytorch/common/trainer.rst                        | 2 +-
 docs/source-pytorch/common_usecases.rst                       | 2 +-
 docs/source-pytorch/extensions/strategy.rst                   | 4 ++--
 docs/source-pytorch/guides/speed.rst                          | 2 +-
 docs/source-pytorch/index.rst                                 | 4 ++--
 docs/source-pytorch/levels/advanced_level_22.rst              | 2 +-
 10 files changed, 11 insertions(+), 11 deletions(-)
 rename docs/source-pytorch/advanced/{model_parallel => }/model_parallel.rst (99%)
 rename docs/source-pytorch/advanced/{model_parallel => third_party}/colossalai.rst (100%)

diff --git a/docs/source-pytorch/accelerators/gpu_advanced.rst b/docs/source-pytorch/accelerators/gpu_advanced.rst
index b3d7c4cc4a2f1..d9cea9c5672e6 100644
--- a/docs/source-pytorch/accelerators/gpu_advanced.rst
+++ b/docs/source-pytorch/accelerators/gpu_advanced.rst
@@ -22,7 +22,7 @@ For experts pushing the state-of-the-art in model development, Lightning offers
    :header: Train 1 trillion+ parameter models
    :description:
    :col_css: col-md-4
-   :button_link: ../advanced/model_parallel/model_parallel.html
+   :button_link: ../advanced/model_parallel.html
    :height: 150
    :tag: advanced
 
diff --git a/docs/source-pytorch/advanced/model_parallel/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
similarity index 99%
rename from docs/source-pytorch/advanced/model_parallel/model_parallel.rst
rename to docs/source-pytorch/advanced/model_parallel.rst
index 14ff8c3e3c5ba..cf0db89682e89 100644
--- a/docs/source-pytorch/advanced/model_parallel/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -57,7 +57,7 @@ Cutting-edge and Experimental Strategies
 ========================================
 
 Cutting-edge Lightning strategies are being developed by third-parties outside of Lightning.
-If you want to be the first to try the latest and greatest experimental features for model-parallel training, check out the :doc:`Colossal-AI Strategy <./colossalai>` integration.
+If you want to be the first to try the latest and greatest experimental features for model-parallel training, check out the :doc:`Colossal-AI Strategy <./third_party/colossalai>` integration.
 
 
 ----
diff --git a/docs/source-pytorch/advanced/model_parallel/colossalai.rst b/docs/source-pytorch/advanced/third_party/colossalai.rst
similarity index 100%
rename from docs/source-pytorch/advanced/model_parallel/colossalai.rst
rename to docs/source-pytorch/advanced/third_party/colossalai.rst
diff --git a/docs/source-pytorch/advanced/training_tricks.rst b/docs/source-pytorch/advanced/training_tricks.rst
index 192a4735293d8..89b2135752c83 100644
--- a/docs/source-pytorch/advanced/training_tricks.rst
+++ b/docs/source-pytorch/advanced/training_tricks.rst
@@ -377,7 +377,7 @@ Advanced GPU Optimizations
 **************************
 
 When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling.
-Refer to :doc:`Advanced GPU Optimized Training <../advanced/model_parallel/model_parallel>` for more details.
+Refer to :doc:`Advanced GPU Optimized Training <../advanced/model_parallel>` for more details.
 
 ----------
 
diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst
index 255d2821436ac..fd8b3af0cf982 100644
--- a/docs/source-pytorch/common/trainer.rst
+++ b/docs/source-pytorch/common/trainer.rst
@@ -1084,7 +1084,7 @@ Supports passing different training strategies with aliases (ddp, ddp_spawn, etc
 
 See Also:
     - :ref:`Multi GPU Training <multi_gpu>`.
-    - :doc:`Model Parallel GPU training guide <../advanced/model_parallel/model_parallel>`.
+    - :doc:`Model Parallel GPU training guide <../advanced/model_parallel>`.
     - :doc:`TPU training guide <../accelerators/tpu>`.
 
 
diff --git a/docs/source-pytorch/common_usecases.rst b/docs/source-pytorch/common_usecases.rst
index 4265f951638df..06165658704ea 100644
--- a/docs/source-pytorch/common_usecases.rst
+++ b/docs/source-pytorch/common_usecases.rst
@@ -113,7 +113,7 @@ Customize and extend Lightning for things like custom hardware or distributed st
    :header: Train 1 trillion+ parameter models
    :description: Scale GPU training to 1 trillion + parameter models
    :col_css: col-md-12
-   :button_link: advanced/model_parallel/model_parallel.html
+   :button_link: advanced/model_parallel.html
    :height: 100
 
 .. displayitem::
diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index 19646f802d477..0e96fabb5bb7f 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -71,7 +71,7 @@ The below table lists all relevant strategies available in Lightning with their
      - Description
    * - fsdp
      - :class:`~pytorch_lightning.strategies.FSDPStrategy`
-     - Strategy for Fully Sharded Data Parallel training. :ref:`Learn more. <advanced/model_parallel/model_parallel:Fully Sharded Training>`
+     - Strategy for Fully Sharded Data Parallel training. :ref:`Learn more. <advanced/model_parallel:Fully Sharded Training>`
    * - ddp_spawn
      - :class:`~pytorch_lightning.strategies.DDPSpawnStrategy`
      - Spawns processes using the :func:`torch.multiprocessing.spawn` method and joins processes after training finishes. :ref:`Learn more. <accelerators/gpu_intermediate:Distributed Data Parallel Spawn>`
@@ -80,7 +80,7 @@ The below table lists all relevant strategies available in Lightning with their
      - Strategy for multi-process single-device training on one or multiple nodes. :ref:`Learn more. <accelerators/gpu_intermediate:Distributed Data Parallel>`
    * - deepspeed
      - :class:`~pytorch_lightning.strategies.DeepSpeedStrategy`
-     - Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :ref:`Learn more. <advanced/model_parallel/model_parallel:deepspeed>`
+     - Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. :ref:`Learn more. <advanced/model_parallel:deepspeed>`
    * - hpu_parallel
      - :class:`~pytorch_lightning.strategies.HPUParallelStrategy`
      - Strategy for distributed training on multiple HPU devices. :doc:`Learn more. <../accelerators/hpu>`
diff --git a/docs/source-pytorch/guides/speed.rst b/docs/source-pytorch/guides/speed.rst
index f7401a9e5bc45..b5120f394eabf 100644
--- a/docs/source-pytorch/guides/speed.rst
+++ b/docs/source-pytorch/guides/speed.rst
@@ -47,7 +47,7 @@ GPU Training Speedup Tips
 -------------------------
 
 When training on single or multiple GPU machines, Lightning offers a host of advanced optimizations to improve throughput, memory efficiency, and model scaling.
-Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/model_parallel/model_parallel>`.
+Refer to :doc:`Advanced GPU Optimized Training for more details <../advanced/model_parallel>`.
 
 |
 
diff --git a/docs/source-pytorch/index.rst b/docs/source-pytorch/index.rst
index 1268bf35ebc9b..3130ee5ca4102 100644
--- a/docs/source-pytorch/index.rst
+++ b/docs/source-pytorch/index.rst
@@ -200,7 +200,7 @@ Current Lightning Users
    clouds/cluster
    Save and load model progress <common/checkpointing>
    Save memory with half-precision <common/precision>
-   advanced/model_parallel/model_parallel
+   advanced/model_parallel
    Train on single or multiple GPUs <accelerators/gpu>
    Train on single or multiple HPUs <accelerators/hpu>
    Train on single or multiple IPUs <accelerators/ipu>
@@ -240,7 +240,7 @@ Current Lightning Users
    TPU <accelerators/tpu>
    Metrics <https://torchmetrics.readthedocs.io/en/stable/>
    Model <model/build_model.rst>
-   Model Parallel <advanced/model_parallel/model_parallel>
+   Model Parallel <advanced/model_parallel>
    Plugins <extensions/plugins>
    Progress bar <common/progress_bar>
    Production <deploy/production_advanced>
diff --git a/docs/source-pytorch/levels/advanced_level_22.rst b/docs/source-pytorch/levels/advanced_level_22.rst
index e40d252bc3062..a90a482622cb9 100644
--- a/docs/source-pytorch/levels/advanced_level_22.rst
+++ b/docs/source-pytorch/levels/advanced_level_22.rst
@@ -27,7 +27,7 @@ Scale to 1 trillion+ parameters with multiple distributed strategies.
    :header: Reach 1 trillion parameters on GPUs
    :description: Scale to 1 trillion params on GPUs with FSDP and Deepspeed.
    :col_css: col-md-6
-   :button_link: ../advanced/model_parallel/model_parallel.html
+   :button_link: ../advanced/model_parallel.html
    :height: 150
    :tag: advanced
 

From 631a8b8a414fd6639d7e9ad164eba7458259c199 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 15:19:39 +0100
Subject: [PATCH 16/18] clarify mixed precision

---
 docs/source-pytorch/advanced/third_party/colossalai.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/advanced/third_party/colossalai.rst b/docs/source-pytorch/advanced/third_party/colossalai.rst
index 1d49346b79d4a..5223bdc0ad60d 100644
--- a/docs/source-pytorch/advanced/third_party/colossalai.rst
+++ b/docs/source-pytorch/advanced/third_party/colossalai.rst
@@ -49,7 +49,7 @@ See a full example of a benchmark with the a `GPT-2 model <https://github.com/hp
 
     *   The only accelerator which ColossalAI supports is ``"gpu"``. But CPU resources will be used when the placement policy is set to "auto" or "cpu".
 
-    *   The only precision which ColossalAI allows is 16 (FP16).
+    *   The only precision which ColossalAI allows is 16-bit mixed precision (FP16).
 
     *   It only supports a single optimizer, which must be ``colossalai.nn.optimizer.CPUAdam`` or ``colossalai.nn.optimizer.
         HybridAdam`` now. You can set ``adamw_mode`` to False to use normal Adam. Noticing that ``HybridAdam`` is highly optimized, it uses fused CUDA kernel and parallel CPU kernel.

From 3d69e2bb3ca650ed6560847024fe299644de57d5 Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 15:38:04 +0100
Subject: [PATCH 17/18] fix undefined label error

---
 docs/source-pytorch/advanced/model_parallel.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst
index cf0db89682e89..6603eae0da6c9 100644
--- a/docs/source-pytorch/advanced/model_parallel.rst
+++ b/docs/source-pytorch/advanced/model_parallel.rst
@@ -37,7 +37,7 @@ This means we cannot sacrifice throughput as much as if we were fine-tuning, bec
 Overall:
 
 * When **fine-tuning** a model, use advanced memory efficient strategies such as :ref:`fully-sharded-training`, :ref:`deepspeed-zero-stage-3` or :ref:`deepspeed-zero-stage-3-offload`, allowing you to fine-tune larger models if you are limited on compute
-* When **pre-training** a model, use simpler optimizations such :ref:`sharded-training` or :ref:`deepspeed-zero-stage-2`, scaling the number of GPUs to reach larger parameter sizes
+* When **pre-training** a model, use simpler optimizations such as :ref:`deepspeed-zero-stage-2`, scaling the number of GPUs to reach larger parameter sizes
 * For both fine-tuning and pre-training, use :ref:`deepspeed-activation-checkpointing` as the throughput degradation is not significant
 
 For example when using 128 GPUs, you can **pre-train** large 10 to 20 Billion parameter models using :ref:`deepspeed-zero-stage-2` without having to take a performance hit with more advanced optimized multi-gpu strategy.

From a600f8fedef41eb15d51258ee6f2812094480b4a Mon Sep 17 00:00:00 2001
From: awaelchli <aedu.waelchli@gmail.com>
Date: Thu, 16 Feb 2023 16:46:09 +0100
Subject: [PATCH 18/18] fix list-table indentation in third-party strategies docs

---
 docs/source-pytorch/extensions/strategy.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst
index 0e96fabb5bb7f..034d508474745 100644
--- a/docs/source-pytorch/extensions/strategy.rst
+++ b/docs/source-pytorch/extensions/strategy.rst
@@ -110,12 +110,12 @@ There are powerful third-party strategies that integrate well with Lightning but
    :widths: 20 20 20
    :header-rows: 1
 
-    * - Name
-      - Package
-      - Description
-    * - colossalai
-      - `Lightning-AI/lightning-colossalai <https://github.com/Lightning-AI/lightning-colossalai>`_
-      - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. <https://www.colossalai.org/>`__
+   * - Name
+     - Package
+     - Description
+   * - colossalai
+     - `Lightning-AI/lightning-colossalai <https://github.com/Lightning-AI/lightning-colossalai>`_
+     - Colossal-AI provides a collection of parallel components for you. It aims to support you to write your distributed deep learning models just like how you write your model on your laptop. `Learn more. <https://www.colossalai.org/>`__
 
 
 ----