[Misc] Mooncake EP & Mooncake Backend #805
Merged · +2,532 −992 · 91 commits
Commits (all by UNIDY2002):

- `34d7deb` Initialize a mooncake backend
- `fa80700` Add pybind
- `a401f32` Fix incorrect backend registration
- `b1e5dfc` Fix wheel building of mooncake_ep
- `b535bbb` Add a fake allreduce implementation
- `779b447` Introduce transfer_engine to mooncake_backend
- `fb8918e` Add a basic CPU proxy execution framework
- `cf75d22` Implement a seemingly working allgather
- `1427db9` Remove mooncake_ep's dependency on etcd
- `fab5451` Implement `_allgather_base`
- `bfe83e0` Implement `allreduce`
- `74834c1` Implement `alltoall`
- `3f72b68` Use an even-odd pattern for data transfer
- `1c548b3` Add a `set_host_ip` method
- `e99eb08` Switch to an extended-API implementation of the Mooncake backend
- `7bc9438` Implement `broadcast`
- `635197f` Implement `barrier`
- `b1a5a37` Extend Mooncake backend to CPU
- `9a610cb` Support more operations for reduction
- `1d06727` Fix the backend-worker coordination logic
- `6169ff9` Optimize CPU worker with a callback pattern
- `10f2f8e` Add a timeout-based broken-ranks detection
- `bbcf85c` Merge EP module into Mooncake's build system
- `964e0a9` Share transfer buffer across all worker instances
- `d391ba9` Switch to a more robust approach to detect broken ranks
- `2f308f9` Specify CUDA device for test_mooncake_backend.py
- `3a0d872` Explicitly stop mooncake worker
- `f20ffb2` Use transfer engine's notifications to implement collective signals
- `896f668` Remove the unused `all_reduce_without` API
- `9f82a37` Switch to mooncake backend for test_mooncake_ep.py
- `2795455` Support both IB and RoCE
- `80db270` Fix EP unit test
- `95d62ef` Pass the auto-detected nic_id to EP Buffer
- `d9c961d` Fix CMake conditional branches when `PYTORCH_CMAKE_PATH` is not set
- `6a07455` Fix ibgda syncing for RoCE
- `7ed6664` Revert "Share transfer buffer across all worker instances"
- `d3b9d64` Implement `_reduce_scatter_base`
- `05b1cb2` Make CPU backends aware of broken ranks
- `06653a2` Fix .typos.toml
- `f56c078` Add a perf test for mooncake backend
- `78fd0d2` Support more dtypes for reduction
- `57c634c` Revert "Use transfer engine's notifications to implement collective signals"
- `ed9292d` Share worker thread among all process groups
- `5c17a19` Share transfer engine among all process groups
- `a20fa7d` Fix unit tests
- `438184f` Add a warmup phase for transfer engine
- `9df28c5` Fix transfer engine buffer locations
- `fa342f3` Fix incorrect calculation of mooncake ep buffer
- `3f51b5f` Do not use timeout detection in mooncake_ep tests
- `ec661f6` Update mooncake backend perf test
- `017d5d7` Demangle per-group buffer offset from the shared taskId
- `631aa1a` Stop allocating the useless `cuda_counter_buffer` and `cuda_data_buffer`
- `5d06444` Split the task list into a CPU region and a CUDA region
- `b271f04` Add a warmup for test_mooncake_backend_perf.py
- `6581ee4` Switch from raw cudaEvent to `torch::Event`
- `76b2a60` Fix MooncakeWorkCuda::wait() to make it compatible with cuda graphs
- `97e816f` Add doc
- `82cadfd` Fix perf test
- `d7a5ae6` Implement all-gather for perf test
- `7382f93` Move impl of `MooncakeEpBuffer`'s member functions to .cpp
- `22875b8` Change `gathered_experts` to `broken_nodes` to make the API more cons…
- `f238a5a` `broken_nodes` should be `broken_ranks`
- `3e6eb7e` API rename
- `a9bb0ef` Merge branch 'main' into sunxun/mooncake-backend-dev
- `6f21e23` Fix format
- `250f34b` Enable WITH_EP option in CI
- `e7d3ec4` Try installing torch in advance in CI
- `920d5dc` Set `TORCH_CUDA_ARCH_LIST` in CMakeLists.txt
- `657e22c` Install required dependencies in the CI CUDA environment
- `3b1ee99` [CI] Add the matching PyTorch
- `3d6f9fd` [CI] Add a workaround for missing `CUDA::nvToolsExt`
- `ed9a416` Merge remote-tracking branch 'origin/main' into sunxun/mooncake-backend-dev
- `5738d5f` Remove unused pybind base class declaration of `MooncakeBackendOptions`
- `5aaf4d4` Support `set_device_filter`
- `c355c10` Remove unused headers for ep_py.cpp
- `1bb73ca` Build the EP-wheel with setuptools on CI
- `eda4728` [CI] Add the build-with-ep process to release.yaml
- `7c638e5` Minor format fix
- `f30e29d` Update build guide
- `aa24136` Fix docs
- `e4aae45` Only build EP wheel with torch==2.8.0
- `74d4791` Add a torch version assertion for Mooncake Backend
- `b5929fc` Fix some python typing
- `1db7604` Use the correct group for EP's initial data sharing
- `e8582f0` API: invert `broken_ranks` and change into `active_ranks`
- `b0925fe` Followup fix for inverting the API
- `16c07c9` Fix format
- `f252056` Bug-fix in mooncake_ep_kernel.cu
- `fdf0cad` Mooncake EP has to be built with USE_CUDA on
- `a88e815` Fixed some issues according to the review
- `72beb68` Fix bug
CI workflow — new `build-with-ep` job:

```yaml
@@ -307,6 +307,100 @@ jobs:
          name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}
          path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl

  build-with-ep:
    runs-on: ubuntu-22.04
    strategy:
      matrix:
        python-version: ['3.10', '3.12']
        torch-version: ['2.7.1', '2.8.0']
    env:
      BUILD_WITH_EP: "1"
      SCCACHE_GHA_ENABLED: "true"

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Free up disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL

      - name: Install CUDA Toolkit
        uses: Jimver/[email protected]
        with:
          cuda: '12.8.1'
          linux-local-args: '["--toolkit"]'
          method: 'network'
          sub-packages: '["nvcc", "nvrtc-dev"]'
          non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'

      - name: Run sccache-cache
        uses: mozilla-actions/[email protected]

      - name: Configure sccache
        uses: actions/github-script@v7
        with:
          script: |
            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');

      - name: Run sccache stat for check
        shell: bash
        run: ${SCCACHE_PATH} --show-stats

      - name: Install dependencies
        run: |
          sudo apt update -y
          sudo bash -x dependencies.sh -y
          pip install toml-cli  # for updating the version
          pip install torch==${{ matrix.torch-version }}
        shell: bash

      - name: Build transfer engine with EP
        run: |
          mkdir build
          cd build
          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
          export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
          cmake .. -DUSE_ETCD=ON -DUSE_REDIS=ON -DUSE_HTTP=ON -DWITH_STORE=ON -DWITH_P2P_STORE=ON -DWITH_EP=ON -DWITH_METRICS=ON -DBUILD_UNIT_TESTS=ON -DBUILD_EXAMPLES=ON -DENABLE_SCCACHE=ON -DUSE_CUDA=OFF -DUSE_MNNVL=OFF -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/cuda/lib64/stubs"
          make -j
          sudo make install
        shell: bash

      - name: Build nvlink_allocator.so
        run: |
          mkdir -p build/mooncake-transfer-engine/nvlink-allocator
          cd mooncake-transfer-engine/nvlink-allocator
          bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
        shell: bash

      - name: Generate Python version tag
        id: generate_tag_flags
        run: |
          echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
        shell: bash

      - name: Build Python wheel
        run: |
          BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
          TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep_torch${TORCH_VERSION}"
          # Build wheel with specific Python version
          PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }} ./scripts/build_wheel.sh
        shell: bash

      - name: Upload Python wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: mooncake-wheel-ubuntu-py${{ steps.generate_tag_flags.outputs.python_version_tag }}+ep_torch${{ matrix.torch-version }}
          path: mooncake-wheel/dist-py${{ steps.generate_tag_flags.outputs.python_version_tag }}/*.whl

  build-docker:
    name: Build Docker Image
    runs-on: ubuntu-22.04
```
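The `Generate Python version tag` and `Build Python wheel` steps above reduce to two small string transformations; a minimal Python sketch (function names are mine, not from the repo, and the base version is a made-up example):

```python
def python_version_tag(python_version: str) -> str:
    """Mirror `echo $version | tr -d '.'`: '3.10' -> '310'."""
    return python_version.replace(".", "")


def ep_wheel_version(base_version: str, torch_version: str) -> str:
    """Append the EP local-version tag, first dropping any '+cuXXX'
    build suffix from torch.__version__ (the `split('+')[0]` step)."""
    return f"{base_version}+ep_torch{torch_version.split('+')[0]}"


print(python_version_tag("3.10"))                 # 310
print(ep_wheel_version("0.3.5", "2.8.0+cu128"))   # 0.3.5+ep_torch2.8.0
```

This is why the uploaded artifact names carry suffixes like `py310+ep_torch2.8.0`: the matrix builds one wheel per (Python, torch) pair.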
`release.yaml` — new `build-with-ep` job and updated `publish-release` dependencies:

```yaml
@@ -95,8 +95,106 @@ jobs:
          name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}
          path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl

  build-with-ep:
    runs-on: ubuntu-22.04
    permissions:
      contents: write
    strategy:
      matrix:
        python-version: ['3.10', '3.12']
        torch-version: ['2.7.1', '2.8.0']
    env:
      BUILD_WITH_EP: "1"
    steps:
      - name: Checkout source
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Free up disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL

      - name: Install CUDA Toolkit
        uses: Jimver/[email protected]
        with:
          cuda: '12.8.1'
          linux-local-args: '["--toolkit"]'
          method: 'network'
          sub-packages: '["nvcc", "nvrtc-dev"]'
          non-cuda-sub-packages: '["libcusparse-dev", "libcublas-dev", "libcusolver-dev"]'

      - name: Run sccache-cache
        uses: mozilla-actions/[email protected]

      - name: Configure sccache
        uses: actions/github-script@v7
        with:
          script: |
            core.exportVariable('ACTIONS_RESULTS_URL', process.env.ACTIONS_RESULTS_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');

      - name: Run sccache stat for check
        shell: bash
        run: ${SCCACHE_PATH} --show-stats

      - name: Configure project
        run: |
          sudo apt update -y
          sudo bash -x dependencies.sh -y
          pip install toml-cli  # for updating the version
          pip install torch==${{ matrix.torch-version }}
          mkdir build
          cd build
          cmake .. -DUSE_HTTP=ON -DUSE_ETCD=ON -DWITH_EP=ON -DSTORE_USE_ETCD=ON -DENABLE_SCCACHE=ON -DCMAKE_BUILD_TYPE=Release
        shell: bash

      - name: Build project
        run: |
          cd build
          make -j
          sudo make install
        shell: bash

      - name: Build nvlink_allocator.so
        run: |
          mkdir -p build/mooncake-transfer-engine/nvlink-allocator
          cd mooncake-transfer-engine/nvlink-allocator
          bash build.sh --ci-build ../../build/mooncake-transfer-engine/nvlink-allocator/
        shell: bash

      - name: Generate Python version tag
        id: generate_tag_release
        run: |
          echo "python_version_tag=$(echo ${{ matrix.python-version }} | tr -d '.')" >> $GITHUB_OUTPUT
        shell: bash

      - name: Build Python wheel
        run: |
          BASE_VERSION=$(toml get --toml-path mooncake-wheel/pyproject.toml project.version | tr -d '"')
          TORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          toml set --toml-path mooncake-wheel/pyproject.toml project.version "${BASE_VERSION}+ep_torch${TORCH_VERSION}"
          # Set LD_LIBRARY_PATH for wheel building
          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
          PYTHON_VERSION=${{ matrix.python-version }} OUTPUT_DIR=dist-py${{ steps.generate_tag_release.outputs.python_version_tag }} ./scripts/build_wheel.sh
        env:
          VERSION: ${{ env.VERSION }}

      - name: Upload Python wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: mooncake-wheel-py${{ steps.generate_tag_release.outputs.python_version_tag }}+ep_torch${{ matrix.torch-version }}
          path: mooncake-wheel/dist-py${{ steps.generate_tag_release.outputs.python_version_tag }}/*.whl

  publish-release:
    needs:
      - build
      - build-with-ep
    runs-on: ubuntu-22.04
    permissions:
      contents: write
```
`.typos.toml` (`@@ -1,8 +1,7 @@`, before/after flattened by the diff view):

```toml
[default]
extend-ignore-words = ["CANN"]

[files]
extend-exclude = ["mooncake-ep/csrc/*.h"]
extend-ignore-words = ["CANN", "ASO", "fre"]

[default.extend-words]
CANN = "CANN"
ASO = "ASO"
fre = "fre"
```
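The config above whitelists tokens such as `ASO` and `fre` that the `typos` checker would otherwise flag in the EP sources. Conceptually the ignore list works like this toy sketch (the `SUSPECT` table is invented for illustration; this is not how `typos` is implemented):

```python
# Hypothetical "known misspellings" table a spell checker might carry.
SUSPECT = {"teh": "the", "ASO": "also", "fre": "free", "CANN": "CAN"}
# extend-ignore-words from the config: never flag these tokens.
IGNORE = {"CANN", "ASO", "fre"}


def flag_typos(text: str) -> list[str]:
    """Report suspect tokens unless they are whitelisted."""
    return [w for w in text.split() if w in SUSPECT and w not in IGNORE]


print(flag_typos("teh CANN ASO fre"))  # ['teh']
```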
New documentation file (`@@ -0,0 +1,70 @@`):

# Mooncake EP & Mooncake Backend

## Overview

Mooncake EP is an adaptation of [DeepEP](https://github.com/deepseek-ai/DeepEP) that supports **fault tolerance** and fast data transfer with **IBGDA**, designed as a critical component for large-scale, latency-sensitive MoE (Mixture of Experts) inference. Mooncake EP aims to retain full compatibility with the DeepEP API, adding a `broken_ranks` tensor to both the `dispatch` and `combine` functions to capture information about rank failures. By integrating with the EPLB module, Mooncake EP ensures fault tolerance during MoE inference, enabling robust performance even in large-scale, fault-prone environments.

Mooncake Backend is a PyTorch distributed backend (a replacement for NCCL and Gloo) that provides **fault-tolerant collective communication primitives** and can be seamlessly integrated into machine learning systems. Built on the [Transfer Engine](transfer-engine.md), Mooncake Backend ensures that collective communications continue even in the event of rank failures. Furthermore, it reports these failures to the upper layers of the system, allowing graceful error handling without disrupting ongoing operations.

## Usage

### Mooncake EP

> **Note:** Mooncake EP currently supports only the low-latency transfer mode.

The API is largely consistent with DeepEP's, with only minor differences in a few parameters. Mooncake EP exposes a `Buffer` that can be imported from `mooncake.mooncake_ep_buffer`. For an example, refer to `mooncake-wheel/tests/test_mooncake_ep.py`.

#### Buffer.get_ep_buffer_size_hint()

**Signature:**

```python
@staticmethod
def get_ep_buffer_size_hint(num_max_dispatch_tokens_per_rank: int, hidden: int, num_ranks: int, num_experts: int) -> int
```

Calculates the number of bytes to pre-allocate for data transfer.

#### Buffer.\_\_init\_\_()

**Signature:**

```python
def __init__(self, group: dist.ProcessGroup, num_ep_buffer_bytes: int = 0)
```

The constructor. Ensure that only one instance is created.

- **group**: Must be a Mooncake Backend process group.
- **num_ep_buffer_bytes**: The number of bytes obtained from `Buffer.get_ep_buffer_size_hint()`.

#### Buffer.dispatch / Buffer.combine

**Signature:** Similar to DeepEP's `low_latency_dispatch`/`low_latency_combine`, with two additional parameters:

- **broken_ranks**: A tensor of shape `(num_ranks,)` containing values of 0 or 1; the entries at the indices of broken ranks are set to 1.
- **timeout_us**: The timeout in microseconds after which a rank is considered broken. Set to -1 for an infinite timeout.

### Mooncake Backend

Basic usage:

```python
import torch
import torch.distributed as dist
from mooncake import ep

broken_ranks = torch.zeros((world_size,), dtype=torch.int32, device="cuda")
dist.init_process_group(
    backend="mooncake",
    rank=rank,
    world_size=world_size,
    pg_options=ep.MooncakeBackendOptions(broken_ranks),
)

dist.all_gather(...)  # Standard API usage
assert not broken_ranks.any()  # Verify that no ranks are broken
```

For a full example, see `mooncake-wheel/tests/test_mooncake_backend.py`.
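The `broken_ranks`/`timeout_us` contract described above can be illustrated without GPUs. A pure-Python sketch of the timeout-based detection idea — all names here are hypothetical, not the actual kernel-side implementation:

```python
def detect_broken_ranks(last_signal_us: list[int], now_us: int,
                        timeout_us: int) -> list[int]:
    """Return a 0/1 vector shaped like the `broken_ranks` tensor: entry i
    is 1 when rank i has been silent for longer than `timeout_us`.
    A timeout of -1 means an infinite timeout (no rank is ever broken)."""
    if timeout_us == -1:
        return [0] * len(last_signal_us)
    return [1 if now_us - t > timeout_us else 0 for t in last_signal_us]


# Rank 2 last signalled 5 s ago; with a 1 s timeout it is marked broken.
now = 10_000_000
signals = [now, now - 100, now - 5_000_000, now - 200]
print(detect_broken_ranks(signals, now, 1_000_000))  # [0, 0, 1, 0]
print(detect_broken_ranks(signals, now, -1))         # [0, 0, 0, 0]
```

The caller (e.g. an EPLB layer) can then route traffic away from ranks whose entry is 1 instead of aborting the whole job.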
@@ -0,0 +1,42 @@ | ||
cmake_minimum_required(VERSION 3.16) | ||
project(mooncake-ep) | ||
|
||
# Find PyTorch's CMake prefix path | ||
execute_process( | ||
COMMAND ${PYTHON_EXECUTABLE} -c "import torch; print(torch.utils.cmake_prefix_path)" | ||
OUTPUT_VARIABLE PYTORCH_CMAKE_PATH | ||
OUTPUT_STRIP_TRAILING_WHITESPACE | ||
) | ||
if(NOT PYTORCH_CMAKE_PATH) | ||
message(WARNING "Could not find PyTorch CMake path! Please set Torch_DIR.") | ||
else () | ||
message(STATUS "Found PyTorch CMake path: ${PYTORCH_CMAKE_PATH}") | ||
list(APPEND CMAKE_PREFIX_PATH "${PYTORCH_CMAKE_PATH}/Torch") | ||
endif() | ||
|
||
set(TORCH_CUDA_ARCH_LIST "8.0;9.0") | ||
|
||
find_package(CUDAToolkit REQUIRED) | ||
# https://discuss.pytorch.org/t/failed-to-find-nvtoolsext/179635/13 | ||
if(NOT TARGET CUDA::nvToolsExt AND TARGET CUDA::nvtx3) | ||
add_library(CUDA::nvToolsExt INTERFACE IMPORTED) | ||
target_compile_definitions( | ||
CUDA::nvToolsExt INTERFACE | ||
TORCH_CUDA_USE_NVTX3 | ||
) | ||
target_link_libraries(CUDA::nvToolsExt INTERFACE CUDA::nvtx3) | ||
endif() | ||
find_package(Torch REQUIRED) | ||
include_directories(${TORCH_INCLUDE_DIRS}) | ||
|
||
include_directories(include) | ||
add_subdirectory(include) | ||
add_subdirectory(src) | ||
|
||
if (BUILD_UNIT_TESTS) | ||
add_subdirectory(tests) | ||
endif() | ||
|
||
if (BUILD_EXAMPLES) | ||
add_subdirectory(example) | ||
endif() |
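The `execute_process` probe above degrades gracefully when torch is not importable: CMake captures empty output and takes the `if(NOT PYTORCH_CMAKE_PATH)` warning branch. The same pattern in Python (a sketch of the probe mechanics, not project code):

```python
import subprocess
import sys


def probe_cmake_path(code: str, python: str = sys.executable) -> str:
    """Mimic CMake's execute_process: run a one-liner in the given
    interpreter, capture stdout, strip trailing whitespace, and return
    '' when the probe fails (e.g. the module is missing) -- which is
    what lands CMake in the `if(NOT PYTORCH_CMAKE_PATH)` branch."""
    proc = subprocess.run([python, "-c", code], capture_output=True, text=True)
    return proc.stdout.strip() if proc.returncode == 0 else ""


# The actual probe used by the CMakeLists above:
TORCH_PROBE = "import torch; print(torch.utils.cmake_prefix_path)"
# A probe for a module that does not exist exercises the fallback branch:
print(repr(probe_cmake_path("import definitely_not_a_module")))  # ''
```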
Review discussion:

> @xiaguan do you have time to check on this? Do you know if this is supported on our CI machine?

> https://github.com/kvcache-ai/Mooncake/actions/runs/17720954259/job/50353039158?pr=805
> It compiles successfully in CI, but I'm not sure if the .whl package will actually work for users.

> I think the users would usually have the full toolkit installed. I tested the .whl in the SGLang docker environment, and it could work :)