
Commit 2b0bd92

dmahan93 authored and bclyang committed
Add context parallelism support
1 parent a6d6af0 commit 2b0bd92

15 files changed: +406 additions, -69 deletions


megatron/data/data_utils.py

Lines changed: 12 additions & 2 deletions
@@ -531,8 +531,12 @@ def build_train_valid_test_data_loaders(neox_args):
     else:
         pipe_load = True

-    # Data loader only on rank 0 of each model parallel group.
-    if mpu.get_model_parallel_rank() == 0 and pipe_load:
+    # Data loader only on rank 0 of each model and context parallel group.
+    if (
+        mpu.get_model_parallel_rank() == 0
+        and pipe_load
+        and mpu.get_context_parallel_rank() == 0
+    ):
         # Number of train/valid/test samples.
         if neox_args.train_iters is not None:
             train_iters = neox_args.train_iters
@@ -671,11 +675,17 @@ def build_train_valid_test_data_loaders(neox_args):
         # broadcast globally instead of just the model parallel group.
         torch.distributed.broadcast(flags, src=0)
     else:
+        # The same data should be used for the model parallel and context parallel groups
         torch.distributed.broadcast(
            flags,
            mpu.get_model_parallel_src_rank(),
            group=mpu.get_model_parallel_group(),
        )
+        torch.distributed.broadcast(
+            flags,
+            mpu.get_context_parallel_src_rank(),
+            group=mpu.get_context_parallel_group(),
+        )
     neox_args.do_train = flags[0].item()
     neox_args.do_valid = flags[1].item()
     neox_args.do_test = flags[2].item()
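
For orientation, a minimal single-process sketch of the flag distribution above: only the source rank of each model x context parallel group builds the data loaders, and the do_train/do_valid/do_test flags then reach every rank via one broadcast over the model parallel group followed by one over the context parallel group. The 2x2 grid and flag values below are illustrative, not taken from the commit.

    import itertools

    MP, CP = 2, 2          # illustrative model / context parallel sizes
    flags = {}             # (mp_rank, cp_rank) -> (do_train, do_valid, do_test)

    # Only the rank with mp_rank == 0 and cp_rank == 0 builds the loaders and
    # therefore knows the real flags.
    flags[(0, 0)] = (1, 1, 0)

    # Broadcast over the model parallel group (cp_rank fixed at 0) ...
    for mp in range(MP):
        flags[(mp, 0)] = flags[(0, 0)]

    # ... then over each context parallel group (mp_rank fixed).
    for mp, cp in itertools.product(range(MP), range(CP)):
        flags[(mp, cp)] = flags[(mp, 0)]

    assert all(v == (1, 1, 0) for v in flags.values())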

megatron/initialize.py

Lines changed: 11 additions & 5 deletions
@@ -158,16 +158,20 @@ def _initialize_distributed(neox_args):
     # Setup 3D topology.
     pp = neox_args.pipe_parallel_size if neox_args.pipe_parallel_size >= 1 else 1
     mp = neox_args.model_parallel_size if neox_args.model_parallel_size >= 1 else 1
+    cp = neox_args.context_parallel_size if neox_args.context_parallel_size >= 1 else 1
+    assert (
+        neox_args.world_size % (pp * mp * cp) == 0
+    ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}, cp={cp}"
     assert (
         neox_args.world_size % (pp * mp) == 0
     ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}"
+    # The data parallel ranks will be used for context parallel
+    # to piggyback the gradient all-reduce
     dp = neox_args.world_size // (pp * mp)
+    assert dp % cp == 0
+    from deepspeed.runtime.pipe.topology import ProcessTopology

-    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
-
-    # this does pipe on the most outside, then data, then model.
-    # PipeModelDataParallelTopology is just a wrapper over ProcessTopology that predefines this order.
-    topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)
+    topo = ProcessTopology(axes=["pipe", "data", "model"], dims=[pp, dp, mp])

     # Offset base seeds for the interior pipeline stages.
     # TODO: adjust last stage too once IO is improved.
@@ -186,6 +190,8 @@ def _initialize_distributed(neox_args):
     else:
         mpu.initialize_model_parallel(
             neox_args.model_parallel_size,
+            neox_args.pipe_parallel_size,
+            neox_args.context_parallel_size,
             topology=topo,
             fp32_allreduce=neox_args.fp32_allreduce,
         )
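
A small arithmetic sketch of the constraints asserted above; the sizes are made up. Context parallel ranks are carved out of the "data" axis of the ProcessTopology, so the number of truly independent data replicas that remain is dp // cp (an inference from the dp % cp == 0 assert and the piggyback comment, not stated explicitly in the diff).

    world_size = 32
    pp, mp, cp = 2, 2, 2               # illustrative degrees

    assert world_size % (pp * mp * cp) == 0
    dp = world_size // (pp * mp)       # size of the "data" axis in ProcessTopology -> 8
    assert dp % cp == 0                # context parallel groups fit inside the data axis
    effective_dp = dp // cp            # independent data replicas once cp ranks share one sequence -> 4

    print(f"pipe={pp} model={mp} data_axis={dp} context={cp} effective_data_parallel={effective_dp}")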

megatron/model/fused_layer_norm.py

Lines changed: 4 additions & 4 deletions
@@ -37,7 +37,7 @@ def __init__(
         normalized_shape,
         eps=1e-5,
         no_persist_layer_norm=True,
-        sequence_parallel=False,
+        context_parallel=False,
         apply_layernorm_1p=False,
         mem_efficient_ln=True,
     ):
@@ -92,11 +92,11 @@ def __init__(
             self.bias = Parameter(torch.Tensor(*normalized_shape))
         self.reset_parameters()
         self.no_persist_layer_norm = no_persist_layer_norm
-        self.sequence_parallel = sequence_parallel
+        self.context_parallel = context_parallel

         # set sequence parallelism flag on weight and bias parameters
-        setattr(self.weight, "sequence_parallel", self.sequence_parallel)
-        setattr(self.bias, "sequence_parallel", self.sequence_parallel)
+        setattr(self.weight, "context_parallel", self.context_parallel)
+        setattr(self.bias, "context_parallel", self.context_parallel)

     def reset_parameters(self):
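
The rename above tags the layer norm weight and bias with a context_parallel attribute instead of sequence_parallel. The diff does not show how that attribute is consumed downstream; the snippet below is only a hypothetical illustration of the tagging pattern, i.e. picking flagged parameters back out later, say for an extra gradient reduction.

    import torch

    weight = torch.nn.Parameter(torch.ones(1024))
    bias = torch.nn.Parameter(torch.zeros(1024))
    setattr(weight, "context_parallel", True)   # same pattern as the setattr calls above
    setattr(bias, "context_parallel", True)

    params = [weight, bias, torch.nn.Parameter(torch.zeros(1024))]
    cp_params = [p for p in params if getattr(p, "context_parallel", False)]
    print(len(cp_params))  # 2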

megatron/model/gpt2_model.py

Lines changed: 24 additions & 1 deletion
@@ -74,7 +74,30 @@ def cross_entropy(output, labels, _fp16=False):
     else:
         losses = mpu.vocab_parallel_cross_entropy(output.float().contiguous(), labels)
     loss_mask = loss_mask.view(-1)
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
+    loss_mask_sum = loss_mask.sum()
+    if mpu.get_context_parallel_world_size() > 1:
+        dt = loss_mask_sum.dtype
+        if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
+            loss_mask_sum = loss_mask_sum.float()
+        torch.distributed.all_reduce(
+            loss_mask_sum,
+            op=torch.distributed.ReduceOp.SUM,
+            group=mpu.get_context_parallel_group(),
+        )
+        if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
+            loss_mask_sum = loss_mask_sum.bfloat16()
+        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask_sum
+        if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
+            loss = loss.float()
+        torch.distributed.all_reduce(
+            loss,
+            op=torch.distributed.ReduceOp.SUM,
+            group=mpu.get_context_parallel_group(),
+        )
+        if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
+            loss = loss.bfloat16()
+    else:
+        loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask_sum
     return loss

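
The math behind the two all-reduces, checked in a single process with the collective replaced by a plain sum: when the sequence is sharded across context parallel ranks, both the masked-loss numerator and the unmasked-token count must be summed over the group before dividing, otherwise ranks holding different numbers of unmasked tokens skew the mean. The tensors below are toy values.

    import torch

    losses = torch.tensor([1.0, 2.0, 3.0, 4.0])
    mask = torch.tensor([1.0, 1.0, 1.0, 0.0])
    full = (losses * mask).sum() / mask.sum()          # ground truth mean over the whole sequence

    # Split the sequence across 2 "context parallel ranks" and reduce both terms.
    shard_losses = losses.chunk(2)
    shard_masks = mask.chunk(2)
    numer = sum((l * m).sum() for l, m in zip(shard_losses, shard_masks))   # plays the role of all_reduce(loss)
    denom = sum(m.sum() for m in shard_masks)                               # plays the role of all_reduce(loss_mask_sum)

    assert torch.isclose(numer / denom, full)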

megatron/model/positional_embeddings.py

Lines changed: 22 additions & 1 deletion
@@ -14,6 +14,7 @@

 import torch
 import math
+import megatron.mpu as mpu


 class SinusoidalPositionalEmbedding(torch.nn.Module):
@@ -37,7 +38,13 @@ def forward(self, x, seq_dim=1):

 class RotaryEmbedding(torch.nn.Module):
     def __init__(
-        self, dim, max_seq_len, base=10000, precision=torch.half, save_inv_freqs=False
+        self,
+        dim,
+        max_seq_len,
+        base=10000,
+        precision=torch.half,
+        save_inv_freqs=False,
+        zigzag=True,
     ):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
@@ -49,6 +56,7 @@ def __init__(
         self.max_seq_len = max_seq_len
         self.base = base
         self.dim = dim
+        self.zigzag = zigzag  # seq parallel zigzag

         # precompute cos_cached, sin_cached in fp32
         cos_cached, sin_cached, inv_freq = self._prepare_cache(
@@ -64,6 +72,19 @@ def _prepare_cache(self, seq_len, precision, base):
         inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float() / self.dim))

         t = torch.arange(seq_len).type_as(inv_freq)
+        if mpu.get_context_parallel_world_size() > 1:
+            if not self.zigzag:
+                t_chunks = torch.chunk(t, mpu.get_context_parallel_world_size())
+                t = t_chunks[mpu.get_context_parallel_rank()].contiguous()
+            else:
+                t_chunks = torch.chunk(t, 2 * mpu.get_context_parallel_world_size())
+                t = torch.cat(
+                    (
+                        t_chunks[mpu.get_context_parallel_rank()],
+                        t_chunks[-(mpu.get_context_parallel_rank() + 1)],
+                    ),
+                    dim=0,
+                ).contiguous()
         freqs = torch.einsum("i,j->ij", t, inv_freq)
         emb = torch.cat((freqs, freqs), dim=-1)
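
A toy print-out of the zigzag split above, with the context parallel ranks simulated in a loop: each rank keeps one chunk from the front of the position index and the mirrored chunk from the back, matching the zigzag block layout that zigzag ring attention operates on. cp_world_size = 2 and seq_len = 8 are illustrative.

    import torch

    cp_world_size = 2
    t = torch.arange(8).float()                   # global position ids 0..7
    t_chunks = torch.chunk(t, 2 * cp_world_size)  # 4 chunks of 2 positions each

    for cp_rank in range(cp_world_size):
        t_local = torch.cat((t_chunks[cp_rank], t_chunks[-(cp_rank + 1)]), dim=0)
        print(cp_rank, t_local.tolist())
    # 0 [0.0, 1.0, 6.0, 7.0]
    # 1 [2.0, 3.0, 4.0, 5.0]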

megatron/model/transformer.py

Lines changed: 101 additions & 2 deletions
@@ -452,6 +452,7 @@ def __init__(
         self.rope_fusion = neox_args.rope_fusion
         self.attention_type = neox_args.attention_config[layer_number]
         self.use_flash_attention = self.attention_type == "flash"
+        self.use_ring_attention = self.attention_type == "ring"
         self.use_triton = (
             self.use_flash_attention
             and self.pos_emb == "alibi"
@@ -460,7 +461,7 @@
                 >= packaging.version.Version("2.4.0.post1")
             )
         )
-        self.sparse = self.attention_type not in ("global", "flash")
+        self.sparse = self.attention_type not in ("global", "flash", "ring")

         if self.gqa:
             assert not self.sparse
@@ -489,6 +490,12 @@
             self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton
             self.flash_qkv_fn = flash_attn_func
             self.flash_varlen_qkv_fn = flash_attn_varlen_func
+        elif self.use_ring_attention:
+            from ring_flash_attn.zigzag_ring_flash_attn import (
+                zigzag_ring_flash_attn_func,
+            )
+
+            self.ring_attn_fn = zigzag_ring_flash_attn_func
         else:
             self.scale_mask_softmax = FusedScaleMaskSoftmax(
                 input_in_fp16=self.fp16,
@@ -736,6 +743,96 @@ def flash_attention(self, query_layer, key_layer, value_layer):

         return matmul_result

+    def ring_attention(self, query_layer, key_layer, value_layer):
+        # [b, np, sq, sk]
+        output_size = (
+            query_layer.size(1),
+            query_layer.size(2),
+            query_layer.size(0),
+            key_layer.size(0),
+        )
+
+        # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn]
+        key_layer = key_layer.transpose(0, 1).reshape(
+            output_size[0], output_size[3], self.num_kv_heads_per_partition, -1
+        )
+        value_layer = value_layer.transpose(0, 1).reshape(
+            output_size[0], output_size[3], self.num_kv_heads_per_partition, -1
+        )
+
+        # [sq, b, np, hn] -> [b, sq, np, hn]
+        query_layer = query_layer.transpose(0, 1).reshape(
+            output_size[0], output_size[2], output_size[1], -1
+        )
+
+        # only pass in window_size or alibi_slopes kwarg
+        # if we use Sliding Window Attention / AliBi.
+        # Flash attn defaults to (-1,-1), or
+        # does not have this kwarg prior to v2.3.0
+        extra_kwargs = (
+            {"window_size": (self.sliding_window_width, -1)}
+            if self.sliding_window_width is not None
+            else {}
+        )
+        if self.pos_emb == "alibi":
+            extra_kwargs["alibi_slopes"] = self.alibi_embed.slopes.to(
+                query_layer.device
+            ).to(torch.float32)
+
+        if not self.training:
+            batch_size = output_size[0]
+            max_seqlen_q = output_size[2]
+            max_seqlen_k = output_size[3]
+
+            cu_seqlens_q = torch.arange(
+                0,
+                (batch_size + 1) * max_seqlen_q,
+                step=max_seqlen_q,
+                dtype=torch.int32,
+                device=query_layer.device,
+            )
+
+            cu_seqlens_k = torch.arange(
+                0,
+                (batch_size + 1) * max_seqlen_k,
+                step=max_seqlen_k,
+                dtype=torch.int32,
+                device=key_layer.device,
+            )
+
+            q_shape = query_layer.shape
+            k_shape = key_layer.shape
+            v_shape = value_layer.shape
+            is_causal = max_seqlen_q == max_seqlen_k
+            output = self.ring_attn_fn(
+                query_layer,
+                key_layer,
+                value_layer,
+                0.0,
+                softmax_scale=None,
+                causal=is_causal,
+                group=mpu.get_context_parallel_group(),
+                **extra_kwargs,
+            )
+            output = output.reshape(q_shape)
+        else:
+            output = self.ring_attn_fn(
+                query_layer,
+                key_layer,
+                value_layer,
+                self.dropout_p if self.training else 0.0,
+                softmax_scale=None,
+                causal=True,
+                group=mpu.get_context_parallel_group(),
+                **extra_kwargs,
+            )
+
+        matmul_result = output
+        # [b, sq, np, hn] -> [b, np, sq, hn]
+        matmul_result = matmul_result.transpose(1, 2)
+
+        return matmul_result
+
     def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask):
         # TODO: sparse attn dropout?
         # TODO: pad to block size
@@ -831,7 +928,7 @@ def gqa_project(self, hidden_states, attention_mask, layer_past=None):
             value_layer = value_layer.view(*new_kv_shape)

         # if not using Flash attention, we repeat K/V heads to match Q head counts
-        if not self.use_flash_attention:
+        if not (self.use_flash_attention or self.use_ring_attention):
             key_layer = torch.repeat_interleave(
                 key_layer,
                 repeats=int(
@@ -945,6 +1042,8 @@ def forward(self, hidden_states, attention_mask, layer_past=None):

         if self.use_flash_attention:
             context_layer = self.flash_attention(query_layer, key_layer, value_layer)
+        elif self.use_ring_attention:
+            context_layer = self.ring_attention(query_layer, key_layer, value_layer)
         elif not self.sparse:
             context_layer = self.attention(
                 query_layer, key_layer, value_layer, layer_past, attention_mask
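
Shape bookkeeping only, with no distributed setup and no ring_flash_attn import: the reshapes in ring_attention() move Megatron's [sq, b, np, hn] layout into the [b, sq, np, hn] layout the flash/ring kernels take, and the final transpose hands back [b, np, sq, hn]. The sizes below are arbitrary and the kernel call is replaced by an identity stand-in.

    import torch

    sq, b, np_heads, hn = 16, 2, 4, 8               # local (per context parallel rank) shapes
    query_layer = torch.randn(sq, b, np_heads, hn)  # [sq, b, np, hn]

    q = query_layer.transpose(0, 1).reshape(b, sq, np_heads, -1)  # [b, sq, np, hn]
    out = q                                                        # stand-in for self.ring_attn_fn(...)
    out = out.transpose(1, 2)                                      # [b, np, sq, hn]
    assert out.shape == (b, np_heads, sq, hn)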

megatron/model/utils.py

Lines changed: 2 additions & 2 deletions
@@ -373,14 +373,14 @@ def reduce_weight_grads_from_model_parallel_region(input_):

     # Bf16 convert
     dt = input_.dtype
-    if dt == torch.bfloat16 and mpu.get_fp32_allreduce():
+    if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
         input_ = input_.float()

     # All-reduce.
     dist.all_reduce(input_, group=mpu.get_model_parallel_group())

     # Bf16 convert
-    if dt == torch.bfloat16 and mpu.get_fp32_allreduce():
+    if dt == torch.bfloat16 and mpu.initialize.get_fp32_allreduce():
         input_ = input_.bfloat16()

     return input_
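
The change above only reaches get_fp32_allreduce through mpu.initialize; the surrounding upcast / all-reduce / downcast pattern is still worth a sketch. Below, the collective is replaced by a local stack-and-sum so it runs in one process; the helper name and values are made up.

    import torch

    def reduce_like(grads, fp32_allreduce=True):
        # bf16 gradients are summed in fp32 when fp32_allreduce is on, then cast back
        dt = grads[0].dtype
        work = [g.float() if (dt == torch.bfloat16 and fp32_allreduce) else g for g in grads]
        reduced = torch.stack(work).sum(dim=0)    # stands in for dist.all_reduce(...)
        if dt == torch.bfloat16 and fp32_allreduce:
            reduced = reduced.bfloat16()
        return reduced

    grads = [torch.full((4,), 0.1, dtype=torch.bfloat16) for _ in range(8)]
    print(reduce_like(grads))                      # summed in fp32, returned as bf16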

megatron/mpu/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -57,3 +57,10 @@

 from .utils import divide
 from .utils import split_tensor_along_last_dim
+from .data import zigzag_data
+from .initialize import (
+    get_context_parallel_group,
+    get_context_parallel_rank,
+    get_context_parallel_world_size,
+    get_context_parallel_src_rank,
+)
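
mpu now re-exports the context parallel accessors alongside zigzag_data. The body of mpu.data.zigzag_data is not part of this diff; the helper below is a hypothetical stand-in (name and signature are assumptions) that shows the zigzag sharding idea for a batch of token ids, mirroring the RotaryEmbedding chunking above.

    import torch

    def zigzag_shard(tokens: torch.Tensor, cp_rank: int, cp_world_size: int) -> torch.Tensor:
        # split the sequence dim into 2 * cp chunks and keep chunk r plus chunk -(r + 1)
        chunks = torch.chunk(tokens, 2 * cp_world_size, dim=1)
        return torch.cat((chunks[cp_rank], chunks[-(cp_rank + 1)]), dim=1).contiguous()

    batch = torch.arange(16).reshape(1, 16)                   # [batch, seq]
    print(zigzag_shard(batch, cp_rank=0, cp_world_size=2))    # columns 0-3 and 12-15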
