3 changes: 3 additions & 0 deletions Dockerfile
@@ -88,10 +88,13 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
 #### Python packages
 RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
 COPY requirements/requirements.txt .
+COPY requirements/requirements-wandb.txt .
 COPY requirements/requirements-onebitadam.txt .
 COPY requirements/requirements-sparseattention.txt .
 RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
     pip install -r requirements-sparseattention.txt && \
     pip install -r requirements-flashattention.txt && \
+    pip install -r requirements-wandb.txt && \
     pip install protobuf==3.20.* && \
     pip cache purge

29 changes: 16 additions & 13 deletions deepy.py
@@ -16,23 +16,26 @@
 import logging
 import os
 
-import deepspeed
-from deepspeed.launcher.runner import main
+import deepspeed.launcher.runner
 
-logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
 
-from megatron.neox_arguments import NeoXArgs
-from megatron.utils import get_wandb_api_key
+def main():
+    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
 
-neox_args = NeoXArgs.consume_deepy_args()
-deepspeed_main_args = neox_args.get_deepspeed_main_args()
+    from megatron.neox_arguments import NeoXArgs
+    from megatron.utils import get_wandb_api_key
 
-# Extract wandb API key and inject into worker environments
-wandb_token = get_wandb_api_key(neox_args=neox_args)
-if wandb_token is not None:
-    deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
-    os.environ["WANDB_API_KEY"] = wandb_token
+    neox_args = NeoXArgs.consume_deepy_args()
+    deepspeed_main_args = neox_args.get_deepspeed_main_args()
+
+    # Extract wandb API key and inject into worker environments
+    wandb_token = get_wandb_api_key(neox_args=neox_args)
+    if wandb_token is not None:
+        deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
+        os.environ["WANDB_API_KEY"] = wandb_token
+
+    deepspeed.launcher.runner.main(deepspeed_main_args)
 
 
 if __name__ == "__main__":
-    main(deepspeed_main_args)
+    main()
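The rewritten launcher relies on a DeepSpeed detail worth spelling out: appending a variable name to `deepspeed.launcher.runner.EXPORT_ENVS` tells the runner to export that variable into the environment of every worker it spawns. A minimal sketch of the pattern, assuming `EXPORT_ENVS` acts as a simple allow-list and using a hypothetical `MY_SECRET` variable:

```python
import os

import deepspeed.launcher.runner

# Hypothetical variable for illustration: set it in the launcher process,
# then ask the runner to forward it to the workers it starts.
os.environ["MY_SECRET"] = "value-to-propagate"
deepspeed.launcher.runner.EXPORT_ENVS.append("MY_SECRET")

# A subsequent deepspeed.launcher.runner.main(args) call would launch
# workers that see MY_SECRET in their environment.
```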
7 changes: 6 additions & 1 deletion megatron/logging.py
@@ -14,7 +14,12 @@
 import sys
 import torch
-import wandb
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass
+
 from megatron import mpu, print_rank_0
 from megatron.utils import report_memory
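The guarded import above leaves the name `wandb` unbound when the package is absent, so every later use in the module must be conditional. A short sketch of the call-site discipline this implies; `log_metric` is a hypothetical helper, not part of the diff:

```python
import sys

try:
    import wandb
except ModuleNotFoundError:
    pass  # wandb stays unbound; all uses below must be guarded


def log_metric(name, value, step):
    # Only touch wandb when the import above actually succeeded.
    if "wandb" in sys.modules:
        wandb.log({name: value}, step=step)
```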
30 changes: 17 additions & 13 deletions megatron/neox_arguments/arguments.py
@@ -18,7 +18,6 @@
 import yaml
 import json
 import logging
-import shortuuid
 import copy
 import torch
 import argparse
@@ -278,13 +277,13 @@ def consume_deepy_args(cls):
"--wandb_group",
type=str,
default=None,
help='Weights and Biases group name - used to group together "runs".',
help='Weights & Biases group name - used to group together "runs".',
)
group.add_argument(
"--wandb_team",
type=str,
default=None,
help="Team name for Weights and Biases.",
help="Weights & Biases team name.",
)

group = parser.add_argument_group(title="Eval args")
@@ -372,11 +371,22 @@ def consume_deepy_args(cls):
             paths_to_yml_files=conf_files, overwrite_values=overwrite_values
         )
 
-        if neox_args.wandb_group is not None:
-            # concat the wandb group name with a uid to make sure it's unique
-            import wandb
-
-            neox_args.wandb_group += "_" + wandb.util.generate_id()
+        if neox_args.use_wandb:
+            try:
+                import wandb
+
+                # Check if the W&B group name is configured
+                if neox_args.wandb_group is None:
+                    # Set a randomized string as group name if no group name is provided
+                    neox_args.wandb_group = wandb.sdk.lib.runid.generate_id()
+                else:
+                    # Concatenate the W&B group name with a randomized string to ensure uniqueness
+                    neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id()
+            except ModuleNotFoundError as e:
+                if e.name == "wandb":
+                    e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to false to disable Weights & Biases logging."
+                raise e
 
         neox_args.print()
 
         return neox_args
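The group-naming rule introduced here is simple to state on its own: keep the configured group name if one is set, and always append a short random run id so repeated launches never collide. A standalone sketch, assuming `wandb` is installed; `exp-7` is a made-up group name:

```python
import wandb


def unique_group(name=None):
    # wandb.sdk.lib.runid.generate_id() returns a short random id, e.g. "1x2y3z4a".
    run_id = wandb.sdk.lib.runid.generate_id()
    return run_id if name is None else f"{name}_{run_id}"


print(unique_group())         # random id only, when no group is configured
print(unique_group("exp-7"))  # "exp-7_<random id>"
```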
@@ -736,12 +746,6 @@ def calculate_derived(self):
         Derives additional configuration values necessary for training from the current config
         """
 
-        # wandb
-        # sets a unique wandb group
-        if self.wandb_group is None:
-            # if none is defined a uuid is set for the run
-            self.wandb_group = shortuuid.uuid()
-
         # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)
11 changes: 7 additions & 4 deletions megatron/utils.py
@@ -24,16 +24,19 @@
 from typing import Dict, List
 
 import requests
-import wandb
-from wandb import UsageError
-
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass
+
 import torch
 
 from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion
 
 from megatron import print_rank_0
 from megatron import mpu
 from deepspeed import PipelineEngine, DeepSpeedEngine
 
 from collections import deque

@@ -167,7 +170,7 @@ def init_wandb(neox_args):
             force=False,
             entity=neox_args.wandb_team,
         )
-    except UsageError as e:
+    except wandb.UsageError as e:
         neox_args.update_value("use_wandb", False)
         print(e)
         print(
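Catching `wandb.UsageError` instead of the previously imported bare `UsageError` keeps this module importable without `wandb` while preserving the graceful fallback: on a configuration error (for example, a missing API key) logging is disabled and training continues. A hedged sketch of that fallback shape; the project and group names are made up:

```python
import wandb

use_wandb = True
try:
    wandb.init(project="demo-project", group="demo-group")
except wandb.UsageError as e:
    # e.g. no API key configured: disable wandb logging and carry on.
    print(e)
    use_wandb = False
```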
13 changes: 6 additions & 7 deletions requirements/requirements-dev.txt
@@ -1,8 +1,7 @@
-autopep8==1.5.6
-clang-format==13.0.1
-pre-commit~=2.17.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-forked==1.3.0
+autopep8>=1.5.6
+clang-format>=13.0.1
+pre-commit>=2.17.0
+pytest>=6.2.3
+pytest-cov>=2.11.1
+pytest-forked>=1.3.0
 pytest-xdist
-transformers~=4.16.2
2 changes: 1 addition & 1 deletion requirements/requirements-onebitadam.txt
@@ -1 +1 @@
-cupy-cuda111==8.6.0
+cupy-cuda111>=8.6.0
1 change: 1 addition & 0 deletions requirements/requirements-wandb.txt
@@ -0,0 +1 @@
+wandb>=0.10.28
22 changes: 10 additions & 12 deletions requirements/requirements.txt
@@ -1,16 +1,14 @@
-einops==0.3.0
-ftfy==6.0.1
+deepspeed
+ftfy>=6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
-huggingface_hub==0.11.0
-lm_eval==0.3.0
-mpi4py==3.0.3
-numpy==1.22.0
-pybind11==2.6.2
+huggingface_hub>=0.11.0
+lm_eval>=0.3.0
+mpi4py>=3.0.3
+numpy>=1.22.0
+pybind11>=2.6.2
 regex
 sentencepiece
 six
-tiktoken==0.1.2
-tokenizers==0.12.1
-transformers~=4.24.0
-wandb==0.10.28
+tiktoken>=0.1.2
+tokenizers>=0.12.1
+transformers>=4.24.0