3 changes: 3 additions & 0 deletions Dockerfile
@@ -88,10 +88,13 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
 #### Python packages
 RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
 COPY requirements/requirements.txt .
+COPY requirements/requirements-wandb.txt .
 COPY requirements/requirements-onebitadam.txt .
 COPY requirements/requirements-sparseattention.txt .
 RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
     pip install -r requirements-sparseattention.txt && \
     pip install -r requirements-flashattention.txt && \
+    pip install -r requirements-wandb.txt && \
     pip install protobuf==3.20.* && \
     pip cache purge

29 changes: 16 additions & 13 deletions deepy.py
@@ -16,23 +16,26 @@
 import logging
 import os
 
-import deepspeed
-from deepspeed.launcher.runner import main
+import deepspeed.launcher.runner
 
-logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
 
-from megatron.neox_arguments import NeoXArgs
-from megatron.utils import get_wandb_api_key
+def main():
+    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))
 
-neox_args = NeoXArgs.consume_deepy_args()
-deepspeed_main_args = neox_args.get_deepspeed_main_args()
+    from megatron.neox_arguments import NeoXArgs
+    from megatron.utils import get_wandb_api_key
 
-# Extract wandb API key and inject into worker environments
-wandb_token = get_wandb_api_key(neox_args=neox_args)
-if wandb_token is not None:
-    deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
-    os.environ["WANDB_API_KEY"] = wandb_token
+    neox_args = NeoXArgs.consume_deepy_args()
+    deepspeed_main_args = neox_args.get_deepspeed_main_args()
+
+    # Extract wandb API key and inject into worker environments
+    wandb_token = get_wandb_api_key(neox_args=neox_args)
+    if wandb_token is not None:
+        deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
+        os.environ["WANDB_API_KEY"] = wandb_token
+
+    deepspeed.launcher.runner.main(deepspeed_main_args)
 
 
 if __name__ == "__main__":
-    main(deepspeed_main_args)
+    main()
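The rewritten launcher relies on a DeepSpeed detail worth spelling out: appending a variable name to `deepspeed.launcher.runner.EXPORT_ENVS` tells the runner to export that variable into the environment of every worker it spawns. A minimal sketch of the pattern, assuming `EXPORT_ENVS` acts as a simple allow-list and using a hypothetical `MY_SECRET` variable:

```python
import os

import deepspeed.launcher.runner

# Hypothetical variable for illustration: set it in the launcher process,
# then ask the runner to forward it to the workers it starts.
os.environ["MY_SECRET"] = "value-to-propagate"
deepspeed.launcher.runner.EXPORT_ENVS.append("MY_SECRET")

# A subsequent deepspeed.launcher.runner.main(args) call would launch
# workers that see MY_SECRET in their environment.
```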
7 changes: 6 additions & 1 deletion megatron/logging.py
@@ -14,7 +14,12 @@
 import sys
 import torch
-import wandb
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass
+
 from megatron import mpu, print_rank_0
 from megatron.utils import report_memory
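The guarded import above leaves the name `wandb` unbound when the package is absent, so every later use in the module must be conditional. A short sketch of the call-site discipline this implies; `log_metric` is a hypothetical helper, not part of the diff:

```python
import sys

try:
    import wandb
except ModuleNotFoundError:
    pass  # wandb stays unbound; all uses below must be guarded


def log_metric(name, value, step):
    # Only touch wandb when the import above actually succeeded.
    if "wandb" in sys.modules:
        wandb.log({name: value}, step=step)
```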
30 changes: 17 additions & 13 deletions megatron/neox_arguments/arguments.py
@@ -18,7 +18,6 @@
 import yaml
 import json
 import logging
-import shortuuid
 import copy
 import torch
 import argparse
@@ -278,13 +277,13 @@ def consume_deepy_args(cls):
"--wandb_group",
type=str,
default=None,
help='Weights and Biases group name - used to group together "runs".',
help='Weights & Biases group name - used to group together "runs".',
)
group.add_argument(
"--wandb_team",
type=str,
default=None,
help="Team name for Weights and Biases.",
help="Weights & Biases team name.",
)

group = parser.add_argument_group(title="Eval args")
@@ -372,11 +371,22 @@ def consume_deepy_args(cls):
             paths_to_yml_files=conf_files, overwrite_values=overwrite_values
         )
 
-        if neox_args.wandb_group is not None:
-            # concat the wandb group name with a uid to make sure it's unique
-            import wandb
-
-            neox_args.wandb_group += "_" + wandb.util.generate_id()
+        if neox_args.use_wandb:
+            try:
+                import wandb
+
+                # Check if the W&B group name is configured
+                if neox_args.wandb_group is None:
+                    # Set a randomized string as group name if no group name is provided
+                    neox_args.wandb_group = wandb.sdk.lib.runid.generate_id()
+                else:
+                    # Concatenate the W&B group name with a randomized string to ensure uniqueness
+                    neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id()
+            except ModuleNotFoundError as e:
+                if e.name == "wandb":
+                    e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to false to disable Weights & Biases logging."
+                raise e
 
         neox_args.print()
 
         return neox_args
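The group-naming rule introduced here is simple to state on its own: keep the configured group name if one is set, and always append a short random run id so repeated launches never collide. A standalone sketch, assuming `wandb` is installed; `exp-7` is a made-up group name:

```python
import wandb


def unique_group(name=None):
    # wandb.sdk.lib.runid.generate_id() returns a short random id, e.g. "1x2y3z4a".
    run_id = wandb.sdk.lib.runid.generate_id()
    return run_id if name is None else f"{name}_{run_id}"


print(unique_group())         # random id only, when no group is configured
print(unique_group("exp-7"))  # "exp-7_<random id>"
```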
@@ -736,12 +746,6 @@ def calculate_derived(self):
         Derives additional configuration values necessary for training from the current config
         """
 
-        # wandb
-        # sets a unique wandb group
-        if self.wandb_group is None:
-            # if none is defined a uuid is set for the run
-            self.wandb_group = shortuuid.uuid()
-
         # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)
11 changes: 7 additions & 4 deletions megatron/utils.py
@@ -24,16 +24,19 @@
 from typing import Dict, List
 
 import requests
-import wandb
-from wandb import UsageError
-
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass
+
 import torch
 
 from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion
 
 from megatron import print_rank_0
 from megatron import mpu
 from deepspeed import PipelineEngine, DeepSpeedEngine
 
 from collections import deque

@@ -167,7 +170,7 @@ def init_wandb(neox_args):
             force=False,
             entity=neox_args.wandb_team,
         )
-    except UsageError as e:
+    except wandb.UsageError as e:
         neox_args.update_value("use_wandb", False)
         print(e)
         print(
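Catching `wandb.UsageError` instead of the previously imported bare `UsageError` keeps this module importable without `wandb` while preserving the graceful fallback: on a configuration error (for example, a missing API key) logging is disabled and training continues. A hedged sketch of that fallback shape; the project and group names are made up:

```python
import wandb

use_wandb = True
try:
    wandb.init(project="demo-project", group="demo-group")
except wandb.UsageError as e:
    # e.g. no API key configured: disable wandb logging and carry on.
    print(e)
    use_wandb = False
```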
13 changes: 6 additions & 7 deletions requirements/requirements-dev.txt
@@ -1,8 +1,7 @@
-autopep8==1.5.6
-clang-format==13.0.1
-pre-commit~=2.17.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-forked==1.3.0
+autopep8>=1.5.6
+clang-format>=13.0.1
+pre-commit>=2.17.0
+pytest>=6.2.3
+pytest-cov>=2.11.1
+pytest-forked>=1.3.0
 pytest-xdist
-transformers~=4.16.2
2 changes: 1 addition & 1 deletion requirements/requirements-onebitadam.txt
@@ -1 +1 @@
-cupy-cuda111==8.6.0
+cupy-cuda111>=8.6.0
1 change: 1 addition & 0 deletions requirements/requirements-wandb.txt
@@ -0,0 +1 @@
+wandb>=0.10.28
22 changes: 10 additions & 12 deletions requirements/requirements.txt
@@ -1,16 +1,14 @@
-einops==0.3.0
-ftfy==6.0.1
+deepspeed
+ftfy>=6.0.1
 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
-huggingface_hub==0.11.0
-lm_eval==0.3.0
-mpi4py==3.0.3
-numpy==1.22.0
-pybind11==2.6.2
+huggingface_hub>=0.11.0
+lm_eval>=0.3.0
+mpi4py>=3.0.3
+numpy>=1.22.0
+pybind11>=2.6.2
 regex
 sentencepiece
 six
-tiktoken==0.1.2
-tokenizers==0.12.1
-transformers~=4.24.0
-wandb==0.10.28
+tiktoken>=0.1.2
+tokenizers>=0.12.1
+transformers>=4.24.0