Skip to content

Commit 3782c7a

Browse files
Add jsrun launcher based on neox 2.0
1 parent e2d8a24 commit 3782c7a

File tree

3 files changed

+7
-2
lines changed

3 files changed

+7
-2
lines changed

configs/neox_arguments.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1809,7 +1809,7 @@ Args for deepspeed runner (deepspeed.launcher.runner).
18091809

18101810

18111811

1812-
- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm']
1812+
- **launcher**: typing.Literal['pdsh', 'openmpi', 'mvapich', 'slurm', 'jsrun']
18131813

18141814
Default = pdsh
18151815

megatron/neox_arguments/deepspeed_args.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ class NeoXArgsDeepspeedRunner(NeoXArgsTemplate):
192192
IP address of node 0, will be inferred via 'hostname -I' if not specified.
193193
"""
194194

195-
launcher: Literal["pdsh", "openmpi", "mvapich", "slurm"] = "pdsh"
195+
launcher: Literal["pdsh", "openmpi", "mvapich", "slurm", "jsrun"] = "pdsh"
196196
"""
197197
Launcher backend for multi-node training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, and JSRUN.
198198
"""

megatron/neox_arguments/neox_args.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,11 @@ class NeoXArgsOther(NeoXArgsTemplate):
612612
Run via SLURM. This will attempt to discover the necessary variables to initialize torch distributed from the SLURM environment.
613613
"""
614614

615+
deepspeed_jsrun: bool = False
616+
"""
617+
Run via JSRUN. This will attempt to discover the necessary variables to initialize torch distributed from the IBM LSF environment.
618+
"""
619+
615620
user_script: str = None
616621
"""
617622
user script to be run

0 commit comments

Comments
 (0)