22
22
import pytorch_lightning as pl
23
23
from lightning_lite .plugins import ClusterEnvironment
24
24
from lightning_lite .strategies .launchers .base import _Launcher
25
- from lightning_lite .strategies .launchers .subprocess_script import _basic_subprocess_cmd , _hydra_subprocess_cmd
26
25
27
26
_HYDRA_AVAILABLE = RequirementCache ("hydra-core" )
28
27
@@ -100,6 +99,32 @@ def _call_children_scripts(self) -> None:
100
99
# allow the user to pass the node rank
101
100
os .environ ["NODE_RANK" ] = str (self .cluster_environment .node_rank ())
102
101
os .environ ["LOCAL_RANK" ] = str (self .cluster_environment .local_rank ())
102
+
103
+ # Check if the current calling command looked like `python a/b/c.py` or `python -m a.b.c`
104
+ # See https://docs.python.org/3/reference/import.html#main-spec
105
+ if __main__ .__spec__ is None : # pragma: no-cover
106
+ # Script called as `python a/b/c.py`
107
+ if _HYDRA_AVAILABLE :
108
+ # when the user is using hydra, find the absolute path
109
+ from hydra .utils import to_absolute_path
110
+
111
+ to_abs_path = to_absolute_path
112
+ else :
113
+ to_abs_path = os .path .abspath
114
+
115
+ # pull out the commands used to run the script and resolve the absolute file path
116
+ command = sys .argv
117
+ try :
118
+ full_path = to_abs_path (command [0 ])
119
+ except Exception :
120
+ full_path = os .path .abspath (command [0 ])
121
+
122
+ command [0 ] = full_path
123
+ # use the same python interpreter that is currently running
124
+ command = [sys .executable ] + command
125
+ else : # Script called as `python -m a.b.c`
126
+ command = [sys .executable , "-m" , __main__ .__spec__ .name ] + sys .argv [1 :]
127
+
103
128
os .environ ["WORLD_SIZE" ] = f"{ self .num_processes * self .num_nodes } "
104
129
105
130
for local_rank in range (1 , self .num_processes ):
@@ -110,18 +135,18 @@ def _call_children_scripts(self) -> None:
110
135
if os .environ .get ("PL_GLOBAL_SEED" ) is None and "PL_GLOBAL_SEED" in env_copy :
111
136
del env_copy ["PL_GLOBAL_SEED" ]
112
137
113
- hydra_in_use = False
138
+ # start process
139
+ # if hydra is available and initialized, make sure to set the cwd correctly
140
+ cwd : Optional [str ] = None
114
141
if _HYDRA_AVAILABLE :
115
142
from hydra .core .hydra_config import HydraConfig
143
+ from hydra .utils import get_original_cwd
116
144
117
- hydra_in_use = HydraConfig .initialized ()
118
-
119
- if hydra_in_use :
120
- command = _hydra_subprocess_cmd (local_rank )
121
- else :
122
- command = _basic_subprocess_cmd ()
123
-
124
- subprocess .Popen (command , env = env_copy )
145
+ if HydraConfig .initialized ():
146
+ cwd = get_original_cwd ()
147
+ os_cwd = f'"{ os .getcwd ()} "'
148
+ command += [f"hydra.run.dir={ os_cwd } " , f"hydra.job.name=train_ddp_process_{ local_rank } " ]
149
+ subprocess .Popen (command , env = env_copy , cwd = cwd )
125
150
126
151
# starting all processes at once can cause issues
127
152
# with dataloaders delay between 1-10 seconds
0 commit comments