2
2
import re
3
3
import time
4
4
from datetime import datetime
5
+ from enum import Enum
5
6
from textwrap import dedent
6
7
from typing import Any , List
7
8
29
30
MAX_CLUSTER_WAIT_TIME = 5400
30
31
31
32
33
class ClusterState(Enum):
    """User-facing cluster lifecycle states, stored as lower-case strings."""

    UNSPECIFIED = "unspecified"
    QUEUED = "queued"
    PENDING = "pending"
    RUNNING = "running"
    FAILED = "failed"
    DELETED = "deleted"

    def __str__(self) -> str:
        """Render the member as its plain lower-case value."""
        text = self.value
        return str(text)

    @classmethod
    def from_api(cls, status: V1ClusterState) -> "ClusterState":
        """Build a member from the string form of an API cluster state.

        The string form of ``status`` is lower-cased and split on at most the
        first two underscores; the final piece is looked up as a member value.

        Args:
            status: The API state to convert

        Raises:
            ValueError: If the final piece does not match any member value
        """
        # NOTE(review): presumably API states look like ``CLUSTER_STATE_<NAME>`` — confirm.
        *_, suffix = str(status).lower().split("_", maxsplit=2)
        return cls(suffix)
48
+
49
+
32
50
class ClusterList (Formatable ):
33
51
def __init__ (self , clusters : List [Externalv1Cluster ]):
34
52
self .clusters = clusters
@@ -130,9 +148,6 @@ def create(
130
148
click .echo ("cluster unchanged" )
131
149
132
150
resp = self .api_client .cluster_service_create_cluster (body = new_body )
133
- if wait :
134
- _wait_for_cluster_state (self .api_client , resp .id , V1ClusterState .RUNNING )
135
-
136
151
click .echo (
137
152
dedent (
138
153
f"""\
@@ -146,6 +161,13 @@ def create(
146
161
"""
147
162
)
148
163
)
164
+ if wait :
165
+ click .echo ("Waiting for cluster to enter state running..." )
166
+ click .echo (
167
+ "Canceling this operation will NOT stop the cluster from creating"
168
+ f"(use `lightning delete cluster { resp .id } `)"
169
+ )
170
+ _wait_for_cluster_state (self .api_client , resp .id , V1ClusterState .RUNNING )
149
171
150
172
def get_clusters (self ) -> ClusterList :
151
173
resp = self .api_client .cluster_service_list_clusters (phase_not_in = [V1ClusterState .DELETED ])
@@ -171,6 +193,8 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No
171
193
click .echo ("Cluster deletion triggered successfully" )
172
194
173
195
if wait :
196
+ click .echo ("Waiting for cluster to delete..." )
197
+ click .echo ("Canceling the operation will NOT stop the cluster from deleting" )
174
198
_wait_for_cluster_state (self .api_client , cluster_id , V1ClusterState .DELETED )
175
199
176
200
@@ -183,6 +207,8 @@ def _wait_for_cluster_state(
183
207
) -> None :
184
208
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
185
209
210
+ Messages will be displayed to the user as the cluster changes state.
211
+
186
212
Args:
187
213
api_client: LightningClient used for polling
188
214
cluster_id: Specifies the cluster to wait for
@@ -192,6 +218,7 @@ def _wait_for_cluster_state(
192
218
"""
193
219
start = time .time ()
194
220
elapsed = 0
221
+
195
222
while elapsed < max_wait_time :
196
223
cluster_resp = api_client .cluster_service_list_clusters ()
197
224
new_cluster = None
@@ -200,10 +227,14 @@ def _wait_for_cluster_state(
200
227
new_cluster = clust
201
228
break
202
229
if new_cluster is not None :
230
+ echo_cluster_status_long (
231
+ cluster_id = cluster_id ,
232
+ current_state = new_cluster .status .phase ,
233
+ current_reason = new_cluster .status .reason ,
234
+ desired_state = target_state ,
235
+ )
203
236
if new_cluster .status .phase == target_state :
204
237
break
205
- elif new_cluster .status .phase == V1ClusterState .FAILED :
206
- raise click .ClickException (f"Cluster { cluster_id } is in failed state." )
207
238
time .sleep (check_timeout )
208
239
elapsed = int (time .time () - start )
209
240
else :
@@ -219,3 +250,49 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str:
219
250
Provide a cluster name using valid characters and try again."""
220
251
)
221
252
return value
253
+
254
+
255
def echo_cluster_status_long(
    cluster_id: str,
    current_state: V1ClusterState,
    current_reason: str,
    desired_state: V1ClusterState,
) -> None:
    """Echo a long-form status message to the user about the cluster state.

    A generic "is now <state>" message is built first and then replaced or
    extended for the running, failed and deleted states.

    Args:
        cluster_id: The name of the cluster
        current_state: The cluster's current state
        current_reason: The reason for the cluster's current state; left out of the message when empty
        desired_state: The state being waited for; selects the wording used while the cluster is running
    """
    state_str = ClusterState.from_api(current_state)

    # Generic fallback, used as-is for every state without a dedicated message below.
    message = f"Cluster {cluster_id} is now {state_str}"
    if current_reason:
        message += f" with the following reason: {current_reason}"

    if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING:
        message = "\n".join(
            [
                f"Cluster {cluster_id} is now running and ready to use.",
                f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`",
            ]
        )
    elif current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED:
        # A running cluster that is awaiting deletion is reported as terminating.
        message = f"Cluster {cluster_id} is terminating"
    elif current_state == V1ClusterState.FAILED:
        # Keep the generic message (it carries the failure reason) and append guidance.
        message = "\n".join(
            [
                message,
                "We are automatically retrying cluster creation.",
                "In case you want to delete this cluster:",
                "1. Stop this command",
                f"2. Run `lightning delete cluster {cluster_id}`",
                "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.",
            ]
        )
    elif current_state == V1ClusterState.DELETED:
        message = f"Cluster {cluster_id} has been deleted."

    click.echo(message)
0 commit comments