Skip to content

Commit e47d7d0

Browse files
luca3rd authored and Borda committed
Cleanup cluster waiting (#16054)
(cherry picked from commit 6458a5a)
1 parent 4debdd3 commit e47d7d0

File tree

1 file changed

+20
-13
lines changed

1 file changed

+20
-13
lines changed

src/lightning_app/cli/cmd_clusters.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,7 @@ def _wait_for_cluster_state(
231231
cluster_id: str,
232232
target_state: V1ClusterState,
233233
timeout_seconds: int = MAX_CLUSTER_WAIT_TIME,
234-
poll_duration_seconds: int = 10,
234+
poll_duration_seconds: int = 60,
235235
) -> None:
236236
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
237237
@@ -307,21 +307,24 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
307307
duration = _format_elapsed_seconds(elapsed)
308308

309309
if current_state == V1ClusterState.FAILED:
310-
return dedent(
311-
f"""\
312-
The requested cluster operation for cluster {cluster_id} has errors:
313-
{current_reason}
310+
if not _is_retryable_error(current_reason):
311+
return dedent(
312+
f"""\
313+
The requested cluster operation for cluster {cluster_id} has errors:
314314
315-
---
316-
We are automatically retrying, and an automated alert has been created
315+
{current_reason}
317316
318-
WARNING: Any non-deleted cluster may be using resources.
319-
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
320-
lightning delete cluster {cluster_id}
317+
--------------------------------------------------------------
321318
322-
Contact support@lightning.ai for additional help
323-
"""
324-
)
319+
We are automatically retrying, and an automated alert has been created
320+
321+
WARNING: Any non-deleted cluster may be using resources.
322+
To avoid incuring cost on your cloud provider, delete the cluster using the following command:
323+
lightning delete cluster {cluster_id}
324+
325+
Contact support@lightning.ai for additional help
326+
"""
327+
)
325328

326329
if desired_state == current_state == V1ClusterState.RUNNING:
327330
return dedent(
@@ -352,6 +355,10 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
352355
raise click.ClickException(f"Unknown cluster desired state {desired_state}")
353356

354357

358+
def _is_retryable_error(error_message: str) -> bool:
359+
return "resources failed to delete" in error_message
360+
361+
355362
def _format_elapsed_seconds(seconds: Union[float, int]) -> str:
356363
"""Turns seconds into a duration string.
357364

0 commit comments

Comments
 (0)