@@ -231,7 +231,7 @@ def _wait_for_cluster_state(
231
231
cluster_id : str ,
232
232
target_state : V1ClusterState ,
233
233
timeout_seconds : int = MAX_CLUSTER_WAIT_TIME ,
234
- poll_duration_seconds : int = 10 ,
234
+ poll_duration_seconds : int = 60 ,
235
235
) -> None :
236
236
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
237
237
@@ -307,21 +307,24 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
307
307
duration = _format_elapsed_seconds (elapsed )
308
308
309
309
if current_state == V1ClusterState .FAILED :
310
- return dedent (
311
- f""" \
312
- The requested cluster operation for cluster { cluster_id } has errors:
313
- { current_reason }
310
+ if not _is_retryable_error ( current_reason ):
311
+ return dedent (
312
+ f""" \
313
+ The requested cluster operation for cluster { cluster_id } has errors:
314
314
315
- ---
316
- We are automatically retrying, and an automated alert has been created
315
+ { current_reason }
317
316
318
- WARNING: Any non-deleted cluster may be using resources.
319
- To avoid incuring cost on your cloud provider, delete the cluster using the following command:
320
- lightning delete cluster { cluster_id }
317
+ --------------------------------------------------------------
321
318
322
- Contact [email protected] for additional help
323
- """
324
- )
319
+ We are automatically retrying, and an automated alert has been created
320
+
321
+ WARNING: Any non-deleted cluster may be using resources.
322
+ To avoid incuring cost on your cloud provider, delete the cluster using the following command:
323
+ lightning delete cluster { cluster_id }
324
+
325
+ Contact [email protected] for additional help
326
+ """
327
+ )
325
328
326
329
if desired_state == current_state == V1ClusterState .RUNNING :
327
330
return dedent (
@@ -352,6 +355,10 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster
352
355
raise click .ClickException (f"Unknown cluster desired state { desired_state } " )
353
356
354
357
358
+ def _is_retryable_error (error_message : str ) -> bool :
359
+ return "resources failed to delete" in error_message
360
+
361
+
355
362
def _format_elapsed_seconds (seconds : Union [float , int ]) -> str :
356
363
"""Turns seconds into a duration string.
357
364
0 commit comments