Skip to content

Commit 33e1f93

Browse files
luca3rdnmiculinicpre-commit-ci[bot]nicolai86
authored
[App] Improve cluster creation / deletion experience (#15458)
Cluster creation and deletion can take a long time. Instead of having these long-running operations happen in the background, they should happen in the foreground. The advantage is that failures are brought to the user's attention immediately, instead of the next time they decide to run `lightning list clusters`. While the CLI waits for the cluster to run / delete, it will display cluster status changes to the user. This PR also hides the `--enable-performance` and `--edit-before-creation` creation flags, as well as the `--force` deletion flag. They are either not frequently used (performance mode is expensive), or prone to misuse. Co-authored-by: Neven Miculinic <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Raphael Randschau <[email protected]>
1 parent 8003663 commit 33e1f93

File tree

7 files changed

+251
-108
lines changed

7 files changed

+251
-108
lines changed

docs/source-app/workflows/byoc/index.rst

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Parameters
6363
^^^^^^^^^^
6464

6565
+------------------------+----------------------------------------------------------------------------------------------------+
66-
|Parameter | Descritption |
66+
|Parameter | Description |
6767
+========================+====================================================================================================+
6868
| provider | The cloud provider where your cluster is located. |
6969
| | |
@@ -78,18 +78,7 @@ Parameters
7878
+------------------------+----------------------------------------------------------------------------------------------------+
7979
| region | AWS region containing compute resources |
8080
+------------------------+----------------------------------------------------------------------------------------------------+
81-
| instance-types | Instance types that you want to support, for computer jobs within the cluster. |
82-
| | |
83-
| | For now, this is the AWS instance types supported by the cluster. |
84-
+------------------------+----------------------------------------------------------------------------------------------------+
85-
| enable-performance | Specifies if the cluster uses cost savings mode. |
86-
| | |
87-
| | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters |
88-
| | with low utilization. |
89-
+------------------------+----------------------------------------------------------------------------------------------------+
90-
| edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. |
91-
+------------------------+----------------------------------------------------------------------------------------------------+
92-
| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. |
81+
| async | Cluster creation will happen in the background. |
9382
+------------------------+----------------------------------------------------------------------------------------------------+
9483

9584
----

src/lightning_app/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1919
### Changed
2020

2121
- The `MultiNode` components now warn the user when running with `num_nodes > 1` locally ([#15806](https://github.com/Lightning-AI/lightning/pull/15806))
22+
- Cluster creation and deletion now wait by default ([#15458](https://github.com/Lightning-AI/lightning/pull/15458))
2223

2324

2425
### Deprecated

src/lightning_app/cli/cmd_clusters.py

Lines changed: 173 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import time
44
from datetime import datetime
55
from textwrap import dedent
6-
from typing import Any, List
6+
from typing import Any, List, Union
77

88
import click
9+
import lightning_cloud
910
from lightning_cloud.openapi import (
1011
Externalv1Cluster,
1112
V1AWSClusterDriverSpec,
@@ -15,8 +16,10 @@
1516
V1ClusterState,
1617
V1ClusterType,
1718
V1CreateClusterRequest,
19+
V1GetClusterResponse,
1820
V1KubernetesClusterDriver,
1921
)
22+
from lightning_utilities.core.enums import StrEnum
2023
from rich.console import Console
2124
from rich.table import Table
2225
from rich.text import Text
@@ -25,10 +28,26 @@
2528
from lightning_app.utilities.network import LightningClient
2629
from lightning_app.utilities.openapi import create_openapi_object, string2dict
2730

28-
CLUSTER_STATE_CHECKING_TIMEOUT = 60
2931
MAX_CLUSTER_WAIT_TIME = 5400
3032

3133

34+
class ClusterState(StrEnum):
    """Lower-case, human-readable cluster states used in CLI messages."""

    UNSPECIFIED = "unspecified"
    QUEUED = "queued"
    PENDING = "pending"
    RUNNING = "running"
    # The member is named FAILED but is rendered to the user as "error".
    FAILED = "error"
    DELETED = "deleted"

    def __str__(self) -> str:
        """Render the member as its plain string value."""
        return str(self.value)

    @classmethod
    def from_api(cls, status: V1ClusterState) -> "ClusterState":
        """Convert an API cluster state into the matching ``ClusterState`` member.

        The API value is stringified, lower-cased and split on ``_`` at most twice; the last
        piece is then looked up with ``from_str``.
        NOTE(review): presumably API states look like ``CLUSTER_STATE_<NAME>`` -- confirm.
        """
        pieces = str(status).lower().split("_", maxsplit=2)
        state_name = pieces[-1]
        return cls.from_str(state_name)
49+
50+
3251
class ClusterList(Formatable):
3352
def __init__(self, clusters: List[Externalv1Cluster]):
3453
self.clusters = clusters
@@ -86,7 +105,7 @@ def create(
86105
region: str = "us-east-1",
87106
external_id: str = None,
88107
edit_before_creation: bool = False,
89-
wait: bool = False,
108+
do_async: bool = False,
90109
) -> None:
91110
"""request Lightning AI BYOC compute cluster creation.
92111
@@ -97,7 +116,7 @@ def create(
97116
region: AWS region containing compute resources
98117
external_id: AWS IAM Role external ID
99118
edit_before_creation: Enables interactive editing of requests before submitting it to Lightning AI.
100-
wait: Waits for the cluster to be in a RUNNING state. Only use this for debugging.
119+
do_async: Triggers cluster creation in the background and exits
101120
"""
102121
performance_profile = V1ClusterPerformanceProfile.DEFAULT
103122
if cost_savings:
@@ -130,22 +149,31 @@ def create(
130149
click.echo("cluster unchanged")
131150

132151
resp = self.api_client.cluster_service_create_cluster(body=new_body)
133-
if wait:
134-
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)
135-
136152
click.echo(
137153
dedent(
138154
f"""\
139-
{resp.id} is now being created... This can take up to an hour.
155+
BYOC cluster creation triggered successfully!
156+
This can take up to an hour to complete.
140157
141158
To view the status of your clusters use:
142-
`lightning list clusters`
159+
lightning list clusters
143160
144161
To view cluster logs use:
145-
`lightning show cluster logs {resp.id}`
146-
"""
162+
lightning show cluster logs {cluster_name}
163+
164+
To delete the cluster run:
165+
lightning delete cluster {cluster_name}
166+
"""
147167
)
148168
)
169+
background_message = "\nCluster will be created in the background!"
170+
if do_async:
171+
click.echo(background_message)
172+
else:
173+
try:
174+
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)
175+
except KeyboardInterrupt:
176+
click.echo(background_message)
149177

150178
def get_clusters(self) -> ClusterList:
151179
resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED])
@@ -156,7 +184,7 @@ def list(self) -> None:
156184
console = Console()
157185
console.print(clusters.as_table())
158186

159-
def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> None:
187+
def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) -> None:
160188
if force:
161189
click.echo(
162190
"""
@@ -167,47 +195,86 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No
167195
)
168196
click.confirm("Do you want to continue?", abort=True)
169197

198+
resp: V1GetClusterResponse = self.api_client.cluster_service_get_cluster(id=cluster_id)
199+
bucket_name = resp.spec.driver.kubernetes.aws.bucket_name
200+
170201
self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force)
171-
click.echo("Cluster deletion triggered successfully")
202+
click.echo(
203+
dedent(
204+
f"""\
205+
Cluster deletion triggered successfully
206+
207+
For safety purposes we will not delete anything in the S3 bucket associated with the cluster:
208+
{bucket_name}
172209
173-
if wait:
174-
_wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)
210+
You may want to delete it manually using the AWS CLI:
211+
aws s3 rb --force s3://{bucket_name}
212+
"""
213+
)
214+
)
215+
216+
background_message = "\nCluster will be deleted in the background!"
217+
if do_async:
218+
click.echo(background_message)
219+
else:
220+
try:
221+
_wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)
222+
except KeyboardInterrupt:
223+
click.echo(background_message)
175224

176225

177226
def _wait_for_cluster_state(
    api_client: LightningClient,
    cluster_id: str,
    target_state: V1ClusterState,
    timeout_seconds: int = MAX_CLUSTER_WAIT_TIME,
    poll_duration_seconds: int = 10,
) -> None:
    """_wait_for_cluster_state waits until the provided cluster has reached a desired state.

    The API server is polled for the cluster, and a status message is echoed to the user after
    every successful poll. Returns normally once the cluster is in ``target_state``, or when the
    cluster can no longer be found (HTTP 404) while waiting for it to be deleted.

    Args:
        api_client: LightningClient used for polling
        cluster_id: Specifies the cluster to wait for
        target_state: Specifies the desired state the target cluster needs to meet
        timeout_seconds: Maximum duration to wait
        poll_duration_seconds: duration between polling for the cluster state

    Raises:
        click.ClickException: if the cluster has not reached ``target_state`` within ``timeout_seconds``
        lightning_cloud.openapi.rest.ApiException: on any API error other than a 404 while waiting for deletion
    """
    start = time.time()
    elapsed = 0

    click.echo(f"Waiting for cluster to be {ClusterState.from_api(target_state)}...")
    while elapsed < timeout_seconds:
        # Only the API call is guarded: nothing else in the loop is expected to raise ApiException.
        try:
            resp: V1GetClusterResponse = api_client.cluster_service_get_cluster(id=cluster_id)
        except lightning_cloud.openapi.rest.ApiException as e:
            # A cluster that can no longer be found counts as deleted.
            if e.status == 404 and target_state == V1ClusterState.DELETED:
                return
            raise

        click.echo(_cluster_status_long(cluster=resp, desired_state=target_state, elapsed=elapsed))
        if resp.status.phase == target_state:
            break
        time.sleep(poll_duration_seconds)
        elapsed = int(time.time() - start)
    else:
        # Reached only when the loop ran out of time without hitting `break`.
        state_str = ClusterState.from_api(target_state)
        raise click.ClickException(
            dedent(
                f"""\
                The cluster has not entered the {state_str} state within {_format_elapsed_seconds(timeout_seconds)}.

                The cluster may eventually be {state_str} afterwards, please check its status using:
                lightning list clusters

                To view cluster logs use:
                lightning show cluster logs {cluster_id}

                Contact [email protected] for additional help
                """
            )
        )
211278

212279

213280
def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str:
@@ -219,3 +286,76 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str:
219286
Provide a cluster name using valid characters and try again."""
220287
)
221288
return value
289+
290+
291+
def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1ClusterState, elapsed: float) -> str:
    """Builds a long-form status message for the user about the cluster state.

    The message is returned, not echoed; displaying it is left to the caller.

    Args:
        cluster: The cluster object
        desired_state: The state the caller is waiting for the cluster to reach
        elapsed: Seconds since we've started polling

    Returns:
        A message describing the failure, the progress, or the completion of the requested operation.

    Raises:
        click.ClickException: if the cluster has not failed and ``desired_state`` is neither
            ``RUNNING`` nor ``DELETED``
    """
    cluster_name = cluster.name
    current_state = cluster.status.phase

    if current_state == V1ClusterState.FAILED:
        # The template is dedented *before* interpolation so that a multi-line failure reason
        # cannot change the common leading whitespace and defeat the dedent.
        return dedent(
            """\
            The requested cluster operation for cluster {cluster_name} has errors:
            {current_reason}

            ---
            We are automatically retrying, and an automated alert has been created

            WARNING: Any non-deleted cluster may be using resources.
            To avoid incurring cost on your cloud provider, delete the cluster using the following command:
            lightning delete cluster {cluster_name}

            Contact [email protected] for additional help
            """
        ).format(cluster_name=cluster_name, current_reason=cluster.status.reason)

    if desired_state == current_state == V1ClusterState.RUNNING:
        return dedent(
            f"""\
            Cluster {cluster_name} is now running and ready to use.
            To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id {cluster_name}
            """
        )

    if desired_state == V1ClusterState.RUNNING:
        return f"Cluster {cluster_name} is being created [elapsed={_format_elapsed_seconds(elapsed)}]"

    if desired_state == current_state == V1ClusterState.DELETED:
        # The bucket name is only needed here, so the nested driver spec is only dereferenced
        # on this branch instead of on every poll.
        bucket_name = cluster.spec.driver.kubernetes.aws.bucket_name
        return dedent(
            f"""\
            Cluster {cluster_name} has been successfully deleted, and almost all AWS resources have been removed

            For safety purposes we kept the S3 bucket associated with the cluster: {bucket_name}

            You may want to delete it manually using the AWS CLI:
            aws s3 rb --force s3://{bucket_name}
            """
        )

    if desired_state == V1ClusterState.DELETED:
        return f"Cluster {cluster_name} is being deleted [elapsed={_format_elapsed_seconds(elapsed)}]"

    raise click.ClickException(f"Unknown cluster desired state {desired_state}")
350+
351+
352+
def _format_elapsed_seconds(seconds: Union[float, int]) -> str:
353+
"""Turns seconds into a duration string.
354+
355+
>>> _format_elapsed_seconds(5)
356+
'05s'
357+
>>> _format_elapsed_seconds(60)
358+
'01m00s'
359+
"""
360+
minutes, seconds = divmod(seconds, 60)
361+
return (f"{minutes:02}m" if minutes else "") + f"{seconds:02}s"

src/lightning_app/cli/lightning_cli_create.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def create() -> None:
3737
type=bool,
3838
required=False,
3939
default=False,
40+
hidden=True,
4041
is_flag=True,
4142
help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance.
4243
This makes runs more expensive but start-up times decrease.""",
@@ -45,16 +46,17 @@ def create() -> None:
4546
"--edit-before-creation",
4647
default=False,
4748
is_flag=True,
49+
hidden=True,
4850
help="Edit the cluster specs before submitting them to the API server.",
4951
)
5052
@click.option(
51-
"--wait",
52-
"wait",
53+
"--async",
54+
"do_async",
5355
type=bool,
5456
required=False,
5557
default=False,
5658
is_flag=True,
57-
help="Enabling this flag makes the CLI wait until the cluster is running.",
59+
help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.",
5860
)
5961
def create_cluster(
6062
cluster_name: str,
@@ -64,7 +66,7 @@ def create_cluster(
6466
provider: str,
6567
edit_before_creation: bool,
6668
enable_performance: bool,
67-
wait: bool,
69+
do_async: bool,
6870
**kwargs: Any,
6971
) -> None:
7072
"""Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
@@ -79,7 +81,7 @@ def create_cluster(
7981
external_id=external_id,
8082
edit_before_creation=edit_before_creation,
8183
cost_savings=not enable_performance,
82-
wait=wait,
84+
do_async=do_async,
8385
)
8486

8587

0 commit comments

Comments
 (0)