Skip to content

Commit d3215ad

Browse files
luca3rdnmiculinic
authored and committed
Wait by default and notify on state changes
1 parent 921dc1c commit d3215ad

File tree

7 files changed

+214
-30
lines changed

7 files changed

+214
-30
lines changed

docs/source-app/workflows/byoc/index.rst

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Parameters
6363
^^^^^^^^^^
6464

6565
+------------------------+----------------------------------------------------------------------------------------------------+
66-
|Parameter | Descritption |
66+
|Parameter | Description |
6767
+========================+====================================================================================================+
6868
| provider | The cloud provider where your cluster is located. |
6969
| | |
@@ -78,18 +78,14 @@ Parameters
7878
+------------------------+----------------------------------------------------------------------------------------------------+
7979
| region | AWS region containing compute resources |
8080
+------------------------+----------------------------------------------------------------------------------------------------+
81-
| instance-types | Instance types that you want to support, for computer jobs within the cluster. |
82-
| | |
83-
| | For now, this is the AWS instance types supported by the cluster. |
84-
+------------------------+----------------------------------------------------------------------------------------------------+
8581
| enable-performance | Specifies if the cluster uses cost savings mode. |
8682
| | |
8783
| | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters |
8884
| | with low utilization. |
8985
+------------------------+----------------------------------------------------------------------------------------------------+
9086
| edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. |
9187
+------------------------+----------------------------------------------------------------------------------------------------+
92-
| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. |
88+
| async | Cluster creation will happen in the background. |
9389
+------------------------+----------------------------------------------------------------------------------------------------+
9490

9591
----

src/lightning_app/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1919

2020
### Changed
2121

22+
- Cluster creation and deletion now wait by default ([#15458](https://github.com/Lightning-AI/lightning/pull/15458))
2223
- Changed `flow.flows` to be recursive, to align its behavior with `flow.works` ([#15466](https://github.com/Lightning-AI/lightning/pull/15466))
2324

2425
-

src/lightning_app/cli/cmd_clusters.py

Lines changed: 82 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import re
33
import time
44
from datetime import datetime
5+
from enum import Enum
56
from textwrap import dedent
67
from typing import Any, List
78

@@ -29,6 +30,23 @@
2930
MAX_CLUSTER_WAIT_TIME = 5400
3031

3132

33+
class ClusterState(Enum):
    """Cluster states in the lower-case form shown to the user by the CLI.

    Each member's value is the word that is interpolated into status messages.
    """

    UNSPECIFIED = "unspecified"
    QUEUED = "queued"
    PENDING = "pending"
    RUNNING = "running"
    FAILED = "failed"
    DELETED = "deleted"

    def __str__(self) -> str:
        # Render as the bare value (e.g. "running") rather than "ClusterState.RUNNING".
        return f"{self.value}"

    @classmethod
    def from_api(cls, status: V1ClusterState) -> "ClusterState":
        """Build a ``ClusterState`` from the state value reported by the API.

        The string form of ``status`` is lower-cased and split on at most two
        underscores; the trailing piece is looked up as a member value.
        Presumably API states look like ``CLUSTER_STATE_RUNNING`` -- verify
        against ``V1ClusterState``.

        Raises:
            ValueError: If the trailing piece does not match any member value.
        """
        *_, suffix = str(status).lower().split("_", 2)
        return cls(suffix)
48+
49+
3250
class ClusterList(Formatable):
3351
def __init__(self, clusters: List[Externalv1Cluster]):
3452
self.clusters = clusters
@@ -130,9 +148,6 @@ def create(
130148
click.echo("cluster unchanged")
131149

132150
resp = self.api_client.cluster_service_create_cluster(body=new_body)
133-
if wait:
134-
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)
135-
136151
click.echo(
137152
dedent(
138153
f"""\
@@ -146,6 +161,13 @@ def create(
146161
"""
147162
)
148163
)
164+
if wait:
165+
click.echo("Waiting for cluster to enter state running...")
166+
click.echo(
167+
"Canceling this operation will NOT stop the cluster from creating"
168+
f"(use `lightning delete cluster {resp.id}`)"
169+
)
170+
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)
149171

150172
def get_clusters(self) -> ClusterList:
151173
resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED])
@@ -171,6 +193,8 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No
171193
click.echo("Cluster deletion triggered successfully")
172194

173195
if wait:
196+
click.echo("Waiting for cluster to delete...")
197+
click.echo("Canceling the operation will NOT stop the cluster from deleting")
174198
_wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)
175199

176200

@@ -183,6 +207,8 @@ def _wait_for_cluster_state(
183207
) -> None:
184208
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
185209
210+
Messages will be displayed to the user as the cluster changes state.
211+
186212
Args:
187213
api_client: LightningClient used for polling
188214
cluster_id: Specifies the cluster to wait for
@@ -192,6 +218,7 @@ def _wait_for_cluster_state(
192218
"""
193219
start = time.time()
194220
elapsed = 0
221+
195222
while elapsed < max_wait_time:
196223
cluster_resp = api_client.cluster_service_list_clusters()
197224
new_cluster = None
@@ -200,10 +227,14 @@ def _wait_for_cluster_state(
200227
new_cluster = clust
201228
break
202229
if new_cluster is not None:
230+
echo_cluster_status_long(
231+
cluster_id=cluster_id,
232+
current_state=new_cluster.status.phase,
233+
current_reason=new_cluster.status.reason,
234+
desired_state=target_state,
235+
)
203236
if new_cluster.status.phase == target_state:
204237
break
205-
elif new_cluster.status.phase == V1ClusterState.FAILED:
206-
raise click.ClickException(f"Cluster {cluster_id} is in failed state.")
207238
time.sleep(check_timeout)
208239
elapsed = int(time.time() - start)
209240
else:
@@ -219,3 +250,49 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str:
219250
Provide a cluster name using valid characters and try again."""
220251
)
221252
return value
253+
254+
255+
def echo_cluster_status_long(
    cluster_id: str,
    current_state: V1ClusterState,
    current_reason: str,
    desired_state: V1ClusterState,
) -> None:
    """Echo a long-form status message to the user about the cluster state.

    A generic "is now <state>" message is built first and then replaced (or,
    for a failed cluster, extended) with more specific guidance depending on
    the current and desired states. The result is printed with ``click.echo``.

    Args:
        cluster_id: Identifier of the cluster; interpolated into every message
        current_state: The cluster's current state
        current_reason: The reason for the cluster's state; left out of the
            message when empty
        desired_state: The state the caller is waiting for; selects the wording
            used when the cluster is currently running
    """
    state_str = ClusterState.from_api(current_state)

    # Generic message, used as-is for states without dedicated wording below.
    message = f"Cluster {cluster_id} is now {state_str}"
    if current_reason:
        message += f" with the following reason: {current_reason}"

    if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING:
        message = "\n".join(
            [
                f"Cluster {cluster_id} is now running and ready to use.",
                f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`",
            ]
        )
    elif current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED:
        # Still running while a deletion is awaited: report it as terminating.
        message = f"Cluster {cluster_id} is terminating"
    elif current_state == V1ClusterState.FAILED:
        # Keep the generic message (it carries the failure reason) and append guidance.
        message = "\n".join(
            [
                message,
                "We are automatically retrying cluster creation.",
                "In case you want to delete this cluster:",
                "1. Stop this command",
                # Fix: the closing backtick around the command was missing.
                f"2. Run `lightning delete cluster {cluster_id}`",
                "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.",
            ]
        )
    elif current_state == V1ClusterState.DELETED:
        message = f"Cluster {cluster_id} has been deleted."

    click.echo(message)

src/lightning_app/cli/lightning_cli_create.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def create() -> None:
3333
type=bool,
3434
required=False,
3535
default=False,
36+
hidden=True,
3637
is_flag=True,
3738
help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance.
3839
This makes runs more expensive but start-up times decrease.""",
@@ -41,16 +42,17 @@ def create() -> None:
4142
"--edit-before-creation",
4243
default=False,
4344
is_flag=True,
45+
hidden=True,
4446
help="Edit the cluster specs before submitting them to the API server.",
4547
)
4648
@click.option(
47-
"--wait",
48-
"wait",
49+
"--async",
50+
"no_wait",
4951
type=bool,
5052
required=False,
5153
default=False,
5254
is_flag=True,
53-
help="Enabling this flag makes the CLI wait until the cluster is running.",
55+
help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.",
5456
)
5557
def create_cluster(
5658
cluster_name: str,
@@ -60,7 +62,7 @@ def create_cluster(
6062
provider: str,
6163
edit_before_creation: bool,
6264
enable_performance: bool,
63-
wait: bool,
65+
no_wait: bool,
6466
**kwargs: Any,
6567
) -> None:
6668
"""Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
@@ -75,5 +77,5 @@ def create_cluster(
7577
external_id=external_id,
7678
edit_before_creation=edit_before_creation,
7779
cost_savings=not enable_performance,
78-
wait=wait,
80+
wait=not no_wait,
7981
)

src/lightning_app/cli/lightning_cli_delete.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ def delete() -> None:
2424
WARNING: You should NOT use this under normal circumstances.""",
2525
)
2626
@click.option(
27-
"--wait",
28-
"wait",
27+
"--no-wait",
28+
"no_wait",
2929
type=bool,
3030
required=False,
3131
default=False,
3232
is_flag=True,
33-
help="Enabling this flag makes the CLI wait until the cluster is deleted.",
33+
help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background",
3434
)
35-
def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> None:
35+
def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> None:
3636
"""Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources.
3737
3838
Deleting a cluster also deletes all Runs and Experiments that were started on the cluster.
@@ -46,4 +46,4 @@ def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> Non
4646
All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered.
4747
"""
4848
cluster_manager = AWSClusterManager()
49-
cluster_manager.delete(cluster_id=cluster, force=force, wait=wait)
49+
cluster_manager.delete(cluster_id=cluster, force=force, wait=not no_wait)

tests/tests_app/cli/test_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte
130130
external_id="dummy",
131131
edit_before_creation=False,
132132
cost_savings=expected_cost_savings_mode,
133-
wait=False,
133+
wait=True,
134134
)
135135

136136

@@ -158,7 +158,7 @@ def test_delete_cluster(delete: mock.MagicMock):
158158
runner = CliRunner()
159159
runner.invoke(delete_cluster, ["test-7"])
160160

161-
delete.assert_called_once_with(cluster_id="test-7", force=False, wait=False)
161+
delete.assert_called_once_with(cluster_id="test-7", force=False, wait=True)
162162

163163

164164
@mock.patch("lightning_app.utilities.login.Auth._run_server")

0 commit comments

Comments
 (0)