Skip to content
Merged
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
ac3495c
e2e test skeleton for cluster CLI
nicolai86 Jul 20, 2022
e5b85a3
implement skeleton of cluster CLI
nicolai86 Jul 20, 2022
aff7ec2
working cluster listing
nicolai86 Jul 22, 2022
ce34ff0
migrate cluster creation
nicolai86 Jul 22, 2022
8a72873
add cluster deletion code
nicolai86 Jul 22, 2022
8d4d88c
migrate some cluster configuration into cmd_clusters
nicolai86 Jul 22, 2022
bd1eccb
refactor waiting for cluster state to be configurable
nicolai86 Jul 22, 2022
5941876
add basic unittests for wait for cluster state
nicolai86 Jul 22, 2022
483b993
add basic unit tests for cluster name
nicolai86 Jul 22, 2022
81b0706
add unit tests for cluster mgmt api
nicolai86 Jul 22, 2022
362c470
documentation update
nicolai86 Jul 25, 2022
cabf617
adjust wording
nicolai86 Jul 25, 2022
6378481
Merge branch 'master' into nicolai86/cluster-management
nicolai86 Jul 25, 2022
b23164d
change CLI to <lightning> <verb> <object>
nicolai86 Jul 25, 2022
ac8f0b5
more wording changes
nicolai86 Jul 25, 2022
fc5a1e6
adjust e2e tests
nicolai86 Jul 25, 2022
9330c85
refactor command names to be more uniform
nicolai86 Jul 25, 2022
d8074ec
update CHANGELOG
nicolai86 Jul 25, 2022
99355c3
fix tests not working when un-authenticated
nicolai86 Jul 26, 2022
e02e143
Update src/lightning_app/CHANGELOG.md
nicolai86 Jul 26, 2022
d1ad3c2
wording update
nicolai86 Jul 26, 2022
459bbfa
Merge branch 'master' into nicolai86/cluster-management
nicolai86 Jul 26, 2022
ef6748b
drop dependency on arrow
nicolai86 Jul 26, 2022
911411b
doc strings and variable renaming
nicolai86 Jul 26, 2022
1ad8438
add api unit tests
nicolai86 Jul 26, 2022
397131d
dropping environment variables not relevant to CLI
nicolai86 Jul 26, 2022
9ce60e1
split up CLI
nicolai86 Jul 26, 2022
a78e6b6
do not print raw cluster response
nicolai86 Jul 26, 2022
d60d28a
Merge branch 'master' into nicolai86/cluster-management
nicolai86 Jul 27, 2022
7766a7a
Merge branch 'master' into nicolai86/cluster-management
nicolai86 Jul 27, 2022
1597cfd
properly import commands from separate files
nicolai86 Jul 28, 2022
3da681d
Merge branch 'master' into nicolai86/cluster-management
nicolai86 Jul 28, 2022
e9b3900
PR feedback from jirka, thomas
nicolai86 Jul 28, 2022
9aa1b26
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 28, 2022
3e6e1e5
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
93ee4aa
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
54a3969
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
f1d9d4a
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
c85e301
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
a0489d6
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
164a63a
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
58046c5
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
98346b4
Update src/lightning_app/cli/cmd_clusters.py
nicolai86 Jul 28, 2022
4b49287
Update src/lightning_app/utilities/openapi.py
nicolai86 Jul 28, 2022
f50670d
Update src/lightning_app/utilities/openapi.py
nicolai86 Jul 28, 2022
22868f3
Update src/lightning_app/utilities/openapi.py
nicolai86 Jul 28, 2022
1b8c19a
Update src/lightning_app/utilities/openapi.py
nicolai86 Jul 28, 2022
37fd580
more python comment style changes
nicolai86 Jul 28, 2022
1832d9d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 28, 2022
a6d4323
remove instance types list
nicolai86 Jul 29, 2022
b89ff6e
Merge branch 'master' into nicolai86/cluster-management
awaelchli Aug 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/lightning_app/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added

- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602))
- Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835))

### Changed

Expand Down
240 changes: 240 additions & 0 deletions src/lightning_app/cli/cmd_clusters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import json
import re
import time
from datetime import datetime

import click
from lightning_cloud.openapi import (
V1AWSClusterDriverSpec,
V1ClusterDriver,
V1ClusterPerformanceProfile,
V1ClusterSpec,
V1CreateClusterRequest,
V1InstanceSpec,
V1KubernetesClusterDriver,
)
from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterType
from rich.console import Console
from rich.table import Table
from rich.text import Text

from lightning_app.cli.core import Formatable
from lightning_app.utilities.network import LightningClient
from lightning_app.utilities.openapi import create_openapi_object, string2dict

CLUSTER_STATE_CHECKING_TIMEOUT = 60
MAX_CLUSTER_WAIT_TIME = 5400


class AWSClusterManager:
"""AWSClusterManager implements API calls specific to Lightning AI BYOC compute clusters when the AWS provider
is selected as the backend compute."""

def __init__(self):
self.api_client = LightningClient()

def create(
self,
cost_savings: bool = False,
cluster_name: str = None,
role_arn: str = None,
region: str = "us-east-1",
external_id: str = None,
instance_types: [str] = [],
edit_before_creation: bool = False,
wait: bool = False,
):
"""request Lightning AI BYOC compute cluster creation.

Args:
cost_savings: Specifies if the cluster uses cost savings mode
cluster_name: The name of the cluster to be created
role_arn: AWS IAM Role ARN used to provision resources
region: AWS region containing compute resources
external_id: AWS IAM Role external ID
instance_types: AWS instance types supported by the cluster
edit_before_creation: Enables interactive editing of requests before submitting it to Lightning AI.
wait: Waits for the cluster to be in a RUNNING state. Only use this for debugging.
"""
performance_profile = V1ClusterPerformanceProfile.DEFAULT
if cost_savings:
"""In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters
with low utilization."""
performance_profile = V1ClusterPerformanceProfile.COST_SAVING

body = V1CreateClusterRequest(
name=cluster_name,
spec=V1ClusterSpec(
cluster_type=V1ClusterType.BYOC,
performance_profile=performance_profile,
driver=V1ClusterDriver(
kubernetes=V1KubernetesClusterDriver(
aws=V1AWSClusterDriverSpec(
region=region,
role_arn=role_arn,
external_id=external_id,
instance_types=[V1InstanceSpec(name=x) for x in instance_types],
)
)
),
),
)
new_body = body
if edit_before_creation:
after = click.edit(json.dumps(body.to_dict(), indent=4))
if after is not None:
new_body = create_openapi_object(string2dict(after), body)
if new_body == body:
click.echo("cluster unchanged")

resp = self.api_client.cluster_service_create_cluster(body=new_body)
if wait:
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)

click.echo(f"${resp.id} cluster is ${resp.status.phase}")

def list(self):
resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED])
console = Console()
console.print(ClusterList(resp.clusters).as_table())

def delete(self, cluster_id: str = None, force: bool = False, wait: bool = False):
if force:
click.echo(
"""
Deletes a BYOC cluster. Lightning AI removes cluster artifacts and any resources running on the cluster.\n
WARNING: Deleting a cluster does not clean up any resources managed by Lightning AI.\n
Check your cloud provider to verify that existing cloud resources are deleted.
"""
)
click.confirm("Do you want to continue?", abort=True)

self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force)
click.echo("Cluster deletion triggered successfully")

if wait:
_wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)


class ClusterList(Formatable):
def __init__(self, clusters: [Externalv1Cluster]):
self.clusters = clusters

def as_json(self) -> str:
return json.dumps(self.clusters)

def as_table(self) -> Table:
table = Table("id", "name", "type", "status", "created", show_header=True, header_style="bold green")
phases = {
V1ClusterState.QUEUED: Text("queued", style="bold yellow"),
V1ClusterState.PENDING: Text("pending", style="bold yellow"),
V1ClusterState.RUNNING: Text("running", style="bold green"),
V1ClusterState.FAILED: Text("failed", style="bold red"),
V1ClusterState.DELETED: Text("deleted", style="bold red"),
}

cluster_type_lookup = {
V1ClusterType.BYOC: Text("byoc", style="bold yellow"),
V1ClusterType.GLOBAL: Text("lightning-cloud", style="bold green"),
}
for cluster in self.clusters:
cluster: Externalv1Cluster
status = phases[cluster.status.phase]
if cluster.spec.desired_state == V1ClusterState.DELETED and cluster.status.phase != V1ClusterState.DELETED:
status = Text("terminating", style="bold red")

# this guard is necessary only until 0.3.93 releases which includes the `created_at`
# field to the external API
created_at = datetime.now()
if hasattr(cluster, "created_at"):
created_at = cluster.created_at

table.add_row(
cluster.id,
cluster.name,
cluster_type_lookup.get(cluster.spec.cluster_type, Text("unknown", style="red")),
status,
created_at.strftime("%Y-%m-%d") if created_at else "",
)
return table


def _wait_for_cluster_state(
api_client: LightningClient,
cluster_id: str,
target_state: V1ClusterState,
max_wait_time: int = MAX_CLUSTER_WAIT_TIME,
check_timeout: int = CLUSTER_STATE_CHECKING_TIMEOUT,
):
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.

Args:
api_client: LightningClient used for polling
cluster_id: Specifies the cluster to wait for
target_state: Specifies the desired state the target cluster needs to meet
max_wait_time: Maximum duration to wait (in seconds)
check_timeout: duration between polling for the cluster state (in seconds)
"""
start = time.time()
elapsed = 0
while elapsed < max_wait_time:
cluster_resp = api_client.cluster_service_list_clusters()
new_cluster = None
for clust in cluster_resp.clusters:
if clust.id == cluster_id:
new_cluster = clust
break
if new_cluster is not None:
if new_cluster.status.phase == target_state:
break
elif new_cluster.status.phase == V1ClusterState.FAILED:
raise click.ClickException(f"Cluster {cluster_id} is in failed state.")
time.sleep(check_timeout)
elapsed = time.time() - start
else:
raise click.ClickException("Max wait time elapsed")


def _check_cluster_name_is_valid(_ctx, _param, value):
pattern = r"^(?!-)[a-z0-9-]{1,63}(?<!-)$"
if not re.match(pattern, value):
raise click.ClickException(
"""The cluster name is invalid.
Cluster names can only contain lowercase letters, numbers, and periodic hyphens ( - ).
Provide a cluster name using valid characters and try again."""
)
return value


_default_instance_types = frozenset(
[
"g2.8xlarge",
"g3.16xlarge",
"g3.4xlarge",
"g3.8xlarge",
"g3s.xlarge",
"g4dn.12xlarge",
"g4dn.16xlarge",
"g4dn.2xlarge",
"g4dn.4xlarge",
"g4dn.8xlarge",
"g4dn.metal",
"g4dn.xlarge",
"p2.16xlarge",
"p2.8xlarge",
"p2.xlarge",
"p3.16xlarge",
"p3.2xlarge",
"p3.8xlarge",
"p3dn.24xlarge",
# "p4d.24xlarge", # currently not supported
"t2.large",
"t2.medium",
"t2.xlarge",
"t2.2xlarge",
"t3.large",
"t3.medium",
"t3.xlarge",
"t3.2xlarge",
]
)
13 changes: 13 additions & 0 deletions src/lightning_app/cli/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import abc

from rich.table import Table


class Formatable(abc.ABC):
@abc.abstractmethod
def as_table(self) -> Table:
pass

@abc.abstractmethod
def as_json(self) -> str:
pass
16 changes: 6 additions & 10 deletions src/lightning_app/cli/lightning_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

from lightning_app import __version__ as ver
from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init
from lightning_app.cli.lightning_cli_create import create
from lightning_app.cli.lightning_cli_delete import delete
from lightning_app.cli.lightning_cli_list import get_list
from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW
from lightning_app.runners.runtime import dispatch
from lightning_app.runners.runtime_type import RuntimeType
Expand Down Expand Up @@ -206,16 +209,9 @@ def stop():
pass


@_main.group(hidden=True)
def delete():
"""Delete an application."""
pass


@_main.group(name="list", hidden=True)
def get_list():
"""List your applications."""
pass
_main.add_command(get_list)
_main.add_command(delete)
_main.add_command(create)


@_main.group()
Expand Down
86 changes: 86 additions & 0 deletions src/lightning_app/cli/lightning_cli_create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import click

from lightning_app.cli.cmd_clusters import _check_cluster_name_is_valid, _default_instance_types, AWSClusterManager


@click.group("create")
def create():
"""Create Lightning AI BYOC managed resources."""
pass


@create.command("cluster")
@click.argument("cluster_name", callback=_check_cluster_name_is_valid)
@click.option("--provider", "provider", type=str, default="aws", help="cloud provider to be used for your cluster")
@click.option("--external-id", "external_id", type=str, required=True)
@click.option(
"--role-arn", "role_arn", type=str, required=True, help="AWS role ARN attached to the associated resources."
)
@click.option(
"--region",
"region",
type=str,
required=False,
default="us-east-1",
help="AWS region that is used to host the associated resources.",
)
@click.option(
"--instance-types",
"instance_types",
type=str,
required=False,
default=",".join(_default_instance_types),
help="Instance types that you want to support, for computer jobs within the cluster.",
)
@click.option(
"--cost-savings",
"cost_savings",
type=bool,
required=False,
default=False,
is_flag=True,
help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings.
This makes runs cheaper but start-up times may increase.""",
)
@click.option(
"--edit-before-creation",
default=False,
is_flag=True,
help="Edit the cluster specs before submitting them to the API server.",
)
@click.option(
"--wait",
"wait",
type=bool,
required=False,
default=False,
is_flag=True,
help="Enabling this flag makes the CLI wait until the cluster is running.",
)
def create_cluster(
cluster_name: str,
region: str,
role_arn: str,
external_id: str,
provider: str,
instance_types: str,
edit_before_creation: bool,
cost_savings: bool,
wait: bool,
**kwargs,
):
"""Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
if provider != "aws":
click.echo("Only AWS is supported for now. But support for more providers is coming soon.")
return
cluster_manager = AWSClusterManager()
cluster_manager.create(
cluster_name=cluster_name,
region=region,
role_arn=role_arn,
external_id=external_id,
instance_types=instance_types.split(","),
edit_before_creation=edit_before_creation,
cost_savings=cost_savings,
wait=wait,
)
Loading