Skip to content

Commit 2919dcf

Browse files
authored
[CLI] add support for cluster management (#13835)
1 parent b3203d9 commit 2919dcf

File tree

13 files changed

+707
-11
lines changed

13 files changed

+707
-11
lines changed

src/lightning_app/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
99
### Added
1010

1111
- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602))
12+
- Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835))
1213

1314
- Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830))
1415

src/lightning_app/cli/cmd_clusters.py

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
import json
2+
import re
3+
import time
4+
from datetime import datetime
5+
6+
import click
7+
from lightning_cloud.openapi import (
8+
V1AWSClusterDriverSpec,
9+
V1ClusterDriver,
10+
V1ClusterPerformanceProfile,
11+
V1ClusterSpec,
12+
V1CreateClusterRequest,
13+
V1InstanceSpec,
14+
V1KubernetesClusterDriver,
15+
)
16+
from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterType
17+
from rich.console import Console
18+
from rich.table import Table
19+
from rich.text import Text
20+
21+
from lightning_app.cli.core import Formatable
22+
from lightning_app.utilities.network import LightningClient
23+
from lightning_app.utilities.openapi import create_openapi_object, string2dict
24+
25+
# Default pause, in seconds, between two consecutive cluster-state polls
# (used as the default of `_wait_for_cluster_state(check_timeout=...)`).
CLUSTER_STATE_CHECKING_TIMEOUT = 60
26+
# Default upper bound, in seconds (5400 s = 90 min), on how long
# `_wait_for_cluster_state` keeps polling before giving up.
MAX_CLUSTER_WAIT_TIME = 5400
27+
28+
29+
class AWSClusterManager:
    """AWSClusterManager implements API calls specific to Lightning AI BYOC compute clusters when the AWS provider
    is selected as the backend compute."""

    def __init__(self):
        self.api_client = LightningClient()

    def create(
        self,
        cost_savings: bool = False,
        cluster_name: str = None,
        role_arn: str = None,
        region: str = "us-east-1",
        external_id: str = None,
        instance_types: [str] = None,
        edit_before_creation: bool = False,
        wait: bool = False,
    ):
        """Request Lightning AI BYOC compute cluster creation.

        Args:
            cost_savings: Specifies if the cluster uses cost savings mode
            cluster_name: The name of the cluster to be created
            role_arn: AWS IAM Role ARN used to provision resources
            region: AWS region containing compute resources
            external_id: AWS IAM Role external ID
            instance_types: AWS instance types supported by the cluster. ``None`` (the default) is treated
                as an empty list.
            edit_before_creation: Enables interactive editing of requests before submitting it to Lightning AI.
            wait: Waits for the cluster to be in a RUNNING state. Only use this for debugging.
        """
        # A fresh list per call: a `[]` default would be one list object shared by every call.
        instance_types = [] if instance_types is None else instance_types

        # In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters
        # with low utilization.
        performance_profile = (
            V1ClusterPerformanceProfile.COST_SAVING if cost_savings else V1ClusterPerformanceProfile.DEFAULT
        )

        body = V1CreateClusterRequest(
            name=cluster_name,
            spec=V1ClusterSpec(
                cluster_type=V1ClusterType.BYOC,
                performance_profile=performance_profile,
                driver=V1ClusterDriver(
                    kubernetes=V1KubernetesClusterDriver(
                        aws=V1AWSClusterDriverSpec(
                            region=region,
                            role_arn=role_arn,
                            external_id=external_id,
                            instance_types=[V1InstanceSpec(name=x) for x in instance_types],
                        )
                    )
                ),
            ),
        )
        new_body = body
        if edit_before_creation:
            # `click.edit` returns None when the editor is closed without saving; the request is then
            # submitted as originally built.
            after = click.edit(json.dumps(body.to_dict(), indent=4))
            if after is not None:
                new_body = create_openapi_object(string2dict(after), body)
            if new_body == body:
                click.echo("cluster unchanged")

        resp = self.api_client.cluster_service_create_cluster(body=new_body)
        if wait:
            _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)

        # NOTE(review): `resp` is the creation response, so after `wait` the phase printed here may be
        # older than the state that was just waited for - confirm whether a refresh is wanted.
        click.echo(f"{resp.id} cluster is {resp.status.phase}")

    def list(self):
        """Print a table of all clusters whose phase is not DELETED."""
        resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED])
        console = Console()
        console.print(ClusterList(resp.clusters).as_table())

    def delete(self, cluster_id: str = None, force: bool = False, wait: bool = False):
        """Request deletion of a cluster.

        Args:
            cluster_id: Identifier of the cluster to delete
            force: Forwarded to the API as ``force``. When set, a warning is printed and the user has to
                confirm interactively; declining aborts the command.
            wait: Waits until the cluster has reached the DELETED state
        """
        if force:
            click.echo(
                """
            Deletes a BYOC cluster. Lightning AI removes cluster artifacts and any resources running on the cluster.\n
            WARNING: Deleting a cluster does not clean up any resources managed by Lightning AI.\n
            Check your cloud provider to verify that existing cloud resources are deleted.
            """
            )
            click.confirm("Do you want to continue?", abort=True)

        self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force)
        click.echo("Cluster deletion triggered successfully")

        if wait:
            _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)
117+
118+
119+
class ClusterList(Formatable):
    """Renders a list of clusters either as JSON or as a `rich` table."""

    def __init__(self, clusters: [Externalv1Cluster]):
        self.clusters = clusters

    def as_json(self) -> str:
        """Return the clusters serialized as a JSON array."""
        # Model objects are not JSON serializable themselves, so convert each one to a plain dict first.
        # NOTE(review): assumes `Externalv1Cluster` offers `to_dict()` like the other generated openapi
        # models used in this module - confirm. `default=str` covers values such as datetimes.
        return json.dumps([cluster.to_dict() for cluster in self.clusters], default=str)

    def as_table(self) -> Table:
        """Return a table with one row (id, name, type, status, created) per cluster."""
        table = Table("id", "name", "type", "status", "created", show_header=True, header_style="bold green")
        phases = {
            V1ClusterState.QUEUED: Text("queued", style="bold yellow"),
            V1ClusterState.PENDING: Text("pending", style="bold yellow"),
            V1ClusterState.RUNNING: Text("running", style="bold green"),
            V1ClusterState.FAILED: Text("failed", style="bold red"),
            V1ClusterState.DELETED: Text("deleted", style="bold red"),
        }

        cluster_type_lookup = {
            V1ClusterType.BYOC: Text("byoc", style="bold yellow"),
            V1ClusterType.GLOBAL: Text("lightning-cloud", style="bold green"),
        }
        for cluster in self.clusters:
            # Fall back to "unknown" rather than raising KeyError when the API reports a phase that is
            # not listed above; this mirrors the cluster type lookup below.
            status = phases.get(cluster.status.phase, Text("unknown", style="red"))
            # A cluster that is requested to be deleted but has not reached DELETED yet is shown as
            # "terminating".
            if cluster.spec.desired_state == V1ClusterState.DELETED and cluster.status.phase != V1ClusterState.DELETED:
                status = Text("terminating", style="bold red")

            # this guard is necessary only until 0.3.93 releases which includes the `created_at`
            # field to the external API
            created_at = datetime.now()
            if hasattr(cluster, "created_at"):
                created_at = cluster.created_at

            table.add_row(
                cluster.id,
                cluster.name,
                cluster_type_lookup.get(cluster.spec.cluster_type, Text("unknown", style="red")),
                status,
                created_at.strftime("%Y-%m-%d") if created_at else "",
            )
        return table
160+
161+
162+
def _wait_for_cluster_state(
    api_client: LightningClient,
    cluster_id: str,
    target_state: V1ClusterState,
    max_wait_time: int = MAX_CLUSTER_WAIT_TIME,
    check_timeout: int = CLUSTER_STATE_CHECKING_TIMEOUT,
):
    """Poll the cluster list until the given cluster reaches ``target_state``.

    Args:
        api_client: LightningClient used for polling
        cluster_id: Specifies the cluster to wait for
        target_state: Specifies the desired state the target cluster needs to meet
        max_wait_time: Maximum duration to wait (in seconds)
        check_timeout: duration between polling for the cluster state (in seconds)

    Raises:
        click.ClickException: If the cluster is found in the FAILED state, or if ``max_wait_time``
            elapses before the target state is observed.
    """
    started_at = time.time()
    waited = 0
    while waited < max_wait_time:
        listing = api_client.cluster_service_list_clusters()
        # First cluster with a matching id, or None when it is not part of the listing.
        found = next((candidate for candidate in listing.clusters if candidate.id == cluster_id), None)
        if found is not None:
            if found.status.phase == target_state:
                return
            if found.status.phase == V1ClusterState.FAILED:
                raise click.ClickException(f"Cluster {cluster_id} is in failed state.")
        time.sleep(check_timeout)
        waited = time.time() - started_at

    # Only reached when the loop condition turned false, i.e. the time budget is used up.
    raise click.ClickException("Max wait time elapsed")
196+
197+
198+
def _check_cluster_name_is_valid(_ctx, _param, value):
199+
pattern = r"^(?!-)[a-z0-9-]{1,63}(?<!-)$"
200+
if not re.match(pattern, value):
201+
raise click.ClickException(
202+
"""The cluster name is invalid.
203+
Cluster names can only contain lowercase letters, numbers, and periodic hyphens ( - ).
204+
Provide a cluster name using valid characters and try again."""
205+
)
206+
return value

src/lightning_app/cli/core.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import abc
2+
3+
from rich.table import Table
4+
5+
6+
class Formatable(abc.ABC):
    """Interface for objects that can be rendered both as a `rich` table and as a JSON string."""

    @abc.abstractmethod
    def as_table(self) -> Table:
        """Return the object rendered as a table."""

    @abc.abstractmethod
    def as_json(self) -> str:
        """Return the object rendered as a JSON string."""

src/lightning_app/cli/lightning_cli.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212

1313
from lightning_app import __version__ as ver
1414
from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init
15+
from lightning_app.cli.lightning_cli_create import create
16+
from lightning_app.cli.lightning_cli_delete import delete
17+
from lightning_app.cli.lightning_cli_list import get_list
1518
from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW
1619
from lightning_app.runners.runtime import dispatch
1720
from lightning_app.runners.runtime_type import RuntimeType
@@ -206,16 +209,9 @@ def stop():
206209
pass
207210

208211

209-
@_main.group(hidden=True)
210-
def delete():
211-
"""Delete an application."""
212-
pass
213-
214-
215-
@_main.group(name="list", hidden=True)
216-
def get_list():
217-
"""List your applications."""
218-
pass
212+
_main.add_command(get_list)
213+
_main.add_command(delete)
214+
_main.add_command(create)
219215

220216

221217
@_main.group()
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import click
2+
3+
from lightning_app.cli.cmd_clusters import _check_cluster_name_is_valid, AWSClusterManager
4+
5+
6+
# Command group only: the actual work is done by the sub-commands registered on it.
@click.group(name="create")
def create():
    """Create Lightning AI BYOC managed resources."""
10+
11+
12+
@create.command("cluster")
@click.argument("cluster_name", callback=_check_cluster_name_is_valid)
@click.option("--provider", "provider", type=str, default="aws", help="cloud provider to be used for your cluster")
@click.option("--external-id", "external_id", type=str, required=True)
@click.option(
    "--role-arn", "role_arn", type=str, required=True, help="AWS role ARN attached to the associated resources."
)
@click.option(
    "--region",
    "region",
    type=str,
    required=False,
    default="us-east-1",
    help="AWS region that is used to host the associated resources.",
)
@click.option(
    "--instance-types",
    "instance_types",
    type=str,
    required=False,
    default=None,
    help="Comma-separated instance types that you want to support, for compute jobs within the cluster.",
)
@click.option(
    "--cost-savings",
    "cost_savings",
    type=bool,
    required=False,
    default=False,
    is_flag=True,
    help="""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings.
    This makes runs cheaper but start-up times may increase.""",
)
@click.option(
    "--edit-before-creation",
    default=False,
    is_flag=True,
    help="Edit the cluster specs before submitting them to the API server.",
)
@click.option(
    "--wait",
    "wait",
    type=bool,
    required=False,
    default=False,
    is_flag=True,
    help="Enabling this flag makes the CLI wait until the cluster is running.",
)
def create_cluster(
    cluster_name: str,
    region: str,
    role_arn: str,
    external_id: str,
    provider: str,
    instance_types: str,
    edit_before_creation: bool,
    cost_savings: bool,
    wait: bool,
    **kwargs,
):
    """Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
    if provider != "aws":
        click.echo("Only AWS is supported for now. But support for more providers is coming soon.")
        return

    # `--instance-types` is optional and defaults to None, so calling `.split` on it unconditionally
    # crashed whenever the option was omitted. Blank entries ("a,,b", trailing comma) are dropped.
    requested_types = []
    if instance_types:
        requested_types = [name.strip() for name in instance_types.split(",") if name.strip()]

    cluster_manager = AWSClusterManager()
    cluster_manager.create(
        cluster_name=cluster_name,
        region=region,
        role_arn=role_arn,
        external_id=external_id,
        instance_types=requested_types,
        edit_before_creation=edit_before_creation,
        cost_savings=cost_savings,
        wait=wait,
    )
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import click
2+
3+
from lightning_app.cli.cmd_clusters import AWSClusterManager
4+
5+
6+
# Command group only: the actual work is done by the sub-commands registered on it.
@click.group(name="delete")
def delete():
    """Delete Lightning AI BYOC managed resources."""
10+
11+
12+
@delete.command("cluster")
@click.argument("cluster", type=str)
@click.option(
    "--force",
    "force",
    type=bool,
    required=False,
    default=False,
    is_flag=True,
    help="""Delete a BYOC cluster from Lightning AI. This does NOT delete any resources created by the cluster,
    it just removes the entry from Lightning AI.

    WARNING: You should NOT use this under normal circumstances.""",
)
@click.option(
    "--wait",
    "wait",
    type=bool,
    required=False,
    default=False,
    is_flag=True,
    help="Enabling this flag makes the CLI wait until the cluster is deleted.",
)
def delete_cluster(cluster: str, force: bool = False, wait: bool = False):
    """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources.

    Deleting a cluster also deletes all Runs and Experiments that were started on the cluster.
    Deletion permanently removes not only the record of all runs on a cluster, but all associated experiments,
    artifacts, metrics, logs, etc.

    WARNING: This process may take a few minutes to complete, but once started it CANNOT be rolled back.
    Deletion permanently removes not only the BYOC cluster from being managed by Lightning AI, but tears down
    every BYOC resource Lightning AI managed (for that cluster id) in the host cloud.

    All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered.
    """
    # The docstring above is what click shows as the command's help text.
    cluster_manager = AWSClusterManager()
    cluster_manager.delete(cluster_id=cluster, force=force, wait=wait)

0 commit comments

Comments
 (0)