Skip to content

Commit 488c2ac

Browse files
authored
Prevent bug when launching apps on multiple clusters (#15226)
Fixes a bug when cross-launching an app between clusters. Currently the platform does not allow running multiple instances of the same app. If you have `app-1` running on `cluster-1` and try to run it on `cluster-2`, the CLI will succeed but the app will never start. This PR prevents that mismatch: the app should not be uploaded / released if it won't run. Instead, an error is presented to the user explaining what happened and how to proceed (specify a different `--name`, e.g. `app-2`). Once the platform supports multiple app instances / running individual apps on multiple clusters, this PR can be reverted.
1 parent 8b4d71c commit 488c2ac

File tree

3 files changed

+84
-13
lines changed

3 files changed

+84
-13
lines changed

src/lightning_app/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
2727

2828
- Fixed an issue when using the CLI without arguments ([#14877](https://github.com/Lightning-AI/lightning/pull/14877))
2929
- Fixed a bug where the upload files endpoint would raise an error when running locally ([#14924](https://github.com/Lightning-AI/lightning/pull/14924))
30+
- Fixed a bug when launching an app on multiple clusters ([#15226](https://github.com/Lightning-AI/lightning/pull/15226))
3031

3132
## [0.6.2] - 2022-09-21
3233

src/lightning_app/runners/cloud.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -269,19 +269,6 @@ def dispatch(
269269
if cluster_id is not None:
270270
self._ensure_cluster_project_binding(project.project_id, cluster_id)
271271

272-
lightning_app_release = self.backend.client.lightningapp_v2_service_create_lightningapp_release(
273-
project_id=project.project_id, app_id=lit_app.id, body=release_body
274-
)
275-
276-
if cluster_id is not None:
277-
logger.info(f"running app on {lightning_app_release.cluster_id}")
278-
279-
if lightning_app_release.source_upload_url == "":
280-
raise RuntimeError("The source upload url is empty.")
281-
282-
repo.package()
283-
repo.upload(url=lightning_app_release.source_upload_url)
284-
285272
# check if user has sufficient credits to run an app
286273
# if so, set the desired state to running; otherwise, create the app in stopped state,
287274
# and open the admin UI to add credits and run the app.
@@ -305,6 +292,15 @@ def dispatch(
305292

306293
if find_instances_resp.lightningapps:
307294
existing_instance = find_instances_resp.lightningapps[0]
295+
296+
# TODO: support multiple instances / 1 instance per cluster
297+
if existing_instance.spec.cluster_id != cluster_id:
298+
raise ValueError(
299+
f"Can not start app '{name}' on cluster '{cluster_id}' "
300+
f"since this app already exists on '{existing_instance.spec.cluster_id}'. "
301+
"To run it on another cluster, give it a new name with the --name option."
302+
)
303+
308304
if existing_instance.status.phase != V1LightningappInstanceState.STOPPED:
309305
# TODO(yurij): Implement release switching in the UI and remove this
310306
# We can only switch release of the stopped instance
@@ -324,6 +320,21 @@ def dispatch(
324320
if existing_instance.status.phase != V1LightningappInstanceState.STOPPED:
325321
raise RuntimeError("Failed to stop the existing instance.")
326322

323+
# create / upload the new app release / instance
324+
lightning_app_release = self.backend.client.lightningapp_v2_service_create_lightningapp_release(
325+
project_id=project.project_id, app_id=lit_app.id, body=release_body
326+
)
327+
328+
if cluster_id is not None:
329+
logger.info(f"running app on {lightning_app_release.cluster_id}")
330+
331+
if lightning_app_release.source_upload_url == "":
332+
raise RuntimeError("The source upload url is empty.")
333+
334+
repo.package()
335+
repo.upload(url=lightning_app_release.source_upload_url)
336+
337+
if find_instances_resp.lightningapps:
327338
lightning_app_instance = (
328339
self.backend.client.lightningapp_instance_service_update_lightningapp_instance_release(
329340
project_id=project.project_id,

tests/tests_app/runners/test_cloud.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,60 @@ def run(self):
6868
class TestAppCreationClient:
6969
"""Testing the calls made using GridRestClient to create the app."""
7070

71+
# TODO: remove this test once there is support for multiple instances
72+
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
73+
def test_new_instance_on_different_cluster_fails(self, monkeypatch):
74+
app_name = "test-app-name"
75+
original_cluster = "cluster-001"
76+
new_cluster = "cluster-002"
77+
78+
mock_client = mock.MagicMock()
79+
mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse(
80+
memberships=[V1Membership(name="Default Project", project_id="default-project-id")]
81+
)
82+
83+
cloud_backend = mock.MagicMock()
84+
cloud_backend.client = mock_client
85+
monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
86+
monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
87+
monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))
88+
89+
app = mock.MagicMock()
90+
app.flows = []
91+
app.frontend = {}
92+
93+
existing_instance = MagicMock()
94+
existing_instance.status.phase = V1LightningappInstanceState.STOPPED
95+
existing_instance.spec.cluster_id = original_cluster
96+
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
97+
V1ListLightningappInstancesResponse(lightningapps=[existing_instance])
98+
)
99+
100+
cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py")
101+
cloud_runtime._check_uploaded_folder = mock.MagicMock()
102+
103+
# without requirements file
104+
# setting is_file to False so requirements.txt existence check will return False
105+
monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False)
106+
monkeypatch.setattr(cloud, "Path", Path)
107+
108+
# This is the main assertion:
109+
# we have an existing instance on `cluster-001`
110+
# but we want to run this app on `cluster-002`
111+
with pytest.raises(ValueError) as exc:
112+
cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster)
113+
114+
assert exc.match(
115+
f"Can not start app '{app_name}' on cluster '{new_cluster}' "
116+
f"since this app already exists on '{original_cluster}'. "
117+
"To run it on another cluster, give it a new name with the --name option."
118+
)
119+
cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_not_called()
120+
cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with(
121+
project_id="default-project-id",
122+
body=V1ProjectClusterBinding(cluster_id=new_cluster, project_id="default-project-id"),
123+
)
124+
71125
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
72126
def test_run_with_custom_flow_compute_config(self, monkeypatch):
73127
mock_client = mock.MagicMock()
@@ -265,6 +319,7 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir):
265319
mock_client = mock.MagicMock()
266320
if lightningapps:
267321
lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
322+
lightningapps[0].spec.cluster_id = None
268323
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
269324
V1ListLightningappInstancesResponse(lightningapps=lightningapps)
270325
)
@@ -420,6 +475,7 @@ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch
420475
mock_client = mock.MagicMock()
421476
if lightningapps:
422477
lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
478+
lightningapps[0].spec.cluster_id = None
423479
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
424480
V1ListLightningappInstancesResponse(lightningapps=lightningapps)
425481
)
@@ -547,6 +603,7 @@ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, mo
547603
mock_client = mock.MagicMock()
548604
if lightningapps:
549605
lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
606+
lightningapps[0].spec.cluster_id = None
550607
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
551608
V1ListLightningappInstancesResponse(lightningapps=lightningapps)
552609
)
@@ -737,6 +794,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
737794
mock_client = mock.MagicMock()
738795
if lightningapps:
739796
lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
797+
lightningapps[0].spec.cluster_id = None
740798
mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
741799
V1ListLightningappInstancesResponse(lightningapps=lightningapps)
742800
)
@@ -747,6 +805,7 @@ def test_call_with_work_app_and_attached_mount_and_drive(self, lightningapps, mo
747805
)
748806
existing_instance = MagicMock()
749807
existing_instance.status.phase = V1LightningappInstanceState.STOPPED
808+
existing_instance.spec.cluster_id = None
750809
mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance)
751810
cloud_backend = mock.MagicMock()
752811
cloud_backend.client = mock_client

0 commit comments

Comments
 (0)