Skip to content

Commit 51ec949

Browse files
tchatonthomas
andauthored
[App] Resolve some bugs from the Training Studio scaling (#16114)
Co-authored-by: thomas <[email protected]>
1 parent 8c265c5 commit 51ec949

23 files changed

+111
-35
lines changed

src/lightning_app/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
6060
- Fixed a bug where `AutoScaler` would fail with min_replica=0 ([#16092](https://github.com/Lightning-AI/lightning/pull/16092)
6161

6262

63+
- Fixed a non-thread safe deepcopy in the scheduler ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
64+
65+
- Fixed Http Queue sleeping for 1 sec by default if no delta were found ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))
66+
67+
6368
## [1.8.4] - 2022-12-08
6469

6570
### Added

src/lightning_app/core/api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@
2424
from lightning_app.api.http_methods import _HttpMethod
2525
from lightning_app.api.request_types import _DeltaRequest
2626
from lightning_app.core.constants import (
27-
CLOUD_QUEUE_TYPE,
2827
ENABLE_PULLING_STATE_ENDPOINT,
2928
ENABLE_PUSHING_STATE_ENDPOINT,
3029
ENABLE_STATE_WEBSOCKET,
3130
ENABLE_UPLOAD_ENDPOINT,
3231
FRONTEND_DIR,
32+
get_cloud_queue_type,
3333
)
3434
from lightning_app.core.queues import QueuingSystem
3535
from lightning_app.storage import Drive
@@ -350,7 +350,7 @@ async def healthz(response: Response):
350350
"""Health check endpoint used in the cloud FastAPI servers to check the status periodically."""
351351
# check the queue status only if running in cloud
352352
if is_running_in_cloud():
353-
queue_obj = QueuingSystem(CLOUD_QUEUE_TYPE).get_queue(queue_name="healthz")
353+
queue_obj = QueuingSystem(get_cloud_queue_type()).get_queue(queue_name="healthz")
354354
# this is only being implemented on Redis Queue. For HTTP Queue, it doesn't make sense to have every single
355355
# app checking the status of the Queue server
356356
if not queue_obj.is_running:

src/lightning_app/core/app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,8 @@ def _collect_deltas_from_ui_and_work_queues(self) -> List[Union[Delta, _APIReque
356356
deltas.append(delta)
357357
else:
358358
api_or_command_request_deltas.append(delta)
359+
else:
360+
break
359361

360362
if api_or_command_request_deltas:
361363
_process_requests(self, api_or_command_request_deltas)

src/lightning_app/core/constants.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
from pathlib import Path
3+
from typing import Optional
34

45
import lightning_cloud.env
56

@@ -13,7 +14,7 @@ def get_lightning_cloud_url() -> str:
1314

1415
SUPPORTED_PRIMITIVE_TYPES = (type(None), str, int, float, bool)
1516
STATE_UPDATE_TIMEOUT = 0.001
16-
STATE_ACCUMULATE_WAIT = 0.05
17+
STATE_ACCUMULATE_WAIT = 0.15
1718
# Duration in seconds of a moving average of a full flow execution
1819
# beyond which an exception is raised.
1920
FLOW_DURATION_THRESHOLD = 1.0
@@ -25,7 +26,6 @@ def get_lightning_cloud_url() -> str:
2526
APP_SERVER_PORT = _find_lit_app_port(7501)
2627
APP_STATE_MAX_SIZE_BYTES = 1024 * 1024 # 1 MB
2728

28-
CLOUD_QUEUE_TYPE = os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)
2929
WARNING_QUEUE_SIZE = 1000
3030
# different flag because queue debug can be very noisy, and almost always not useful unless debugging the queue itself.
3131
QUEUE_DEBUG_ENABLED = bool(int(os.getenv("LIGHTNING_QUEUE_DEBUG_ENABLED", "0")))
@@ -77,5 +77,9 @@ def enable_multiple_works_in_default_container() -> bool:
7777
return bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0")))
7878

7979

80+
def get_cloud_queue_type() -> Optional[str]:
81+
return os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)
82+
83+
8084
# Number of seconds to wait between filesystem checks when waiting for files in remote storage
8185
REMOTE_STORAGE_WAIT = 0.5

src/lightning_app/core/queues.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -364,12 +364,18 @@ def get(self, timeout: int = None) -> Any:
364364

365365
# timeout is some value - loop until the timeout is reached
366366
start_time = time.time()
367-
timeout += 0.1 # add 0.1 seconds as a safe margin
368367
while (time.time() - start_time) < timeout:
369368
try:
370369
return self._get()
371370
except queue.Empty:
372-
time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
371+
# Note: In theory, there isn't a need for a sleep as the queue shouldn't
372+
# block the flow if the queue is empty.
373+
# However, as the Http Server can saturate,
374+
# let's add a sleep here if a higher timeout is provided
375+
# than the default timeout
376+
if timeout > self.default_timeout:
377+
time.sleep(0.05)
378+
pass
373379

374380
def _get(self):
375381
resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "pop"})

src/lightning_app/runners/cloud.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@
5050
from lightning_app import LightningWork
5151
from lightning_app.core.app import LightningApp
5252
from lightning_app.core.constants import (
53-
CLOUD_QUEUE_TYPE,
5453
CLOUD_UPLOAD_WARNING,
5554
DEFAULT_NUMBER_OF_EXPOSED_PORTS,
5655
DISABLE_DEPENDENCY_CACHE,
@@ -60,6 +59,7 @@
6059
ENABLE_MULTIPLE_WORKS_IN_NON_DEFAULT_CONTAINER,
6160
ENABLE_PULLING_STATE_ENDPOINT,
6261
ENABLE_PUSHING_STATE_ENDPOINT,
62+
get_cloud_queue_type,
6363
get_lightning_cloud_url,
6464
)
6565
from lightning_app.runners.backends.cloud import CloudBackend
@@ -418,9 +418,11 @@ def dispatch(
418418
initial_port += 1
419419

420420
queue_server_type = V1QueueServerType.UNSPECIFIED
421-
if CLOUD_QUEUE_TYPE == "http":
421+
# Note: Enable app to select their own queue type.
422+
queue_type = get_cloud_queue_type()
423+
if queue_type == "http":
422424
queue_server_type = V1QueueServerType.HTTP
423-
elif CLOUD_QUEUE_TYPE == "redis":
425+
elif queue_type == "redis":
424426
queue_server_type = V1QueueServerType.REDIS
425427

426428
release_body = Body8(
@@ -496,7 +498,8 @@ def dispatch(
496498
if lightning_app_instance.status.phase == V1LightningappInstanceState.FAILED:
497499
raise RuntimeError("Failed to create the application. Cannot upload the source code.")
498500

499-
if open_ui:
501+
# TODO: Remove testing dependency, but this would open a tab for each test...
502+
if open_ui and "PYTEST_CURRENT_TEST" not in os.environ:
500503
click.launch(self._get_app_url(lightning_app_instance, not has_sufficient_credits))
501504

502505
if cleanup_handle:

src/lightning_app/utilities/app_logs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def _app_logs_reader(
7979

8080
# And each socket on separate thread pushing log event to print queue
8181
# run_forever() will run until we close() the connection from outside
82-
log_threads = [Thread(target=work.run_forever) for work in log_sockets]
82+
log_threads = [Thread(target=work.run_forever, daemon=True) for work in log_sockets]
8383

8484
# Establish connection and begin pushing logs to the print queue
8585
for th in log_threads:

src/lightning_app/utilities/frontend.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,11 @@ def update_index_file(ui_root: str, info: Optional[AppInfo] = None, root_path: s
2222
entry_file = Path(ui_root) / "index.html"
2323
original_file = Path(ui_root) / "index.original.html"
2424

25-
if root_path:
26-
if not original_file.exists():
27-
shutil.copyfile(entry_file, original_file) # keep backup
28-
else:
29-
# revert index.html in case it was modified after creating original.html
30-
shutil.copyfile(original_file, entry_file)
25+
if not original_file.exists():
26+
shutil.copyfile(entry_file, original_file) # keep backup
27+
else:
28+
# revert index.html in case it was modified after creating original.html
29+
shutil.copyfile(original_file, entry_file)
3130

3231
if info:
3332
with original_file.open() as f:

src/lightning_app/utilities/packaging/cloud_compute.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class CloudCompute:
7171
name: str = "default"
7272
disk_size: int = 0
7373
idle_timeout: Optional[int] = None
74-
shm_size: Optional[int] = 0
74+
shm_size: Optional[int] = None
7575
mounts: Optional[Union[Mount, List[Mount]]] = None
7676
_internal_id: Optional[str] = None
7777

@@ -80,6 +80,12 @@ def __post_init__(self) -> None:
8080

8181
self.name = self.name.lower()
8282

83+
if self.shm_size is None:
84+
if "gpu" in self.name:
85+
self.shm_size = 1024
86+
else:
87+
self.shm_size = 0
88+
8389
# All `default` CloudCompute are identified in the same way.
8490
if self._internal_id is None:
8591
self._internal_id = self._generate_id()

src/lightning_app/utilities/scheduler.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
import threading
2-
from copy import deepcopy
32
from datetime import datetime
43
from typing import Optional
54

65
from croniter import croniter
7-
from deepdiff import DeepDiff, Delta
6+
from deepdiff import Delta
87

98
from lightning_app.utilities.proxies import ComponentDelta
109

@@ -34,11 +33,15 @@ def run_once(self):
3433
next_event = croniter(metadata["cron_pattern"], start_time).get_next(datetime)
3534
# When the event is reached, send a delta to activate scheduling.
3635
if current_date > next_event:
37-
flow = self._app.get_component_by_name(metadata["name"])
38-
previous_state = deepcopy(flow.state)
39-
flow._enable_schedule(call_hash)
4036
component_delta = ComponentDelta(
41-
id=flow.name, delta=Delta(DeepDiff(previous_state, flow.state, verbose_level=2))
37+
id=metadata["name"],
38+
delta=Delta(
39+
{
40+
"values_changed": {
41+
f"root['calls']['scheduling']['{call_hash}']['running']": {"new_value": True}
42+
}
43+
}
44+
),
4245
)
4346
self._app.delta_queue.put(component_delta)
4447
metadata["start_time"] = next_event.isoformat()

0 commit comments

Comments
 (0)