Lightning-AI · Borda · Dec 9, 2022 · Dec 7, 2022 · Dec 7, 2022 · Dec 8, 2022
@@ -1,3 +1,4 @@
+# ! pip install torch torchvision
 from typing import Any, List
 
 import torch
@@ -21,11 +22,13 @@ class BatchResponse(BaseModel):
 
 class PyTorchServer(L.app.components.PythonServer):
     def __init__(self, *args, **kwargs):
+        print(args)
+        print(kwargs)
         super().__init__(
-            port=L.app.utilities.network.find_free_network_port(),
             input_type=BatchRequestModel,
             output_type=BatchResponse,
-            cloud_compute=L.CloudCompute("gpu"),
+            *args,
+            **kwargs,
         )
 
     def setup(self):
@@ -57,30 +60,32 @@ def scale(self, replicas: int, metrics: dict) -> int:
         """The default scaling logic that users can override."""
         # scale out if the pending requests per running-or-pending work reach the max batch size.
         max_requests_per_work = self.max_batch_size
-        pending_requests_per_running_or_pending_work = metrics["pending_requests"] / (
-            replicas + metrics["pending_works"]
-        )
-        if pending_requests_per_running_or_pending_work >= max_requests_per_work:
+        pending_requests_per_work = metrics["pending_requests"] / (replicas + metrics["pending_works"])
+        if pending_requests_per_work >= max_requests_per_work:
             return replicas + 1
 
         # scale in if the pending requests per running work fall below 25% of max_requests_per_work
         min_requests_per_work = max_requests_per_work * 0.25
-        pending_requests_per_running_work = metrics["pending_requests"] / replicas
-        if pending_requests_per_running_work < min_requests_per_work:
+        pending_requests_per_work = metrics["pending_requests"] / replicas
+        if pending_requests_per_work < min_requests_per_work:
             return replicas - 1
 
         return replicas
 
 
 app = L.LightningApp(
     MyAutoScaler(
+        # work class and args
         PyTorchServer,
-        min_replicas=2,
+        cloud_compute=L.CloudCompute("gpu"),
+        # autoscaler specific args
+        min_replicas=1,
         max_replicas=4,
         autoscale_interval=10,
         endpoint="predict",
         input_type=RequestModel,
         output_type=Any,
         timeout_batching=1,
+        max_batch_size=8,
     )
 )
@@ -22,6 +22,7 @@
 from lightning_app.core.flow import LightningFlow
 from lightning_app.core.work import LightningWork
 from lightning_app.utilities.app_helpers import Logger
+from lightning_app.utilities.network import find_free_network_port
 from lightning_app.utilities.packaging.cloud_compute import CloudCompute
 
 logger = Logger(__name__)
@@ -445,8 +446,14 @@ def workers(self) -> List[LightningWork]:
 
     def create_work(self) -> LightningWork:
         """Replicates a LightningWork instance with args and kwargs provided via ``__init__``."""
-        # TODO: Remove `start_with_flow=False` for faster initialization on the cloud
-        return self._work_cls(*self._work_args, **self._work_kwargs, start_with_flow=False)
+        self._work_kwargs.update(
+            dict(
+                port=find_free_network_port(),
+                # TODO: Remove `start_with_flow=False` for faster initialization on the cloud
+                start_with_flow=False,
+            )
+        )
+        return self._work_cls(*self._work_args, **self._work_kwargs)
 
     def add_work(self, work) -> str:
         """Adds a new LightningWork instance.