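Rename queue `get_all` to `batch_get` with a configurable batch size (`BATCH_DELTA_COUNT`) and remove debug prints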

thomas · thomas · commit 77c929f975b6 · 2023-12-18T17:42:34.000Z
diff --git a/src/lightning/app/core/app.py b/src/lightning/app/core/app.py
@@ -29,6 +29,7 @@
 from lightning.app import _console
 from lightning.app.api.request_types import _APIRequest, _CommandRequest, _DeltaRequest
 from lightning.app.core.constants import (
+    BATCH_DELTA_COUNT,
     DEBUG_ENABLED,
     FLOW_DURATION_SAMPLES,
     FLOW_DURATION_THRESHOLD,
@@ -312,7 +313,7 @@ def get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] = None)
     def batch_get_state_changed_from_queue(q: BaseQueue, timeout: Optional[float] = None) -> List[dict]:
         try:
             timeout = timeout or q.default_timeout
-            return q.get_all(timeout=timeout)
+            return q.batch_get(timeout=timeout, count=BATCH_DELTA_COUNT)
         except queue.Empty:
             return []
 
@@ -353,7 +354,6 @@ def _collect_deltas_from_ui_and_work_queues(self) -> List[Union[Delta, _APIReque
                 self.delta_queue  # type: ignore[assignment,arg-type]
             )
             for delta in received_deltas:
-                print(delta)
                 if isinstance(delta, _DeltaRequest):
                     deltas.append(delta.delta)
                 elif isinstance(delta, ComponentDelta):
diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py
@@ -98,6 +98,8 @@ def get_lightning_cloud_url() -> str:
 # directory where system customization sync files will be copied to be packed into app tarball
 SYS_CUSTOMIZATIONS_SYNC_PATH = ".sys-customizations-sync"
 
+BATCH_DELTA_COUNT = int(os.getenv("BATCH_DELTA_COUNT", "128"))
+
 
 def enable_multiple_works_in_default_container() -> bool:
     return bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0")))
diff --git a/src/lightning/app/core/queues.py b/src/lightning/app/core/queues.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
 import multiprocessing
 import pickle
 import queue  # needed as import instead from/import for mocking in tests
@@ -22,11 +23,10 @@
 from pathlib import Path
 from typing import Any, Optional, Tuple
 from urllib.parse import urljoin
-import numpy as np
+
 import backoff
 import requests
 from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout
-import base64
 
 from lightning.app.core.constants import (
     HTTP_QUEUE_REFRESH_INTERVAL,
@@ -191,7 +191,7 @@ def get(self, timeout: Optional[float] = None) -> Any:
         pass
 
     @abstractmethod
-    def get_all(self, timeout: Optional[float] = None) -> Any:
+    def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> Any:
         """Returns the left most elements of the queue.
 
         Parameters
@@ -228,9 +228,10 @@ def get(self, timeout: Optional[float] = None) -> Any:
             timeout = self.default_timeout
         return self.queue.get(timeout=timeout, block=(timeout is None))
 
-    def get_all(self, timeout: Optional[float] = None) -> Any:
+    def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> Any:
         if timeout == 0:
             timeout = self.default_timeout
+        # `count` is not used here: this returns a one-element list holding the next item from the queue.
         return [self.queue.get(timeout=timeout, block=(timeout is None))]
 
 
@@ -331,8 +332,8 @@ def get(self, timeout: Optional[float] = None) -> Any:
             raise queue.Empty
         return pickle.loads(out[1])
 
-    def get_all(self, timeout: Optional[float] = None) -> Any:
-        raise NotImplementedError
+    def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> Any:
+        raise NotImplementedError("The batch_get method isn't implemented.")
 
     def clear(self) -> None:
         """Clear all elements in the queue."""
@@ -404,10 +405,10 @@ def get(self, timeout: Optional[float] = None) -> Any:
         self._last_get = time.time()
         return self._queue.get(timeout=timeout)
 
-    def get_all(self, timeout: Optional[float] = None) -> Any:
+    def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> Any:
         self._wait_until_allowed(self._last_get)
         self._last_get = time.time()
-        return self._queue.get_all(timeout=timeout)
+        return self._queue.batch_get(timeout=timeout, count=count)  # forward the requested batch size
 
     def put(self, item: Any) -> None:
         return self._queue.put(item)
@@ -501,13 +502,53 @@ def _get(self) -> Any:
             # we consider the queue is empty to avoid failing the app.
             raise queue.Empty
 
-    def get_all(self, timeout: Optional[float] = None) -> Any:
+    def batch_get(self, timeout: Optional[float] = None, count: Optional[int] = None) -> list[Any]:
         if not self.app_id:
             raise ValueError(f"App ID couldn't be extracted from the queue name: {self.name}")
 
+        # No timeout: emulate a blocking call by polling the backend until a batch is returned.
+        if timeout is None:
+            while True:
+                try:
+                    try:
+                        return self._batch_get(count=count)
+                    except requests.exceptions.HTTPError:
+                        pass  # HTTP errors are ignored; the request is retried on the next iteration
+                except queue.Empty:
+                    time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
+
+        # timeout == 0: make a single request; an HTTP error yields an empty batch.
+        if timeout == 0:
+            try:
+                return self._batch_get(count=count)
+            except requests.exceptions.HTTPError:
+                return []
+
+        # Positive timeout: keep polling until a batch arrives or `timeout` seconds have elapsed.
+        start_time = time.time()
+        while (time.time() - start_time) < timeout:
+            try:
+                try:
+                    return self._batch_get(count=count)
+                except requests.exceptions.HTTPError:
+                    if timeout > self.default_timeout:
+                        return []
+                    raise queue.Empty  # caught by the enclosing handler, which retries until the deadline
+            except queue.Empty:
+                # Note: in theory no sleep is needed here, since an empty queue
+                # shouldn't block the flow.
+                # However, because the HTTP server can saturate, sleep briefly
+                # between attempts when the requested timeout is higher
+                # than the default timeout.
+                if timeout > self.default_timeout:
+                    time.sleep(0.05)
+        return []
+
+    def _batch_get(self, count: Optional[int] = 64) -> list[Any]:
         try:
-            print("HERE")
-            resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "popCount", "count": "64"})
+            resp = self.client.post(
+                f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "popCount", "count": str(count)}
+            )
             if resp.status_code == 204:
                 raise queue.Empty
             return [pickle.loads(base64.b64decode(data)) for data in resp.json()]