|
130 | 130 | MIN_HEARTBEAT_INTERVAL: int = conf.getint("workers", "min_heartbeat_interval")
|
131 | 131 | MAX_FAILED_HEARTBEATS: int = conf.getint("workers", "max_failed_heartbeats")
|
132 | 132 |
|
| 133 | +SOCKET_CLEANUP_TIMEOUT: float = conf.getfloat("workers", "socket_cleanup_timeout") |
133 | 134 |
|
134 | 135 | SERVER_TERMINATED = "SERVER_TERMINATED"
|
135 | 136 |
|
@@ -357,6 +358,13 @@ def exit(n: int) -> NoReturn:
|
357 | 358 | sys.stderr.flush()
|
358 | 359 | with suppress(ValueError, OSError):
|
359 | 360 | last_chance_stderr.flush()
|
| 361 | + |
| 362 | + # Explicitly close the child-end of our supervisor sockets so |
| 363 | + # the parent sees EOF on both "requests" and "logs" channels. |
| 364 | + with suppress(OSError): |
| 365 | + os.close(log_fd) |
| 366 | + with suppress(OSError): |
| 367 | + os.close(child_stdin.fileno()) |
360 | 368 | os._exit(n)
|
361 | 369 |
|
362 | 370 | if hasattr(atexit, "_clear"):
|
@@ -429,6 +437,8 @@ class WatchedSubprocess:
|
429 | 437 |
|
430 | 438 | _num_open_sockets: int = 4
|
431 | 439 | _exit_code: int | None = attrs.field(default=None, init=False)
|
| 440 | + _process_exit_monotonic: float | None = attrs.field(default=None, init=False) |
| 441 | + _fd_to_socket_type: dict[int, str] = attrs.field(factory=dict, init=False) |
432 | 442 |
|
433 | 443 | selector: selectors.BaseSelector = attrs.field(factory=selectors.DefaultSelector, repr=False)
|
434 | 444 |
|
@@ -513,6 +523,14 @@ def _register_pipe_readers(self, stdout: socket, stderr: socket, requests: socke
|
513 | 523 | # alternatives are used automatically) -- this is a way of having "event-based" code, but without
|
514 | 524 | # needing full async, to read and process output from each socket as it is received.
|
515 | 525 |
|
| 526 | + # Track socket types for debugging |
| 527 | + self._fd_to_socket_type = { |
| 528 | + stdout.fileno(): "stdout", |
| 529 | + stderr.fileno(): "stderr", |
| 530 | + requests.fileno(): "requests", |
| 531 | + logs.fileno(): "logs", |
| 532 | + } |
| 533 | + |
516 | 534 | target_loggers: tuple[FilteringBoundLogger, ...] = (self.process_log,)
|
517 | 535 | if self.subprocess_logs_to_stdout:
|
518 | 536 | target_loggers += (log,)
|
@@ -599,6 +617,28 @@ def _close_unused_sockets(*sockets):
|
599 | 617 | sock._sock.close()
|
600 | 618 | sock.close()
|
601 | 619 |
|
def _cleanup_open_sockets(self):
    """
    Tear down every socket that is still registered with the selector.

    Under heavy load the selector may never deliver the final read event
    (EOF) for a socket before the subprocess has gone away. If those
    sockets were left registered, the supervisor would keep waiting on
    them indefinitely. This method unregisters and closes whatever is
    left, reports what had to be force-closed, closes the selector and
    stdin, and resets the open-socket counter to zero.
    """
    forced: list[str] = []

    # Snapshot the registrations first: unregistering mutates the selector's map.
    for registration in tuple(self.selector.get_map().values()):
        fd = registration.fd
        label = self._fd_to_socket_type.get(fd, f"unknown-{fd}")
        forced.append(f"{label}({fd})")

        # Both steps are best-effort; a failure in one must not stop the other
        # or prevent the remaining sockets from being cleaned up.
        with suppress(Exception):
            self.selector.unregister(registration.fileobj)
        with suppress(Exception):
            registration.fileobj.close()  # type: ignore[union-attr]

    if forced:
        log.warning("Force-closed stuck sockets", pid=self.pid, sockets=forced)

    self.selector.close()
    self._close_unused_sockets(self.stdin)
    # Nothing is left open, so the monitoring loop no longer needs to wait on sockets.
    self._num_open_sockets = 0
| 641 | + |
602 | 642 | def kill(
|
603 | 643 | self,
|
604 | 644 | signal_to_send: signal.Signals = signal.SIGINT,
|
@@ -732,6 +772,7 @@ def _check_subprocess_exit(
|
732 | 772 | if raise_on_timeout:
|
733 | 773 | raise
|
734 | 774 | else:
|
| 775 | + self._process_exit_monotonic = time.monotonic() |
735 | 776 | self._close_unused_sockets(self.stdin)
|
736 | 777 | # Put a message in the viewable task logs
|
737 | 778 |
|
@@ -905,6 +946,18 @@ def _monitor_subprocess(self):
|
905 | 946 | # This listens for activity (e.g., subprocess output) on registered file objects
|
906 | 947 | alive = self._service_subprocess(max_wait_time=max_wait_time) is None
|
907 | 948 |
|
| 949 | + if self._exit_code is not None and self._num_open_sockets > 0: |
| 950 | + if ( |
| 951 | + self._process_exit_monotonic |
| 952 | + and time.monotonic() - self._process_exit_monotonic > SOCKET_CLEANUP_TIMEOUT |
| 953 | + ): |
| 954 | + log.debug( |
| 955 | + "Forcefully closing remaining sockets", |
| 956 | + open_sockets=self._num_open_sockets, |
| 957 | + pid=self.pid, |
| 958 | + ) |
| 959 | + self._cleanup_open_sockets() |
| 960 | + |
908 | 961 | if alive:
|
909 | 962 | # We don't need to heartbeat if the process has shutdown, as we are just finishing of reading the
|
910 | 963 | # logs
|
|
0 commit comments