feat(autoscaling): add runtime utilization (#4606)

Litarnus · web-flow · commit 3ac92c62c8fb · 2025-03-26T14:36:05.000+01:00
This PR exposes the Tokio runtime utilization metric on the autoscaling
endpoint.
It produces a single number: the average utilization across all runtime
workers, reported as a percentage capped at 100.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,7 +20,8 @@
 - Always enforce cached rate limits in processor. ([#4603](https://github.com/getsentry/relay/pull/4603))
 - Remove `parent_span_link` from `SpanLink` struct. ([#4594](https://github.com/getsentry/relay/pull/4594))
 - Extract transaction breakdowns into measurements. ([#4600](https://github.com/getsentry/relay/pull/4600))
-- Expose worker pool metrics in autoscaler endpoint ([#4605](https://github.com/getsentry/relay/pull/4605))
+- Expose worker pool metrics in autoscaler endpoint. ([#4605](https://github.com/getsentry/relay/pull/4605))
+- Expose runtime utilization metric in autoscaler endpoint. ([#4606](https://github.com/getsentry/relay/pull/4606))
 - Bump the revision of `sysinfo` to the revision at `15b3be3273ba286740122fed7bb7dccd2a79dc8f`. ([#4613](https://github.com/getsentry/relay/pull/4613))
 - Switch the processor and store to `async`. ([#4552](https://github.com/getsentry/relay/pull/4552))
 
diff --git a/relay-server/src/endpoints/autoscaling.rs b/relay-server/src/endpoints/autoscaling.rs
@@ -46,6 +46,12 @@ fn to_prometheus_string(data: &AutoscalingData) -> String {
         data.worker_pool_utilization,
         &[],
     );
+    append_data_row(
+        &mut result,
+        "runtime_utilization",
+        data.runtime_utilization,
+        &[],
+    );
     result
 }
 
@@ -135,6 +141,7 @@ mod test {
                 ServiceUtilization("envelope", 50),
             ],
             worker_pool_utilization: 61,
+            runtime_utilization: 41,
         };
         let result = super::to_prometheus_string(&data);
         assert_eq!(
@@ -146,6 +153,7 @@ relay_spool_total_size 30
 relay_utilization{relay_service="test"} 10
 relay_utilization{relay_service="envelope"} 50
 relay_worker_pool_utilization 61
+relay_runtime_utilization 41
 "#
         );
     }
diff --git a/relay-server/src/services/autoscaling.rs b/relay-server/src/services/autoscaling.rs
@@ -1,7 +1,10 @@
 use crate::services::buffer::PartitionedEnvelopeBuffer;
 use crate::services::processor::EnvelopeProcessorServicePool;
 use crate::MemoryStat;
-use relay_system::{AsyncResponse, Controller, FromMessage, Handle, Interface, Sender, Service};
+use relay_system::{
+    AsyncResponse, Controller, FromMessage, Handle, Interface, RuntimeMetrics, Sender, Service,
+};
+use tokio::time::Instant;
 
 /// Service that tracks internal relay metrics so that they can be exposed.
 pub struct AutoscalingMetricService {
@@ -11,6 +14,10 @@ pub struct AutoscalingMetricService {
     envelope_buffer: PartitionedEnvelopeBuffer,
     /// Runtime handle to expose service utilization metrics.
     handle: Handle,
+    /// Gives access to runtime metrics.
+    runtime_metrics: RuntimeMetrics,
+    /// The last time the runtime utilization was checked.
+    last_runtime_check: Instant,
     /// This will always report `1` unless the instance is shutting down.
     up: u8,
     /// Gives access to AsyncPool metrics.
@@ -24,10 +31,13 @@ impl AutoscalingMetricService {
         handle: Handle,
         async_pool: EnvelopeProcessorServicePool,
     ) -> Self {
+        let runtime_metrics = handle.metrics();
         Self {
             memory_stat,
             envelope_buffer,
             handle,
+            runtime_metrics,
+            last_runtime_check: Instant::now(),
             async_pool,
             up: 1,
         }
@@ -54,13 +64,16 @@ impl Service for AutoscalingMetricService {
                                 .map(|(id, metric)| ServiceUtilization(id.name(), metric.utilization))
                                 .collect();
                             let worker_pool_utilization = self.async_pool.metrics().utilization() as u8;
+                            let runtime_utilization = self.runtime_utilization();
+
                             sender.send(AutoscalingData {
                                 memory_usage: memory_usage.used_percent(),
                                 up: self.up,
                                 total_size: self.envelope_buffer.total_storage_size(),
                                 item_count: self.envelope_buffer.item_count(),
                                 services_metrics: metrics,
                                 worker_pool_utilization,
+                                runtime_utilization
                             });
                         }
                     }
@@ -70,6 +83,26 @@ impl Service for AutoscalingMetricService {
     }
 }
 
+impl AutoscalingMetricService {
+    fn runtime_utilization(&mut self) -> u8 {
+        let last_checked = self.last_runtime_check.elapsed().as_secs_f64();
+        // Prevent division by 0 when checked in rapid succession: report 0, keep the check time.
+        if last_checked < 0.001 {
+            return 0;
+        }
+        let avg_utilization = (0..self.runtime_metrics.num_workers())
+            .map(|worker_id| self.runtime_metrics.worker_total_busy_duration(worker_id))
+            .map(|busy| busy.as_secs_f64())
+            .sum::<f64>()
+            / last_checked
+            / (self.runtime_metrics.num_workers() as f64);
+        // NOTE(review): busy totals above look cumulative, yet the divisor is per-interval; confirm.
+        self.last_runtime_check = Instant::now();
+        // Convert the fraction to a percentage, capped at 100, before the cast to `u8`.
+        (avg_utilization * 100.0).min(100.0) as u8
+    }
+}
+
 /// Supported operations within the internal metrics service.
 pub enum AutoscalingMessageKind {
     /// Requests the current data from the service.
@@ -101,6 +134,7 @@ pub struct AutoscalingData {
     pub item_count: u64,
     pub worker_pool_utilization: u8,
     pub services_metrics: Vec<ServiceUtilization>,
+    pub runtime_utilization: u8,
 }
 
 pub struct ServiceUtilization(pub &'static str, pub u8);
diff --git a/tests/integration/test_autoscaling.py b/tests/integration/test_autoscaling.py
@@ -95,3 +95,13 @@ def test_pool_utilization(mini_sentry, relay):
     assert response.status_code == 200
 
     assert 0 <= int(parsed["relay_worker_pool_utilization"]) <= 100
+
+
+def test_runtime_utilization(mini_sentry, relay):
+    relay = relay(mini_sentry)
+    # Query the autoscaling endpoint and parse the response text with `parse_prometheus`.
+    response = relay.get("/api/relay/autoscaling/")
+    parsed = parse_prometheus(response.text)
+    assert response.status_code == 200
+    # The reported runtime utilization must be an integer between 0 and 100.
+    assert 0 <= int(parsed["relay_runtime_utilization"]) <= 100