fix(autoscaling): expose services with their instance ids (#4654)

Litarnus · web-flow · commit cbf72c2930ad · 2025-04-08T12:56:07.000+02:00
Fixes a bug where only a single utilization metric was exported per service,
even though multiple instances of that service existed, each with a different
instance ID.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - Separates profiles into backend and ui profiles. ([#4595](https://github.com/getsentry/relay/pull/4595))
 - Normalize trace context information before writing it into transaction and span data. This ensures the correct sampling rates are stored for extrapolation in Sentry. ([#4625](https://github.com/getsentry/relay/pull/4625))
 - Adds u16 validation to the replay protocol's segment_id field. ([#4635](https://github.com/getsentry/relay/pull/4635))
+- Exposes the utilization of all service instances, labeled with their instance id, instead of only the last one. ([#4654](https://github.com/getsentry/relay/pull/4654))
 
 **Internal**:
 
diff --git a/relay-server/src/endpoints/autoscaling.rs b/relay-server/src/endpoints/autoscaling.rs
@@ -32,12 +32,15 @@ fn to_prometheus_string(data: &AutoscalingData) -> String {
     append_data_row(&mut result, "spool_item_count", data.item_count, &[]);
     append_data_row(&mut result, "spool_total_size", data.total_size, &[]);
     for utilization in &data.services_metrics {
-        let service_name = extract_service_name(utilization.0);
+        let service_name = extract_service_name(utilization.name);
         append_data_row(
             &mut result,
             "service_utilization",
-            utilization.1,
-            &[("relay_service", service_name)],
+            utilization.utilization,
+            &[
+                ("relay_service", service_name),
+                ("instance_id", &format!("{}", utilization.instance_id)),
+            ],
         );
     }
 
@@ -138,8 +141,21 @@ mod test {
             item_count: 10,
             total_size: 30,
             services_metrics: vec![
-                ServiceUtilization("test", 10),
-                ServiceUtilization("envelope", 50),
+                ServiceUtilization {
+                    name: "test",
+                    instance_id: 0,
+                    utilization: 10,
+                },
+                ServiceUtilization {
+                    name: "test",
+                    instance_id: 1,
+                    utilization: 30,
+                },
+                ServiceUtilization {
+                    name: "envelope",
+                    instance_id: 1,
+                    utilization: 50,
+                },
             ],
             worker_pool_utilization: 61,
             runtime_utilization: 41,
@@ -151,8 +167,9 @@ mod test {
 relay_up 1
 relay_spool_item_count 10
 relay_spool_total_size 30
-relay_service_utilization{relay_service="test"} 10
-relay_service_utilization{relay_service="envelope"} 50
+relay_service_utilization{relay_service="test", instance_id="0"} 10
+relay_service_utilization{relay_service="test", instance_id="1"} 30
+relay_service_utilization{relay_service="envelope", instance_id="1"} 50
 relay_worker_pool_utilization 61
 relay_runtime_utilization 41
 "#
diff --git a/relay-server/src/services/autoscaling.rs b/relay-server/src/services/autoscaling.rs
@@ -61,7 +61,12 @@ impl Service for AutoscalingMetricService {
                             let metrics = self.handle
                                 .current_services_metrics()
                                 .iter()
-                                .map(|(id, metric)| ServiceUtilization(id.name(), metric.utilization))
+                                .map(|(id, metric)| ServiceUtilization {
+                                    name: id.name(),
+                                    instance_id: id.instance_id(),
+                                    utilization: metric.utilization
+                                }
+                            )
                                 .collect();
                             let worker_pool_utilization = self.async_pool.metrics().utilization() as u8;
                             let runtime_utilization = self.runtime_utilization();
@@ -127,14 +132,43 @@ impl FromMessage<AutoscalingMessageKind> for AutoscalingMetrics {
     }
 }
 
+/// Contains data that is used for autoscaling.
 pub struct AutoscalingData {
+    /// Memory usage of relay.
     pub memory_usage: f32,
+    /// Is `1` if relay is running, `0` if it's shutting down.
     pub up: u8,
+    /// The total number of bytes used by the spooler.
     pub total_size: u64,
+    /// The total number of envelopes in the spooler.
     pub item_count: u64,
+    /// Worker pool utilization in percent.
     pub worker_pool_utilization: u8,
+    /// List of service utilization.
     pub services_metrics: Vec<ServiceUtilization>,
+    /// Utilization of the async runtime.
     pub runtime_utilization: u8,
 }
 
-pub struct ServiceUtilization(pub &'static str, pub u8);
+/// Contains the minimal required information for service utilization.
+///
+/// A service can have multiple instances which will all have the same name.
+/// Those instances are distinguished by the `instance_id`.
+pub struct ServiceUtilization {
+    /// The service name.
+    pub name: &'static str,
+    /// The id of the specific service instance.
+    pub instance_id: u32,
+    /// Utilization as percentage.
+    pub utilization: u8,
+}
+
+impl ServiceUtilization {
+    pub fn new(name: &'static str, instance_id: u32, utilization: u8) -> Self {
+        Self {
+            name,
+            instance_id,
+            utilization,
+        }
+    }
+}
diff --git a/tests/integration/test_autoscaling.py b/tests/integration/test_autoscaling.py
@@ -13,7 +13,7 @@
 def parse_prometheus(input_string):
     result = {}
     for line in input_string.splitlines():
-        parts = line.split(" ")
+        parts = line.rsplit(" ", 1)
         result[parts[0]] = parts[1]
     return result
 
@@ -79,7 +79,7 @@ def test_memory_spooling_metrics(mini_sentry, relay):
 @pytest.mark.parametrize(
     "metric_name",
     (
-        'relay_service_utilization{relay_service="AggregatorService"}',
+        'relay_service_utilization{relay_service="AggregatorService", instance_id="0"}',
         "relay_worker_pool_utilization",
         "relay_runtime_utilization",
     ),