diff --git a/CHANGELOG.md b/CHANGELOG.md index bef058b17fe..d6362624bca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,8 @@ - Always enforce cached rate limits in processor. ([#4603](https://github.com/getsentry/relay/pull/4603)) - Remove `parent_span_link` from `SpanLink` struct. ([#4594](https://github.com/getsentry/relay/pull/4594)) - Extract transaction breakdowns into measurements. ([#4600](https://github.com/getsentry/relay/pull/4600)) -- Expose worker pool metrics in autoscaler endpoint ([#4605](https://github.com/getsentry/relay/pull/4605)) +- Expose worker pool metrics in autoscaler endpoint. ([#4605](https://github.com/getsentry/relay/pull/4605)) +- Expose runtime utilization metric in autoscaler endpoint. ([#4606](https://github.com/getsentry/relay/pull/4606)) - Bump the revision of `sysinfo` to the revision at `15b3be3273ba286740122fed7bb7dccd2a79dc8f`. ([#4613](https://github.com/getsentry/relay/pull/4613)) - Switch the processor and store to `async`. 
([#4552](https://github.com/getsentry/relay/pull/4552)) diff --git a/relay-server/src/endpoints/autoscaling.rs b/relay-server/src/endpoints/autoscaling.rs index 2dbf7379eae..fcf07a57ced 100644 --- a/relay-server/src/endpoints/autoscaling.rs +++ b/relay-server/src/endpoints/autoscaling.rs @@ -46,6 +46,12 @@ fn to_prometheus_string(data: &AutoscalingData) -> String { data.worker_pool_utilization, &[], ); + append_data_row( + &mut result, + "runtime_utilization", + data.runtime_utilization, + &[], + ); result } @@ -135,6 +141,7 @@ mod test { ServiceUtilization("envelope", 50), ], worker_pool_utilization: 61, + runtime_utilization: 41, }; let result = super::to_prometheus_string(&data); assert_eq!( @@ -146,6 +153,7 @@ relay_spool_total_size 30 relay_utilization{relay_service="test"} 10 relay_utilization{relay_service="envelope"} 50 relay_worker_pool_utilization 61 +relay_runtime_utilization 41 "# ); } diff --git a/relay-server/src/services/autoscaling.rs b/relay-server/src/services/autoscaling.rs index b169b417a2e..16c3728756c 100644 --- a/relay-server/src/services/autoscaling.rs +++ b/relay-server/src/services/autoscaling.rs @@ -1,7 +1,10 @@ use crate::services::buffer::PartitionedEnvelopeBuffer; use crate::services::processor::EnvelopeProcessorServicePool; use crate::MemoryStat; -use relay_system::{AsyncResponse, Controller, FromMessage, Handle, Interface, Sender, Service}; +use relay_system::{ + AsyncResponse, Controller, FromMessage, Handle, Interface, RuntimeMetrics, Sender, Service, +}; +use tokio::time::Instant; /// Service that tracks internal relay metrics so that they can be exposed. pub struct AutoscalingMetricService { @@ -11,6 +14,10 @@ pub struct AutoscalingMetricService { envelope_buffer: PartitionedEnvelopeBuffer, /// Runtime handle to expose service utilization metrics. handle: Handle, + /// Gives access to runtime metrics. + runtime_metrics: RuntimeMetrics, + /// The last time the runtime utilization was checked. 
+ last_runtime_check: Instant, /// This will always report `1` unless the instance is shutting down. up: u8, /// Gives access to AsyncPool metrics. @@ -24,10 +31,13 @@ impl AutoscalingMetricService { handle: Handle, async_pool: EnvelopeProcessorServicePool, ) -> Self { + let runtime_metrics = handle.metrics(); Self { memory_stat, envelope_buffer, handle, + runtime_metrics, + last_runtime_check: Instant::now(), async_pool, up: 1, } } @@ -54,6 +64,8 @@ impl Service for AutoscalingMetricService { .map(|(id, metric)| ServiceUtilization(id.name(), metric.utilization)) .collect(); let worker_pool_utilization = self.async_pool.metrics().utilization() as u8; + let runtime_utilization = self.runtime_utilization(); + sender.send(AutoscalingData { memory_usage: memory_usage.used_percent(), up: self.up, item_count: self.envelope_buffer.item_count(), services_metrics: metrics, worker_pool_utilization, + runtime_utilization }); } } @@ -70,6 +83,26 @@ impl Service for AutoscalingMetricService { } } +impl AutoscalingMetricService { + fn runtime_utilization(&mut self) -> u8 { + let last_checked = self.last_runtime_check.elapsed().as_secs_f64(); + // Prevent division by 0 in case it's checked in rapid succession. + if last_checked < 0.001 { + return 0; + } + let avg_utilization = (0..self.runtime_metrics.num_workers()) + .map(|worker_id| self.runtime_metrics.worker_total_busy_duration(worker_id)) + .map(|busy| busy.as_secs_f64()) + .sum::<f64>() + / last_checked + / (self.runtime_metrics.num_workers() as f64); + + self.last_runtime_check = Instant::now(); + + (avg_utilization * 100.0).min(100.0) as u8 + } +} + /// Supported operations within the internal metrics service. pub enum AutoscalingMessageKind { /// Requests the current data from the service. 
@@ -101,6 +134,7 @@ pub struct AutoscalingData { pub item_count: u64, pub worker_pool_utilization: u8, pub services_metrics: Vec<ServiceUtilization>, + pub runtime_utilization: u8, } pub struct ServiceUtilization(pub &'static str, pub u8); diff --git a/tests/integration/test_autoscaling.py b/tests/integration/test_autoscaling.py index 8f89e760187..8549f0df974 100644 --- a/tests/integration/test_autoscaling.py +++ b/tests/integration/test_autoscaling.py @@ -95,3 +95,13 @@ def test_pool_utilization(mini_sentry, relay): assert response.status_code == 200 assert 0 <= int(parsed["relay_worker_pool_utilization"]) <= 100 + + +def test_runtime_utilization(mini_sentry, relay): + relay = relay(mini_sentry) + + response = relay.get("/api/relay/autoscaling/") + parsed = parse_prometheus(response.text) + assert response.status_code == 200 + + assert 0 <= int(parsed["relay_runtime_utilization"]) <= 100