Skip to content

Commit 3ac92c6

Browse files
authored
feat(autoscaling): add runtime utilization (#4606)
This PR exposes the Tokio runtime utilization metric on the autoscaling endpoint. It produces a single number: the average utilization across all runtime workers, expressed as a percentage.
1 parent d980306 commit 3ac92c6

File tree

4 files changed

+55
-2
lines changed

4 files changed

+55
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
- Always enforce cached rate limits in processor. ([#4603](https://github.com/getsentry/relay/pull/4603))
2121
- Remove `parent_span_link` from `SpanLink` struct. ([#4594](https://github.com/getsentry/relay/pull/4594))
2222
- Extract transaction breakdowns into measurements. ([#4600](https://github.com/getsentry/relay/pull/4600))
23-
- Expose worker pool metrics in autoscaler endpoint ([#4605](https://github.com/getsentry/relay/pull/4605))
23+
- Expose worker pool metrics in autoscaler endpoint. ([#4605](https://github.com/getsentry/relay/pull/4605))
24+
- Expose runtime utilization metric in autoscaler endpoint. ([#4606](https://github.com/getsentry/relay/pull/4606))
2425
- Bump the revision of `sysinfo` to the revision at `15b3be3273ba286740122fed7bb7dccd2a79dc8f`. ([#4613](https://github.com/getsentry/relay/pull/4613))
2526
- Switch the processor and store to `async`. ([#4552](https://github.com/getsentry/relay/pull/4552))
2627

relay-server/src/endpoints/autoscaling.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ fn to_prometheus_string(data: &AutoscalingData) -> String {
4646
data.worker_pool_utilization,
4747
&[],
4848
);
49+
append_data_row(
50+
&mut result,
51+
"runtime_utilization",
52+
data.runtime_utilization,
53+
&[],
54+
);
4955
result
5056
}
5157

@@ -135,6 +141,7 @@ mod test {
135141
ServiceUtilization("envelope", 50),
136142
],
137143
worker_pool_utilization: 61,
144+
runtime_utilization: 41,
138145
};
139146
let result = super::to_prometheus_string(&data);
140147
assert_eq!(
@@ -146,6 +153,7 @@ relay_spool_total_size 30
146153
relay_utilization{relay_service="test"} 10
147154
relay_utilization{relay_service="envelope"} 50
148155
relay_worker_pool_utilization 61
156+
relay_runtime_utilization 41
149157
"#
150158
);
151159
}

relay-server/src/services/autoscaling.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
use crate::services::buffer::PartitionedEnvelopeBuffer;
22
use crate::services::processor::EnvelopeProcessorServicePool;
33
use crate::MemoryStat;
4-
use relay_system::{AsyncResponse, Controller, FromMessage, Handle, Interface, Sender, Service};
4+
use relay_system::{
5+
AsyncResponse, Controller, FromMessage, Handle, Interface, RuntimeMetrics, Sender, Service,
6+
};
7+
use tokio::time::Instant;
58

69
/// Service that tracks internal relay metrics so that they can be exposed.
710
pub struct AutoscalingMetricService {
@@ -11,6 +14,10 @@ pub struct AutoscalingMetricService {
1114
envelope_buffer: PartitionedEnvelopeBuffer,
1215
/// Runtime handle to expose service utilization metrics.
1316
handle: Handle,
17+
/// Gives access to runtime metrics.
18+
runtime_metrics: RuntimeMetrics,
19+
/// The last time the runtime utilization was checked.
20+
last_runtime_check: Instant,
1421
/// This will always report `1` unless the instance is shutting down.
1522
up: u8,
1623
/// Gives access to AsyncPool metrics.
@@ -24,10 +31,13 @@ impl AutoscalingMetricService {
2431
handle: Handle,
2532
async_pool: EnvelopeProcessorServicePool,
2633
) -> Self {
34+
let runtime_metrics = handle.metrics();
2735
Self {
2836
memory_stat,
2937
envelope_buffer,
3038
handle,
39+
runtime_metrics,
40+
last_runtime_check: Instant::now(),
3141
async_pool,
3242
up: 1,
3343
}
@@ -54,13 +64,16 @@ impl Service for AutoscalingMetricService {
5464
.map(|(id, metric)| ServiceUtilization(id.name(), metric.utilization))
5565
.collect();
5666
let worker_pool_utilization = self.async_pool.metrics().utilization() as u8;
67+
let runtime_utilization = self.runtime_utilization();
68+
5769
sender.send(AutoscalingData {
5870
memory_usage: memory_usage.used_percent(),
5971
up: self.up,
6072
total_size: self.envelope_buffer.total_storage_size(),
6173
item_count: self.envelope_buffer.item_count(),
6274
services_metrics: metrics,
6375
worker_pool_utilization,
76+
runtime_utilization
6477
});
6578
}
6679
}
@@ -70,6 +83,26 @@ impl Service for AutoscalingMetricService {
7083
}
7184
}
7285

86+
impl AutoscalingMetricService {
87+
fn runtime_utilization(&mut self) -> u8 {
88+
let last_checked = self.last_runtime_check.elapsed().as_secs_f64();
89+
// Prevent division by 0 in case it's checked in rapid succession.
90+
if last_checked < 0.001 {
91+
return 0;
92+
}
93+
let avg_utilization = (0..self.runtime_metrics.num_workers())
94+
.map(|worker_id| self.runtime_metrics.worker_total_busy_duration(worker_id))
95+
.map(|busy| busy.as_secs_f64())
96+
.sum::<f64>()
97+
/ last_checked
98+
/ (self.runtime_metrics.num_workers() as f64);
99+
100+
self.last_runtime_check = Instant::now();
101+
102+
(avg_utilization * 100.0).min(100.0) as u8
103+
}
104+
}
105+
73106
/// Supported operations within the internal metrics service.
74107
pub enum AutoscalingMessageKind {
75108
/// Requests the current data from the service.
@@ -101,6 +134,7 @@ pub struct AutoscalingData {
101134
pub item_count: u64,
102135
pub worker_pool_utilization: u8,
103136
pub services_metrics: Vec<ServiceUtilization>,
137+
pub runtime_utilization: u8,
104138
}
105139

106140
/// Utilization of a single service: the service name (field `0`) and its utilization value
/// (field `1`).
pub struct ServiceUtilization(pub &'static str, pub u8);

tests/integration/test_autoscaling.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,13 @@ def test_pool_utilization(mini_sentry, relay):
9595
assert response.status_code == 200
9696

9797
assert 0 <= int(parsed["relay_worker_pool_utilization"]) <= 100
98+
99+
100+
def test_runtime_utilization(mini_sentry, relay):
    """The autoscaling endpoint exposes ``relay_runtime_utilization`` as a value from 0 to 100."""
    instance = relay(mini_sentry)

    resp = instance.get("/api/relay/autoscaling/")
    metrics = parse_prometheus(resp.text)
    assert resp.status_code == 200

    # The metric is a percentage, so it must stay within the closed range [0, 100].
    utilization = int(metrics["relay_runtime_utilization"])
    assert 0 <= utilization <= 100

0 commit comments

Comments
 (0)