Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
- Always enforce cached rate limits in processor. ([#4603](https://github.com/getsentry/relay/pull/4603))
- Remove `parent_span_link` from `SpanLink` struct. ([#4594](https://github.com/getsentry/relay/pull/4594))
- Extract transaction breakdowns into measurements. ([#4600](https://github.com/getsentry/relay/pull/4600))
- Expose worker pool metrics in autoscaler endpoint ([#4605](https://github.com/getsentry/relay/pull/4605))
- Expose worker pool metrics in autoscaler endpoint. ([#4605](https://github.com/getsentry/relay/pull/4605))
- Expose runtime utilization metric in autoscaler endpoint. ([#4606](https://github.com/getsentry/relay/pull/4606))
- Bump the revision of `sysinfo` to the revision at `15b3be3273ba286740122fed7bb7dccd2a79dc8f`. ([#4613](https://github.com/getsentry/relay/pull/4613))
- Switch the processor and store to `async`. ([#4552](https://github.com/getsentry/relay/pull/4552))

Expand Down
8 changes: 8 additions & 0 deletions relay-server/src/endpoints/autoscaling.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ fn to_prometheus_string(data: &AutoscalingData) -> String {
data.worker_pool_utilization,
&[],
);
append_data_row(
&mut result,
"runtime_utilization",
data.runtime_utilization,
&[],
);
result
}

Expand Down Expand Up @@ -135,6 +141,7 @@ mod test {
ServiceUtilization("envelope", 50),
],
worker_pool_utilization: 61,
runtime_utilization: 41,
};
let result = super::to_prometheus_string(&data);
assert_eq!(
Expand All @@ -146,6 +153,7 @@ relay_spool_total_size 30
relay_utilization{relay_service="test"} 10
relay_utilization{relay_service="envelope"} 50
relay_worker_pool_utilization 61
relay_runtime_utilization 41
Comment on lines 154 to +156
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that's a bit awkward now: we have both `relay_utilization` and `relay_runtime_utilization`. In hindsight, `relay_service_utilization` would have been the better name for the former, I guess.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's true, I will prepare something to change this

"#
);
}
Expand Down
36 changes: 35 additions & 1 deletion relay-server/src/services/autoscaling.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
use crate::services::buffer::PartitionedEnvelopeBuffer;
use crate::services::processor::EnvelopeProcessorServicePool;
use crate::MemoryStat;
use relay_system::{AsyncResponse, Controller, FromMessage, Handle, Interface, Sender, Service};
use relay_system::{
AsyncResponse, Controller, FromMessage, Handle, Interface, RuntimeMetrics, Sender, Service,
};
use tokio::time::Instant;

/// Service that tracks internal relay metrics so that they can be exposed.
pub struct AutoscalingMetricService {
Expand All @@ -11,6 +14,10 @@ pub struct AutoscalingMetricService {
envelope_buffer: PartitionedEnvelopeBuffer,
/// Runtime handle to expose service utilization metrics.
handle: Handle,
/// Gives access to runtime metrics.
runtime_metrics: RuntimeMetrics,
/// The last time the runtime utilization was checked.
last_runtime_check: Instant,
/// This will always report `1` unless the instance is shutting down.
up: u8,
/// Gives access to AsyncPool metrics.
Expand All @@ -24,10 +31,13 @@ impl AutoscalingMetricService {
handle: Handle,
async_pool: EnvelopeProcessorServicePool,
) -> Self {
let runtime_metrics = handle.metrics();
Self {
memory_stat,
envelope_buffer,
handle,
runtime_metrics,
last_runtime_check: Instant::now(),
async_pool,
up: 1,
}
Expand All @@ -54,13 +64,16 @@ impl Service for AutoscalingMetricService {
.map(|(id, metric)| ServiceUtilization(id.name(), metric.utilization))
.collect();
let worker_pool_utilization = self.async_pool.metrics().utilization() as u8;
let runtime_utilization = self.runtime_utilization();

sender.send(AutoscalingData {
memory_usage: memory_usage.used_percent(),
up: self.up,
total_size: self.envelope_buffer.total_storage_size(),
item_count: self.envelope_buffer.item_count(),
services_metrics: metrics,
worker_pool_utilization,
runtime_utilization
});
}
}
Expand All @@ -70,6 +83,26 @@ impl Service for AutoscalingMetricService {
}
}

impl AutoscalingMetricService {
    /// Returns the average utilization of the runtime workers as a percentage in `0..=100`.
    ///
    /// The busy durations reported for all workers are summed, divided by the time elapsed
    /// since the previous check and by the number of workers, and the result is capped at 100.
    ///
    /// Returns `0` when called less than one millisecond after the previous check (the check
    /// timestamp is left untouched in that case) and when the runtime reports no workers.
    fn runtime_utilization(&mut self) -> u8 {
        let elapsed = self.last_runtime_check.elapsed().as_secs_f64();
        // Prevent division by 0 in case it's checked in rapid succession.
        if elapsed < 0.001 {
            return 0;
        }

        // Read the worker count once so the sum and the divisor are guaranteed to agree.
        let num_workers = self.runtime_metrics.num_workers();

        // NOTE(review): `worker_total_busy_duration` looks like a counter accumulated since the
        // runtime started, while the divisor is only the time since the last check. If that is
        // the case the value is overestimated; a per-interval delta would need the previous
        // total to be stored on the service. Confirm against the `RuntimeMetrics` docs.
        let total_busy = (0..num_workers)
            .map(|worker_id| self.runtime_metrics.worker_total_busy_duration(worker_id))
            .map(|busy| busy.as_secs_f64())
            .sum::<f64>();

        self.last_runtime_check = Instant::now();

        // Without workers the average would be `0.0 / 0.0 = NaN`, and `NaN.min(100.0)` yields
        // `100.0`, which would wrongly report a fully utilized runtime.
        if num_workers == 0 {
            return 0;
        }

        let avg_utilization = total_busy / elapsed / (num_workers as f64);

        (avg_utilization * 100.0).min(100.0) as u8
    }
}

/// Supported operations within the internal metrics service.
pub enum AutoscalingMessageKind {
/// Requests the current data from the service.
Expand Down Expand Up @@ -101,6 +134,7 @@ pub struct AutoscalingData {
pub item_count: u64,
pub worker_pool_utilization: u8,
pub services_metrics: Vec<ServiceUtilization>,
pub runtime_utilization: u8,
}

/// Utilization of a single service.
///
/// The first field is the service name and the second field is its utilization value; the
/// autoscaling service fills them from `id.name()` and `metric.utilization` respectively.
pub struct ServiceUtilization(pub &'static str, pub u8);
10 changes: 10 additions & 0 deletions tests/integration/test_autoscaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,13 @@ def test_pool_utilization(mini_sentry, relay):
assert response.status_code == 200

assert 0 <= int(parsed["relay_worker_pool_utilization"]) <= 100


def test_runtime_utilization(mini_sentry, relay):
    """The autoscaling endpoint exposes `relay_runtime_utilization` as a value in 0..=100."""
    relay = relay(mini_sentry)

    response = relay.get("/api/relay/autoscaling/")
    # Check the status before touching the body so that an HTTP failure is reported as such
    # instead of surfacing as a parse or missing-key error.
    assert response.status_code == 200

    parsed = parse_prometheus(response.text)
    assert 0 <= int(parsed["relay_runtime_utilization"]) <= 100