Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using System.Linq;
using System.Threading;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
Expand Down Expand Up @@ -33,6 +36,10 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
private readonly double _scaleRelativeToCpuRequest;
private readonly double _scaleRelativeToCpuRequestForTrackerApi;

private readonly TimeSpan _retryInterval = TimeSpan.FromMinutes(5);
private DateTimeOffset _lastFailure = DateTimeOffset.MinValue;
private int _measurementsUnavailable;

private DateTimeOffset _refreshAfterCpu;
private DateTimeOffset _refreshAfterMemory;

Expand Down Expand Up @@ -94,18 +101,44 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
// Initialize the counters
_cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_100_percent_exceeded");
_cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_110_percent_exceeded");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationLimit(cpuLimit), unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuRequest, unit: "1");

_ = meter.CreateObservableGauge(
ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
() => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
"1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationWithoutHostDelta() / cpuRequest),
unit: "1");
}
else
{
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuLimit),
unit: "1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
unit: "1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ProcessCpuUtilization,
observeValues: () => GetMeasurementWithRetry(() => CpuUtilization() * _scaleRelativeToCpuRequest),
unit: "1");
}

_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1");
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1");
_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization,
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
unit: "1");

_ = meter.CreateObservableGauge(
name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
unit: "1");

// cpuRequest is a CPU request (aka guaranteed number of CPU units) for a pod; for a host it's 1 core
// cpuLimit is a CPU limit (aka max CPU units available) for a pod or for a host.
Expand Down Expand Up @@ -288,4 +321,34 @@ public Snapshot GetSnapshot()
userTimeSinceStart: TimeSpan.FromTicks((long)(cgroupTime / Hundred * _scaleRelativeToCpuRequestForTrackerApi)),
memoryUsageInBytes: memoryUsed);
}

/// <summary>
/// Evaluates <paramref name="func"/> and wraps the result in a single measurement,
/// backing off for <c>_retryInterval</c> after a file-access failure.
/// </summary>
/// <param name="func">Delegate that produces the current gauge value.</param>
/// <returns>
/// A one-element sequence holding the value, or an empty sequence when the read failed with a
/// handled exception or the last recorded failure is more recent than <c>_retryInterval</c>.
/// </returns>
private IEnumerable<Measurement<double>> GetMeasurementWithRetry(Func<double> func)
{
    // While flagged unavailable and still inside the back-off window, skip the read entirely.
    bool flaggedUnavailable = Volatile.Read(ref _measurementsUnavailable) == 1;
    if (flaggedUnavailable && _timeProvider.GetUtcNow() - _lastFailure < _retryInterval)
    {
        return Enumerable.Empty<Measurement<double>>();
    }

    double value;
    try
    {
        value = func();
    }
    catch (Exception ex) when (
        ex is System.IO.FileNotFoundException ||
        ex is System.IO.DirectoryNotFoundException ||
        ex is System.UnauthorizedAccessException)
    {
        // Record the failure time before raising the flag so a reader that sees the flag
        // compares against this failure's timestamp.
        _lastFailure = _timeProvider.GetUtcNow();
        _ = Interlocked.Exchange(ref _measurementsUnavailable, 1);

        return Enumerable.Empty<Measurement<double>>();
    }

    // The read succeeded: clear the unavailable flag if it was set.
    _ = Interlocked.CompareExchange(ref _measurementsUnavailable, 0, 1);

    return new[] { new Measurement<double>(value) };
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using System.Threading.Tasks;
using Microsoft.Extensions.Diagnostics.ResourceMonitoring.Test.Helpers;
using Microsoft.Extensions.Logging.Testing;
using Microsoft.Extensions.Time.Testing;
using Microsoft.Shared.Instruments;
using Microsoft.TestUtilities;
using Moq;
Expand Down Expand Up @@ -272,4 +273,133 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization).value);
}

/// <summary>
/// Verifies that a handled read failure suppresses the process memory gauge until the
/// retry interval has elapsed, after which the value is reported again.
/// </summary>
[Fact]
public void Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers()
{
    var logger = new FakeLogger<LinuxUtilizationProvider>();
    var options = Options.Options.Create(new ResourceMonitoringOptions());
    using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_HandlesExceptionAndRecovers));
    var meterFactoryMock = new Mock<IMeterFactory>();
    meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
        .Returns(meter);

    // The first memory read throws an exception type the provider handles; later reads succeed.
    var callCount = 0;
    var parserMock = new Mock<ILinuxUtilizationParser>();
    parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
    {
        callCount++;
        if (callCount <= 1)
        {
            throw new FileNotFoundException("Simulated failure to read file");
        }

        return 420UL;
    });
    parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(1000UL);
    parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
    parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);

    var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);

    // Constructing the provider registers its observable gauges on the meter;
    // the instance itself is not used afterwards.
    _ = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);

    using var listener = new MeterListener
    {
        InstrumentPublished = (instrument, listener) =>
        {
            if (ReferenceEquals(meter, instrument.Meter))
            {
                listener.EnableMeasurementEvents(instrument);
            }
        }
    };

    var samples = new List<(Instrument instrument, double value)>();
    listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
    {
        if (ReferenceEquals(meter, instrument.Meter))
        {
            samples.Add((instrument, value));
        }
    });

    listener.Start();

    // First observation: the read fails, so nothing is published for the memory gauge.
    listener.RecordObservableInstruments();
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // One minute later the gauge is still expected to publish nothing.
    fakeTime.Advance(TimeSpan.FromMinutes(1));
    listener.RecordObservableInstruments();
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // Six minutes after the failure the read is attempted again and succeeds.
    fakeTime.Advance(TimeSpan.FromMinutes(5));
    listener.RecordObservableInstruments();

    // Single (not SingleOrDefault) so that a missing sample fails here with a clear message
    // rather than surfacing as a default tuple with value 0.
    var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
    Assert.Equal(0.42, metric.value);

    parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(2));
}

/// <summary>
/// Verifies that an exception the provider does not handle propagates out of the observation
/// and does not prevent the next observation from reading and publishing the memory gauge.
/// </summary>
[Fact]
public void Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads()
{
    var logger = new FakeLogger<LinuxUtilizationProvider>();
    var options = Options.Options.Create(new ResourceMonitoringOptions());
    using var meter = new Meter(nameof(Provider_GetMeasurementWithRetry_UnhandledException_DoesNotBlockFutureReads));
    var meterFactoryMock = new Mock<IMeterFactory>();
    meterFactoryMock.Setup(x => x.Create(It.IsAny<MeterOptions>()))
        .Returns(meter);

    // The first two memory reads throw an exception type outside the provider's handled set;
    // the third read succeeds.
    var callCount = 0;
    var parserMock = new Mock<ILinuxUtilizationParser>();
    parserMock.Setup(p => p.GetMemoryUsageInBytes()).Returns(() =>
    {
        callCount++;
        if (callCount <= 2)
        {
            throw new InvalidOperationException("Simulated unhandled exception");
        }

        return 1234UL;
    });
    parserMock.Setup(p => p.GetAvailableMemoryInBytes()).Returns(2000UL);
    parserMock.Setup(p => p.GetCgroupRequestCpu()).Returns(10f);
    parserMock.Setup(p => p.GetCgroupLimitedCpus()).Returns(12f);

    var fakeTime = new FakeTimeProvider(DateTimeOffset.UtcNow);

    // Constructing the provider registers its observable gauges on the meter;
    // the instance itself is not used afterwards.
    _ = new LinuxUtilizationProvider(options, parserMock.Object, meterFactoryMock.Object, logger, fakeTime);

    using var listener = new MeterListener
    {
        InstrumentPublished = (instrument, listener) =>
        {
            if (ReferenceEquals(meter, instrument.Meter))
            {
                listener.EnableMeasurementEvents(instrument);
            }
        }
    };

    var samples = new List<(Instrument instrument, double value)>();
    listener.SetMeasurementEventCallback<double>((instrument, value, _, _) =>
    {
        if (ReferenceEquals(meter, instrument.Meter))
        {
            samples.Add((instrument, value));
        }
    });

    listener.Start();

    // The unhandled exception escapes the gauge callbacks and is reported by the listener.
    Assert.Throws<AggregateException>(() => listener.RecordObservableInstruments());
    Assert.DoesNotContain(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);

    // Only one minute passes, yet the next observation must read again and publish a value.
    fakeTime.Advance(TimeSpan.FromMinutes(1));
    listener.RecordObservableInstruments();

    // Single (not SingleOrDefault) so that a missing sample fails here with a clear message
    // rather than surfacing as a default tuple with value 0.
    var metric = samples.Single(x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
    Assert.Equal(1234.0 / 2000.0, metric.value, 0.01);

    parserMock.Verify(p => p.GetMemoryUsageInBytes(), Times.Exactly(3));
}
}
Loading