Skip to content

Commit c9aa5d3

Browse files
github-actions[bot], niver2d2, nivebb8, evgenyfedorov2
authored
[release/9.4] Fix Process Metric calculation in CgroupsV2 (#6324)
* Fix Process Metric
* Removing Process Metric
* Adding Counters

---------

Co-authored-by: Nidhi Verma <[email protected]>
Co-authored-by: nivebb8 <[email protected]>
Co-authored-by: evgenyfedorov2 <[email protected]>
1 parent 20c931e commit c9aa5d3

File tree

5 files changed: +71 lines added, -7 lines removed

5 files changed

+71
-7
lines changed

eng/Versions.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
<PropertyGroup Label="Version settings">
33
<MajorVersion>9</MajorVersion>
44
<MinorVersion>4</MinorVersion>
5-
<PatchVersion>1</PatchVersion>
5+
<PatchVersion>2</PatchVersion>
66
<PreReleaseVersionLabel>preview</PreReleaseVersionLabel>
77
<PreReleaseVersionIteration>1</PreReleaseVersionIteration>
88
<VersionPrefix>$(MajorVersion).$(MinorVersion).$(PatchVersion)</VersionPrefix>

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
1414
{
1515
private const double One = 1.0;
1616
private const long Hundred = 100L;
17+
private const double CpuLimitThreshold110Percent = 1.1;
18+
19+
// Meters to track CPU utilization threshold exceedances
20+
private readonly Counter<long>? _cpuUtilizationLimit100PercentExceededCounter;
21+
private readonly Counter<long>? _cpuUtilizationLimit110PercentExceededCounter;
1722

1823
private readonly object _cpuLocker = new();
1924
private readonly object _memoryLocker = new();
@@ -38,6 +43,8 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
3843
private double _memoryPercentage;
3944
private long _previousCgroupCpuTime;
4045
private long _previousHostCpuTime;
46+
private long _cpuUtilizationLimit100PercentExceeded;
47+
private long _cpuUtilizationLimit110PercentExceeded;
4148
public SystemResources Resources { get; }
4249

4350
public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILinuxUtilizationParser parser,
@@ -77,17 +84,21 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
7784

7885
// Try to get the CPU request from cgroup
7986
cpuRequest = _parser.GetCgroupRequestCpuV2();
80-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuLimit, unit: "1");
87+
88+
// Initialize the counters
89+
_cpuUtilizationLimit100PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_100_percent_exceeded");
90+
_cpuUtilizationLimit110PercentExceededCounter = meter.CreateCounter<long>("cpu_utilization_limit_110_percent_exceeded");
91+
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilizationLimit(cpuLimit), unit: "1");
8192
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilizationWithoutHostDelta() / cpuRequest, unit: "1");
8293
}
8394
else
8495
{
8596
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuLimit, unit: "1");
8697
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
98+
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
8799
}
88100

89101
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: MemoryUtilization, unit: "1");
90-
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessCpuUtilization, observeValue: () => CpuUtilization() * _scaleRelativeToCpuRequest, unit: "1");
91102
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ProcessMemoryUtilization, observeValue: MemoryUtilization, unit: "1");
92103

93104
// cpuRequest is a CPU request (aka guaranteed number of CPU units) for pod, for host its 1 core
@@ -138,6 +149,34 @@ public double CpuUtilizationWithoutHostDelta()
138149
return _lastCpuCoresUsed;
139150
}
140151

152+
/// <summary>
153+
/// Calculates CPU utilization relative to the CPU limit.
154+
/// </summary>
155+
/// <param name="cpuLimit">The CPU limit to use for the calculation.</param>
156+
/// <returns>CPU usage as a ratio of the limit.</returns>
157+
public double CpuUtilizationLimit(float cpuLimit)
158+
{
159+
double utilization = CpuUtilizationWithoutHostDelta() / cpuLimit;
160+
161+
// Increment counter if utilization exceeds 1 (100%)
162+
if (utilization > 1.0)
163+
{
164+
_cpuUtilizationLimit100PercentExceededCounter?.Add(1);
165+
_cpuUtilizationLimit100PercentExceeded++;
166+
Log.CounterMessage100(_logger, _cpuUtilizationLimit100PercentExceeded);
167+
}
168+
169+
// Increment counter if utilization exceeds 110%
170+
if (utilization > CpuLimitThreshold110Percent)
171+
{
172+
_cpuUtilizationLimit110PercentExceededCounter?.Add(1);
173+
_cpuUtilizationLimit110PercentExceeded++;
174+
Log.CounterMessage110(_logger, _cpuUtilizationLimit110PercentExceeded);
175+
}
176+
177+
return utilization;
178+
}
179+
141180
public double CpuUtilization()
142181
{
143182
DateTimeOffset now = _timeProvider.GetUtcNow();

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/Log.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,16 @@ public static partial void CpuUsageDataV2(
4444
long previousCgroupCpuTime,
4545
double actualElapsedNanoseconds,
4646
double cpuCores);
47+
48+
[LoggerMessage(5, LogLevel.Debug,
49+
"CPU utilization exceeded 100%: Counter = {counterValue}")]
50+
public static partial void CounterMessage100(
51+
ILogger logger,
52+
long counterValue);
53+
54+
[LoggerMessage(6, LogLevel.Debug,
55+
"CPU utilization exceeded 110%: Counter = {counterValue}")]
56+
public static partial void CounterMessage110(
57+
ILogger logger,
58+
long counterValue);
4759
}

src/Shared/Instruments/ResourceUtilizationInstruments.cs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,22 @@ internal static class ResourceUtilizationInstruments
6565
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableUpDownCounter{T}"/>.
6666
/// </remarks>
6767
public const string SystemNetworkConnections = "system.network.connections";
68+
69+
/// <summary>
70+
/// The name of an instrument to count occurrences when CPU utilization exceeds 100% of the limit.
71+
/// </summary>
72+
/// <remarks>
73+
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.Counter{T}"/>.
74+
/// </remarks>
75+
public const string CpuUtilizationLimit100PercentExceeded = "cpu.utilization.limit.100percent.exceeded";
76+
77+
/// <summary>
78+
/// The name of an instrument to count occurrences when CPU utilization exceeds 110% of the limit.
79+
/// </summary>
80+
/// <remarks>
81+
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.Counter{T}"/>.
82+
/// </remarks>
83+
public const string CpuUtilizationLimit110PercentExceeded = "cpu.utilization.limit.110percent.exceeded";
6884
}
6985

7086
#pragma warning disable CS1574

test/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring.Tests/Linux/LinuxUtilizationProviderTests.cs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,7 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
258258
listener.Start();
259259
listener.RecordObservableInstruments();
260260

261-
Assert.Equal(5, samples.Count);
261+
Assert.Equal(4, samples.Count);
262262

263263
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ContainerCpuLimitUtilization);
264264
Assert.True(double.IsNaN(samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ContainerCpuLimitUtilization).value));
@@ -269,9 +269,6 @@ public void Provider_Registers_Instruments_CgroupV2_WithoutHostCpu()
269269
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ContainerMemoryLimitUtilization);
270270
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ContainerMemoryLimitUtilization).value);
271271

272-
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessCpuUtilization);
273-
Assert.True(double.IsNaN(samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessCpuUtilization).value));
274-
275272
Assert.Contains(samples, x => x.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization);
276273
Assert.Equal(1, samples.Single(i => i.instrument.Name == ResourceUtilizationInstruments.ProcessMemoryUtilization).value);
277274
}

0 commit comments

Comments
 (0)