Skip to content

Commit 19f9c6c

Browse files
Support response caching for safety evaluators (#6277)
Introduce an `IChatClient` for Safety evaluators so that we can support response caching in the same way as we do for other LLM-based evaluators. Also includes some changes to introduce a `GetContents` API on the `EvaluationContext` type. This aids with some of the changes in this PR and should also help with #6033 in a subsequent PR. Fixes #6260
1 parent daeadf4 commit 19f9c6c

29 files changed

+822
-589
lines changed

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluatorContext.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
77
// constructor syntax.
88

9+
using System.Collections.Generic;
10+
911
namespace Microsoft.Extensions.AI.Evaluation.Quality;
1012

1113
/// <summary>
@@ -29,4 +31,8 @@ public sealed class EquivalenceEvaluatorContext(string groundTruth) : Evaluation
2931
/// the response supplied via <see cref="GroundTruth"/>.
3032
/// </remarks>
3133
public string GroundTruth { get; } = groundTruth;
34+
35+
/// <inheritdoc/>
36+
public override IReadOnlyList<AIContent> GetContents()
37+
=> [new TextContent(GroundTruth)];
3238
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluatorContext.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
// We disable this warning because it is a false positive arising from the analyzer's lack of support for C#'s primary
77
// constructor syntax.
88

9+
using System.Collections.Generic;
10+
911
namespace Microsoft.Extensions.AI.Evaluation.Quality;
1012

1113
/// <summary>
@@ -29,4 +31,8 @@ public sealed class GroundednessEvaluatorContext(string groundingContext) : Eval
2931
/// in the information present in the supplied <see cref="GroundingContext"/>.
3032
/// </remarks>
3133
public string GroundingContext { get; } = groundingContext;
34+
35+
/// <inheritdoc/>
36+
public override IReadOnlyList<AIContent> GetContents()
37+
=> [new TextContent(GroundingContext)];
3238
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,9 @@ await JsonOutputFixer.RepairJsonAsync(
171171
result.AddDiagnosticToAllMetrics(
172172
EvaluationDiagnostic.Error(
173173
$"""
174-
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
175-
{evaluationResponseText}
176-
"""));
174+
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
175+
{evaluationResponseText}
176+
"""));
177177
}
178178
else
179179
{
@@ -186,10 +186,10 @@ await JsonOutputFixer.RepairJsonAsync(
186186
result.AddDiagnosticToAllMetrics(
187187
EvaluationDiagnostic.Error(
188188
$"""
189-
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
190-
{evaluationResponseText}
191-
{ex}
192-
"""));
189+
Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
190+
{evaluationResponseText}
191+
{ex}
192+
"""));
193193
}
194194
}
195195
}
@@ -211,28 +211,28 @@ void UpdateResult()
211211

212212
if (!string.IsNullOrWhiteSpace(evaluationResponse.ModelId))
213213
{
214-
commonMetadata["rtc-evaluation-model-used"] = evaluationResponse.ModelId!;
214+
commonMetadata["evaluation-model-used"] = evaluationResponse.ModelId!;
215215
}
216216

217217
if (evaluationResponse.Usage is UsageDetails usage)
218218
{
219219
if (usage.InputTokenCount is not null)
220220
{
221-
commonMetadata["rtc-evaluation-input-tokens-used"] = $"{usage.InputTokenCount}";
221+
commonMetadata["evaluation-input-tokens-used"] = $"{usage.InputTokenCount}";
222222
}
223223

224224
if (usage.OutputTokenCount is not null)
225225
{
226-
commonMetadata["rtc-evaluation-output-tokens-used"] = $"{usage.OutputTokenCount}";
226+
commonMetadata["evaluation-output-tokens-used"] = $"{usage.OutputTokenCount}";
227227
}
228228

229229
if (usage.TotalTokenCount is not null)
230230
{
231-
commonMetadata["rtc-evaluation-total-tokens-used"] = $"{usage.TotalTokenCount}";
231+
commonMetadata["evaluation-total-tokens-used"] = $"{usage.TotalTokenCount}";
232232
}
233233
}
234234

235-
commonMetadata["rtc-evaluation-duration"] = duration;
235+
commonMetadata["evaluation-duration"] = duration;
236236

237237
NumericMetric relevance = result.Get<NumericMetric>(RelevanceMetricName);
238238
relevance.Value = rating.Relevance;

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/CSharp/ReportingConfiguration.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ await ResponseCacheProvider.GetCacheAsync(
263263

264264
private static IEnumerable<string> GetCachingKeysForChatClient(IChatClient chatClient)
265265
{
266-
var metadata = chatClient.GetService<ChatClientMetadata>();
266+
ChatClientMetadata? metadata = chatClient.GetService<ChatClientMetadata>();
267267

268268
string? providerName = metadata?.ProviderName;
269269
if (!string.IsNullOrWhiteSpace(providerName))

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting/TypeScript/components/ChatDetailsSection.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export const ChatDetailsSection = ({ chatDetails }: { chatDetails: ChatDetails;
2424
<div className={classes.section}>
2525
<div className={classes.sectionHeader} onClick={() => setIsExpanded(!isExpanded)}>
2626
{isExpanded ? <ChevronDown12Regular /> : <ChevronRight12Regular />}
27-
<h3 className={classes.sectionHeaderText}>LLM Chat Diagnostic Details</h3>
27+
<h3 className={classes.sectionHeaderText}>Diagnostic Data</h3>
2828
{hasCacheStatus && (
2929
<div className={classes.hint}>
3030
{cachedTurns != totalTurns ?

src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/CodeVulnerabilityEvaluator.cs

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System.Collections.Generic;
5-
using System.Linq;
65
using System.Threading;
76
using System.Threading.Tasks;
7+
using Microsoft.Shared.Diagnostics;
88

99
namespace Microsoft.Extensions.AI.Evaluation.Safety;
1010

@@ -31,26 +31,17 @@ namespace Microsoft.Extensions.AI.Evaluation.Safety;
3131
/// will be ignored.
3232
/// </para>
3333
/// </remarks>
34-
/// <param name="contentSafetyServiceConfiguration">
35-
/// Specifies the Azure AI project that should be used and credentials that should be used when this
36-
/// <see cref="ContentSafetyEvaluator"/> communicates with the Azure AI Content Safety service to perform
37-
/// evaluations.
38-
/// </param>
39-
public sealed class CodeVulnerabilityEvaluator(ContentSafetyServiceConfiguration contentSafetyServiceConfiguration)
34+
public sealed class CodeVulnerabilityEvaluator()
4035
: ContentSafetyEvaluator(
41-
contentSafetyServiceConfiguration,
4236
contentSafetyServiceAnnotationTask: "code vulnerability",
43-
evaluatorName: nameof(CodeVulnerabilityEvaluator))
37+
metricNames: new Dictionary<string, string> { ["code_vulnerability"] = CodeVulnerabilityMetricName })
4438
{
4539
/// <summary>
4640
/// Gets the <see cref="EvaluationMetric.Name"/> of the <see cref="BooleanMetric"/> returned by
4741
/// <see cref="CodeVulnerabilityEvaluator"/>.
4842
/// </summary>
4943
public static string CodeVulnerabilityMetricName => "Code Vulnerability";
5044

51-
/// <inheritdoc/>
52-
public override IReadOnlyCollection<string> EvaluationMetricNames => [CodeVulnerabilityMetricName];
53-
5445
/// <inheritdoc/>
5546
public override async ValueTask<EvaluationResult> EvaluateAsync(
5647
IEnumerable<ChatMessage> messages,
@@ -59,30 +50,18 @@ public override async ValueTask<EvaluationResult> EvaluateAsync(
5950
IEnumerable<EvaluationContext>? additionalContext = null,
6051
CancellationToken cancellationToken = default)
6152
{
62-
const string CodeVulnerabilityContentSafetyServiceMetricName = "code_vulnerability";
53+
_ = Throw.IfNull(chatConfiguration);
54+
_ = Throw.IfNull(modelResponse);
6355

6456
EvaluationResult result =
6557
await EvaluateContentSafetyAsync(
58+
chatConfiguration.ChatClient,
6659
messages,
6760
modelResponse,
61+
additionalContext,
6862
contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.ContextCompletion.ToString(),
69-
contentSafetyServiceMetricName: CodeVulnerabilityContentSafetyServiceMetricName,
7063
cancellationToken: cancellationToken).ConfigureAwait(false);
7164

72-
IEnumerable<EvaluationMetric> updatedMetrics =
73-
result.Metrics.Values.Select(
74-
metric =>
75-
{
76-
if (metric.Name == CodeVulnerabilityContentSafetyServiceMetricName)
77-
{
78-
metric.Name = CodeVulnerabilityMetricName;
79-
}
80-
81-
return metric;
82-
});
83-
84-
result = new EvaluationResult(updatedMetrics);
85-
result.Interpret(metric => metric is BooleanMetric booleanMetric ? booleanMetric.InterpretScore() : null);
8665
return result;
8766
}
8867
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Safety/ContentHarmEvaluator.cs

Lines changed: 15 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,44 +2,27 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System.Collections.Generic;
5-
using System.Linq;
65
using System.Threading;
76
using System.Threading.Tasks;
7+
using Microsoft.Shared.Diagnostics;
88

99
namespace Microsoft.Extensions.AI.Evaluation.Safety;
1010

11-
#pragma warning disable S1694 // An abstract class should have both abstract and concrete methods
1211
/// <summary>
1312
/// An <see langword="abstract"/> base class that can be used to implement <see cref="IEvaluator"/>s that utilize the
1413
/// Azure AI Content Safety service to evaluate responses produced by an AI model for the presence of a variety of
1514
/// harmful content such as violence, hate speech, etc.
1615
/// </summary>
17-
/// <param name="contentSafetyServiceConfiguration">
18-
/// Specifies the Azure AI project that should be used and credentials that should be used when this
19-
/// <see cref="ContentSafetyEvaluator"/> communicates with the Azure AI Content Safety service to perform evaluations.
20-
/// </param>
21-
/// <param name="contentSafetyServiceMetricName">
22-
/// The name of the metric that should be used when this <see cref="ContentSafetyEvaluator"/> communicates with the
23-
/// Azure AI Content Safety service to perform evaluations.
16+
/// <param name="metricNames">
17+
/// A dictionary containing the mapping from the names of the metrics that are used when communicating with the Azure
18+
/// AI Content Safety service to the <see cref="EvaluationMetric.Name"/>s of the <see cref="EvaluationMetric"/>s returned by
19+
/// this <see cref="IEvaluator"/>.
2420
/// </param>
25-
/// <param name="metricName">
26-
/// The name of the <see cref="EvaluationMetric"/> produced by this <see cref="ContentSafetyEvaluator"/>.
27-
/// </param>
28-
/// <param name="evaluatorName">The name of the derived <see cref="ContentSafetyEvaluator"/>.</param>
29-
public abstract class ContentHarmEvaluator(
30-
ContentSafetyServiceConfiguration contentSafetyServiceConfiguration,
31-
string contentSafetyServiceMetricName,
32-
string metricName,
33-
string evaluatorName)
34-
: ContentSafetyEvaluator(
35-
contentSafetyServiceConfiguration,
36-
contentSafetyServiceAnnotationTask: "content harm",
37-
evaluatorName)
21+
#pragma warning disable S1694 // An abstract class should have both abstract and concrete methods
22+
public abstract class ContentHarmEvaluator(IDictionary<string, string> metricNames)
23+
: ContentSafetyEvaluator(contentSafetyServiceAnnotationTask: "content harm", metricNames)
3824
#pragma warning restore S1694
3925
{
40-
/// <inheritdoc/>
41-
public override IReadOnlyCollection<string> EvaluationMetricNames => [metricName];
42-
4326
/// <inheritdoc/>
4427
public sealed override async ValueTask<EvaluationResult> EvaluateAsync(
4528
IEnumerable<ChatMessage> messages,
@@ -48,28 +31,21 @@ public sealed override async ValueTask<EvaluationResult> EvaluateAsync(
4831
IEnumerable<EvaluationContext>? additionalContext = null,
4932
CancellationToken cancellationToken = default)
5033
{
34+
_ = Throw.IfNull(chatConfiguration);
35+
_ = Throw.IfNull(modelResponse);
36+
5137
EvaluationResult result =
5238
await EvaluateContentSafetyAsync(
39+
chatConfiguration.ChatClient,
5340
messages,
5441
modelResponse,
42+
additionalContext,
5543
contentSafetyServicePayloadFormat: ContentSafetyServicePayloadFormat.Conversation.ToString(),
56-
contentSafetyServiceMetricName: contentSafetyServiceMetricName,
5744
cancellationToken: cancellationToken).ConfigureAwait(false);
5845

59-
IEnumerable<EvaluationMetric> updatedMetrics =
60-
result.Metrics.Values.Select(
61-
metric =>
62-
{
63-
if (metric.Name == contentSafetyServiceMetricName)
64-
{
65-
metric.Name = metricName;
66-
}
67-
68-
return metric;
69-
});
46+
result.Interpret(
47+
metric => metric is NumericMetric numericMetric ? numericMetric.InterpretContentHarmScore() : null);
7048

71-
result = new EvaluationResult(updatedMetrics);
72-
result.Interpret(metric => metric is NumericMetric numericMetric ? numericMetric.InterpretHarmScore() : null);
7349
return result;
7450
}
7551
}

0 commit comments

Comments
 (0)