
Commit 68d1661

Introduce Content Safety evaluators (#6223)
The new evaluators ship in a new Microsoft.Extensions.AI.Evaluation.Safety package.

Also includes the following public API changes:
- Add a Metadata dictionary on EvaluationMetric.
- Make EvaluationMetric.Diagnostics nullable.
- Convert instance functions on some (fully mutable) result types to extension methods in the same namespace.

And some reporting improvements, including:
- Change the boolean metric representation in the metric card UI from Pass / Fail to Yes / No.
- Display the above Metadata contents in a table in the metric details view when a metric card is clicked.
- Improve the display of diagnostics in metric details - diagnostics are now also displayed in a table with proper formatting and an option to copy diagnostics to the clipboard.

Fixes #5937
1 parent 5298642 commit 68d1661
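As a rough, consumer-side illustration of the API changes listed above (not code from this commit): after an evaluation produces an EvaluationResult, the new per-metric metadata and the now-nullable diagnostics can be inspected. Only result.Get&lt;NumericMetric&gt;(...), the string-to-string metadata values, and EvaluationDiagnostic appear in the diffs below; the metric name, the exact shape of the Metadata and Diagnostics properties, and the printed output format are assumptions.

// Hypothetical sketch, not part of this commit. Assumes using directives for
// System, System.Collections.Generic and Microsoft.Extensions.AI.Evaluation,
// plus an existing `EvaluationResult result` produced by one of the evaluators.
NumericMetric metric = result.Get<NumericMetric>("Coherence"); // metric name is assumed

// New in this commit: a Metadata dictionary on EvaluationMetric (populated with
// entries such as "evaluation-model-used" and "evaluation-duration" by the
// updated evaluators shown further down this page).
if (metric.Metadata is not null)
{
    foreach (KeyValuePair<string, string> entry in metric.Metadata)
    {
        Console.WriteLine($"{entry.Key}: {entry.Value}");
    }
}

// Also new in this commit: EvaluationMetric.Diagnostics is nullable, so check before enumerating.
if (metric.Diagnostics is not null)
{
    foreach (EvaluationDiagnostic diagnostic in metric.Diagnostics)
    {
        Console.WriteLine(diagnostic.ToString());
    }
}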

File tree

64 files changed (+3590 / -470 lines)


eng/packages/General.props

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup>
+    <PackageVersion Include="Azure.Core" Version="1.45.0" />
     <PackageVersion Include="Azure.Identity" Version="1.13.2" />
     <PackageVersion Include="Azure.Storage.Files.DataLake" Version="12.21.0" />
     <PackageVersion Include="Azure.AI.Inference" Version="1.0.0-beta.4" />

src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/Program.cs

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ private static async Task<int> Main(string[] args)
         // TASK: Support some mechanism to fail a build (i.e. return a failure exit code) based on one or more user
         // specified criteria (e.g., if x% of metrics were deemed 'poor'). Ideally this mechanism would be flexible /
         // extensible enough to allow users to configure multiple different kinds of failure criteria.
-
+        // See https://github.com/dotnet/extensions/issues/6038.
 #if DEBUG
         ParseResult parseResult = rootCmd.Parse(args);
         if (parseResult.HasOption(debugOpt))

src/Libraries/Microsoft.Extensions.AI.Evaluation.Console/README.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation.
 * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness.
+* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Content Safety service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
 * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluatorContext.cs

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,8 @@
 namespace Microsoft.Extensions.AI.Evaluation.Quality;
 
 /// <summary>
-/// Contextual information required to evaluate the 'Equivalence' of a response.
+/// Contextual information that the <see cref="EquivalenceEvaluator"/> uses to evaluate the 'Equivalence' of a
+/// response.
 /// </summary>
 /// <param name="groundTruth">
 /// The ground truth response against which the response that is being evaluated is compared.

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluatorContext.cs

Lines changed: 2 additions & 1 deletion
@@ -9,7 +9,8 @@
 namespace Microsoft.Extensions.AI.Evaluation.Quality;
 
 /// <summary>
-/// Contextual information required to evaluate the 'Groundedness' of a response.
+/// Contextual information that the <see cref="GroundednessEvaluator"/> uses to evaluate the 'Groundedness' of a
+/// response.
 /// </summary>
 /// <param name="groundingContext">
 /// Contextual information against which the 'Groundedness' of a response is evaluated.

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/README.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation.
 * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness.
+* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Content Safety service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
 * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs

Lines changed: 106 additions & 41 deletions
@@ -7,6 +7,9 @@
 // constructor syntax.
 
 using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+using System.Linq;
 using System.Text;
 using System.Text.Json;
 using System.Threading;
@@ -125,71 +128,112 @@ protected override async ValueTask PerformEvaluationAsync(
         EvaluationResult result,
         CancellationToken cancellationToken)
     {
-        ChatResponse evaluationResponse =
-            await chatConfiguration.ChatClient.GetResponseAsync(
-                evaluationMessages,
-                _chatOptions,
-                cancellationToken: cancellationToken).ConfigureAwait(false);
-
-        string evaluationResponseText = evaluationResponse.Text.Trim();
+        ChatResponse evaluationResponse;
         Rating rating;
+        string duration;
+        Stopwatch stopwatch = Stopwatch.StartNew();
 
-        if (string.IsNullOrEmpty(evaluationResponseText))
-        {
-            rating = Rating.Inconclusive;
-            result.AddDiagnosticToAllMetrics(
-                EvaluationDiagnostic.Error(
-                    "Evaluation failed because the model failed to produce a valid evaluation response."));
-        }
-        else
+        try
        {
-            try
+            evaluationResponse =
+                await chatConfiguration.ChatClient.GetResponseAsync(
+                    evaluationMessages,
+                    _chatOptions,
+                    cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            string evaluationResponseText = evaluationResponse.Text.Trim();
+            if (string.IsNullOrEmpty(evaluationResponseText))
             {
-                rating = Rating.FromJson(evaluationResponseText!);
+                rating = Rating.Inconclusive;
+                result.AddDiagnosticToAllMetrics(
+                    EvaluationDiagnostic.Error(
+                        "Evaluation failed because the model failed to produce a valid evaluation response."));
             }
-            catch (JsonException)
+            else
             {
                 try
                 {
-                    string repairedJson =
-                        await JsonOutputFixer.RepairJsonAsync(
-                            chatConfiguration,
-                            evaluationResponseText!,
-                            cancellationToken).ConfigureAwait(false);
-
-                    if (string.IsNullOrEmpty(repairedJson))
+                    rating = Rating.FromJson(evaluationResponseText!);
+                }
+                catch (JsonException)
+                {
+                    try
                     {
-                        rating = Rating.Inconclusive;
-                        result.AddDiagnosticToAllMetrics(
-                            EvaluationDiagnostic.Error(
-                                $"""
+                        string repairedJson =
+                            await JsonOutputFixer.RepairJsonAsync(
+                                chatConfiguration,
+                                evaluationResponseText!,
+                                cancellationToken).ConfigureAwait(false);
+
+                        if (string.IsNullOrEmpty(repairedJson))
+                        {
+                            rating = Rating.Inconclusive;
+                            result.AddDiagnosticToAllMetrics(
+                                EvaluationDiagnostic.Error(
+                                    $"""
                                     Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
                                     {evaluationResponseText}
                                     """));
+                        }
+                        else
+                        {
+                            rating = Rating.FromJson(repairedJson!);
+                        }
                     }
-                    else
+                    catch (JsonException ex)
                     {
-                        rating = Rating.FromJson(repairedJson!);
-                    }
-                }
-                catch (JsonException ex)
-                {
-                    rating = Rating.Inconclusive;
-                    result.AddDiagnosticToAllMetrics(
-                        EvaluationDiagnostic.Error(
-                            $"""
+                        rating = Rating.Inconclusive;
+                        result.AddDiagnosticToAllMetrics(
+                            EvaluationDiagnostic.Error(
+                                $"""
                                 Failed to repair the following response from the model and parse scores for '{RelevanceMetricName}', '{TruthMetricName}' and '{CompletenessMetricName}'.:
                                 {evaluationResponseText}
                                 {ex}
                                 """));
+                    }
                 }
             }
         }
+        finally
+        {
+            stopwatch.Stop();
+            duration = $"{stopwatch.Elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+        }
 
-        UpdateResult(rating);
+        UpdateResult();
 
-        void UpdateResult(Rating rating)
+        void UpdateResult()
         {
+            const string Rationales = "Rationales";
+            const string Separator = "; ";
+
+            var commonMetadata = new Dictionary<string, string>();
+
+            if (!string.IsNullOrWhiteSpace(evaluationResponse.ModelId))
+            {
+                commonMetadata["rtc-evaluation-model-used"] = evaluationResponse.ModelId!;
+            }
+
+            if (evaluationResponse.Usage is UsageDetails usage)
+            {
+                if (usage.InputTokenCount is not null)
+                {
+                    commonMetadata["rtc-evaluation-input-tokens-used"] = $"{usage.InputTokenCount}";
+                }
+
+                if (usage.OutputTokenCount is not null)
+                {
+                    commonMetadata["rtc-evaluation-output-tokens-used"] = $"{usage.OutputTokenCount}";
+                }
+
+                if (usage.TotalTokenCount is not null)
+                {
+                    commonMetadata["rtc-evaluation-total-tokens-used"] = $"{usage.TotalTokenCount}";
+                }
+            }
+
+            commonMetadata["rtc-evaluation-duration"] = duration;
+
             NumericMetric relevance = result.Get<NumericMetric>(RelevanceMetricName);
             relevance.Value = rating.Relevance;
             relevance.Interpretation = relevance.InterpretScore();
@@ -198,6 +242,13 @@ void UpdateResult(Rating rating)
                 relevance.Reason = rating.RelevanceReasoning!;
             }
 
+            relevance.AddOrUpdateMetadata(commonMetadata);
+            if (rating.RelevanceReasons.Any())
+            {
+                string value = string.Join(Separator, rating.RelevanceReasons);
+                relevance.AddOrUpdateMetadata(name: Rationales, value);
+            }
+
             NumericMetric truth = result.Get<NumericMetric>(TruthMetricName);
             truth.Value = rating.Truth;
             truth.Interpretation = truth.InterpretScore();
@@ -206,6 +257,13 @@ void UpdateResult(Rating rating)
                 truth.Reason = rating.TruthReasoning!;
             }
 
+            truth.AddOrUpdateMetadata(commonMetadata);
+            if (rating.TruthReasons.Any())
+            {
+                string value = string.Join(Separator, rating.TruthReasons);
+                truth.AddOrUpdateMetadata(name: Rationales, value);
+            }
+
             NumericMetric completeness = result.Get<NumericMetric>(CompletenessMetricName);
             completeness.Value = rating.Completeness;
             completeness.Interpretation = completeness.InterpretScore();
@@ -214,6 +272,13 @@ void UpdateResult(Rating rating)
                 completeness.Reason = rating.CompletenessReasoning!;
             }
 
+            completeness.AddOrUpdateMetadata(commonMetadata);
+            if (rating.CompletenessReasons.Any())
+            {
+                string value = string.Join(Separator, rating.CompletenessReasons);
+                completeness.AddOrUpdateMetadata(name: Rationales, value);
+            }
+
             if (!string.IsNullOrWhiteSpace(rating.Error))
             {
                 result.AddDiagnosticToAllMetrics(EvaluationDiagnostic.Error(rating.Error!));

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/SingleNumericMetricEvaluator.cs

Lines changed: 57 additions & 22 deletions
@@ -2,6 +2,8 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
 using System.Threading;
 using System.Threading.Tasks;
 using Microsoft.Shared.Diagnostics;
@@ -65,33 +67,66 @@ protected sealed override async ValueTask PerformEvaluationAsync(
         _ = Throw.IfNull(chatConfiguration);
         _ = Throw.IfNull(result);
 
-        ChatResponse evaluationResponse =
-            await chatConfiguration.ChatClient.GetResponseAsync(
-                evaluationMessages,
-                _chatOptions,
-                cancellationToken: cancellationToken).ConfigureAwait(false);
-
-        string evaluationResponseText = evaluationResponse.Text.Trim();
-
+        Stopwatch stopwatch = Stopwatch.StartNew();
         NumericMetric metric = result.Get<NumericMetric>(MetricName);
 
-        if (string.IsNullOrEmpty(evaluationResponseText))
-        {
-            metric.AddDiagnostic(
-                EvaluationDiagnostic.Error(
-                    "Evaluation failed because the model failed to produce a valid evaluation response."));
-        }
-        else if (int.TryParse(evaluationResponseText, out int score))
+        try
         {
-            metric.Value = score;
+            ChatResponse evaluationResponse =
+                await chatConfiguration.ChatClient.GetResponseAsync(
+                    evaluationMessages,
+                    _chatOptions,
+                    cancellationToken: cancellationToken).ConfigureAwait(false);
+
+            if (!string.IsNullOrWhiteSpace(evaluationResponse.ModelId))
+            {
+                metric.AddOrUpdateMetadata(name: "evaluation-model-used", value: evaluationResponse.ModelId!);
+            }
+
+            if (evaluationResponse.Usage is UsageDetails usage)
+            {
+                if (usage.InputTokenCount is not null)
+                {
+                    metric.AddOrUpdateMetadata(name: "evaluation-input-tokens-used", value: $"{usage.InputTokenCount}");
+                }
+
+                if (usage.OutputTokenCount is not null)
+                {
+                    metric.AddOrUpdateMetadata(name: "evaluation-output-tokens-used", value: $"{usage.OutputTokenCount}");
+                }
+
+                if (usage.TotalTokenCount is not null)
+                {
+                    metric.AddOrUpdateMetadata(name: "evaluation-total-tokens-used", value: $"{usage.TotalTokenCount}");
+                }
+            }
+
+            string evaluationResponseText = evaluationResponse.Text.Trim();
+
+            if (string.IsNullOrEmpty(evaluationResponseText))
+            {
+                metric.AddDiagnostic(
+                    EvaluationDiagnostic.Error(
+                        "Evaluation failed because the model failed to produce a valid evaluation response."));
+            }
+            else if (int.TryParse(evaluationResponseText, out int score))
+            {
+                metric.Value = score;
+            }
+            else
+            {
+                metric.AddDiagnostic(
+                    EvaluationDiagnostic.Error(
+                        $"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
+            }
+
+            metric.Interpretation = metric.InterpretScore();
         }
-        else
+        finally
        {
-            metric.AddDiagnostic(
-                EvaluationDiagnostic.Error(
-                    $"Failed to parse '{evaluationResponseText!}' as an integer score for '{MetricName}'."));
+            stopwatch.Stop();
+            string duration = $"{stopwatch.Elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
+            metric.AddOrUpdateMetadata(name: "evaluation-duration", value: duration);
         }
-
-        metric.Interpretation = metric.InterpretScore();
     }
 }
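Both evaluators above now share the same timing pattern: start a Stopwatch, wrap the model call in try/finally, and format the elapsed time with the invariant culture so the recorded duration metadata is locale-independent. Below is a minimal standalone sketch of that pattern, assuming using directives for System, System.Diagnostics, System.Globalization and System.Threading.Tasks; the MeasureAsync helper name is made up for illustration and is not an API from this commit.

// Hypothetical helper, not part of this commit; it only restates the
// Stopwatch + InvariantCulture pattern used by the evaluators above.
static async Task<string> MeasureAsync(Func<Task> evaluation)
{
    Stopwatch stopwatch = Stopwatch.StartNew();
    try
    {
        await evaluation().ConfigureAwait(false);
    }
    finally
    {
        stopwatch.Stop();
    }

    // "F2" with CultureInfo.InvariantCulture yields values like "1.52 s"
    // regardless of the machine's locale, matching the metadata written above.
    return $"{stopwatch.Elapsed.TotalSeconds.ToString("F2", CultureInfo.InvariantCulture)} s";
}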

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/README.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 
 * [`Microsoft.Extensions.AI.Evaluation`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation) - Defines core abstractions and types for supporting evaluation.
 * [`Microsoft.Extensions.AI.Evaluation.Quality`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Quality) - Contains evaluators that can be used to evaluate the quality of AI responses in your projects including Relevance, Truth, Completeness, Fluency, Coherence, Equivalence and Groundedness.
+* [`Microsoft.Extensions.AI.Evaluation.Safety`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Safety) - Contains a set of evaluators that are built atop the Azure AI Content Safety service that can be used to evaluate the content safety of AI responses in your projects including Protected Material, Groundedness Pro, Ungrounded Attributes, Hate and Unfairness, Self Harm, Violence, Sexual, Code Vulnerability and Indirect Attack.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting) - Contains support for caching LLM responses, storing the results of evaluations and generating reports from that data.
 * [`Microsoft.Extensions.AI.Evaluation.Reporting.Azure`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Reporting.Azure) - Supports the `Microsoft.Extensions.AI.Evaluation.Reporting` library with an implementation for caching LLM responses and storing the evaluation results in an Azure Storage container.
 * [`Microsoft.Extensions.AI.Evaluation.Console`](https://www.nuget.org/packages/Microsoft.Extensions.AI.Evaluation.Console) - A command line dotnet tool for generating reports and managing evaluation data.
