dotnet · stephentoub · Jun 27, 2025 · Jun 26, 2025
@@ -47,10 +47,10 @@ public SpeechToTextResponse(string? content)
     /// <summary>Gets or sets the ID of the speech to text response.</summary>
     public string? ResponseId { get; set; }
 
-    /// <summary>Gets or sets the model ID used in the creation of the speech to text completion.</summary>
+    /// <summary>Gets or sets the model ID used in the creation of the speech to text response.</summary>
     public string? ModelId { get; set; }
 
-    /// <summary>Gets or sets the raw representation of the speech to text completion from an underlying implementation.</summary>
+    /// <summary>Gets or sets the raw representation of the speech to text response from an underlying implementation.</summary>
     /// <remarks>
     /// If a <see cref="SpeechToTextResponse"/> is created to represent some underlying object from another object
     /// model, this property can be used to store that original object. This can be useful for debugging or
@@ -59,7 +59,7 @@ public SpeechToTextResponse(string? content)
     [JsonIgnore]
     public object? RawRepresentation { get; set; }
 
-    /// <summary>Gets or sets any additional properties associated with the speech to text completion.</summary>
+    /// <summary>Gets or sets any additional properties associated with the speech to text response.</summary>
     public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }
 
     /// <summary>Gets the text of this speech to text response.</summary>
@@ -76,9 +76,15 @@ public SpeechToTextResponse(string? content)
     /// <returns>An array of <see cref="SpeechToTextResponseUpdate" /> instances that may be used to represent this <see cref="SpeechToTextResponse" />.</returns>
     public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates()
     {
-        SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate
+        IList<AIContent> contents = Contents;
+        if (Usage is { } usage)
         {
-            Contents = Contents,
+            contents = [.. contents, new UsageContent(usage)];
+        }
+
+        SpeechToTextResponseUpdate update = new()
+        {
+            Contents = contents,
             AdditionalProperties = AdditionalProperties,
             RawRepresentation = RawRepresentation,
             StartTime = StartTime,
@@ -98,4 +104,7 @@ public IList<AIContent> Contents
         get => _contents ??= [];
         set => _contents = value;
     }
+
+    /// <summary>Gets or sets usage details for the speech to text response.</summary>
+    /// <remarks>
+    /// When this property is non-<see langword="null"/>, <see cref="ToSpeechToTextResponseUpdates"/> appends a
+    /// <see cref="UsageContent"/> wrapping it to the contents of the update it produces.
+    /// </remarks>
+    public UsageDetails? Usage { get; set; }
 }
@@ -1,13 +1,14 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.Threading;
 using System.Threading.Tasks;
 using Microsoft.Shared.Diagnostics;
 
+#pragma warning disable S1121 // Assignments should not be made from within sub-expressions
+
 namespace Microsoft.Extensions.AI;
 
 /// <summary>
@@ -25,32 +26,13 @@ public static SpeechToTextResponse ToSpeechToTextResponse(
         _ = Throw.IfNull(updates);
 
         SpeechToTextResponse response = new();
-        List<AIContent> contents = [];
-        string? responseId = null;
-        string? modelId = null;
-        AdditionalPropertiesDictionary? additionalProperties = null;
 
-        TimeSpan? endTime = null;
         foreach (var update in updates)
         {
-            // Track the first start time provided by the updates
-            response.StartTime ??= update.StartTime;
-
-            // Track the last end time provided by the updates
-            if (update.EndTime is not null)
-            {
-                endTime = update.EndTime;
-            }
-
-            ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
+            ProcessUpdate(update, response);
         }
 
-        ChatResponseExtensions.CoalesceTextContent(contents);
-        response.EndTime = endTime;
-        response.Contents = contents;
-        response.ResponseId = responseId;
-        response.ModelId = modelId;
-        response.AdditionalProperties = additionalProperties;
+        ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);
 
         return response;
     }
@@ -70,74 +52,73 @@ static async Task<SpeechToTextResponse> ToResponseAsync(
             IAsyncEnumerable<SpeechToTextResponseUpdate> updates, CancellationToken cancellationToken)
         {
             SpeechToTextResponse response = new();
-            List<AIContent> contents = [];
-            string? responseId = null;
-            string? modelId = null;
-            AdditionalPropertiesDictionary? additionalProperties = null;
 
-            TimeSpan? endTime = null;
             await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false))
             {
-                // Track the first start time provided by the updates
-                response.StartTime ??= update.StartTime;
-
-                // Track the last end time provided by the updates
-                if (update.EndTime is not null)
-                {
-                    endTime = update.EndTime;
-                }
-
-                ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
+                ProcessUpdate(update, response);
             }
 
-            ChatResponseExtensions.CoalesceTextContent(contents);
-
-            response.EndTime = endTime;
-            response.Contents = contents;
-            response.ResponseId = responseId;
-            response.ModelId = modelId;
-            response.AdditionalProperties = additionalProperties;
+            ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);
 
             return response;
         }
     }
 
     /// <summary>Processes the <see cref="SpeechToTextResponseUpdate"/>, incorporating its contents and properties.</summary>
     /// <param name="update">The update to process.</param>
-    /// <param name="contents">The list of content items being accumulated.</param>
-    /// <param name="responseId">The response ID to update if the update has one.</param>
-    /// <param name="modelId">The model ID to update if the update has one.</param>
-    /// <param name="additionalProperties">The additional properties to update if the update has any.</param>
+    /// <param name="response">The <see cref="SpeechToTextResponse"/> object that should be updated based on <paramref name="update"/>.</param>
     private static void ProcessUpdate(
         SpeechToTextResponseUpdate update,
-        List<AIContent> contents,
-        ref string? responseId,
-        ref string? modelId,
-        ref AdditionalPropertiesDictionary? additionalProperties)
+        SpeechToTextResponse response)
     {
         if (update.ResponseId is not null)
         {
-            responseId = update.ResponseId;
+            response.ResponseId = update.ResponseId;
         }
 
         if (update.ModelId is not null)
         {
-            modelId = update.ModelId;
+            response.ModelId = update.ModelId;
         }
 
-        contents.AddRange(update.Contents);
+        if (response.StartTime is null || (update.StartTime is not null && update.StartTime < response.StartTime))
+        {
+            // Track the earliest start time provided by the updates
+            response.StartTime = update.StartTime;
+        }
+
+        if (response.EndTime is null || (update.EndTime is not null && update.EndTime > response.EndTime))
+        {
+            // Track the latest end time provided by the updates
+            response.EndTime = update.EndTime;
+        }
+
+        foreach (var content in update.Contents)
+        {
+            switch (content)
+            {
+                // Usage content is treated specially and propagated to the response's Usage.
+                case UsageContent usage:
+                    (response.Usage ??= new()).Add(usage.Details);
+                    break;
+
+                default:
+                    response.Contents.Add(content);
+                    break;
+            }
+        }
 
         if (update.AdditionalProperties is not null)
         {
-            if (additionalProperties is null)
+            if (response.AdditionalProperties is null)
             {
-                additionalProperties = new(update.AdditionalProperties);
+                response.AdditionalProperties = new(update.AdditionalProperties);
             }
             else
             {
                 foreach (var entry in update.AdditionalProperties)
                 {
-                    additionalProperties[entry.Key] = entry.Value;
+                    response.AdditionalProperties[entry.Key] = entry.Value;
                 }
             }
         }

@@ -31,6 +31,7 @@ public void Constructor_Parameterless_PropsDefaulted()
         Assert.Null(response.StartTime);
         Assert.Null(response.EndTime);
         Assert.Equal(string.Empty, response.ToString());
+        Assert.Null(response.Usage);
     }
 
     [Theory]
@@ -132,6 +133,11 @@ public void Properties_Roundtrip()
         List<AIContent> newContents = [new TextContent("text1"), new TextContent("text2")];
         response.Contents = newContents;
         Assert.Same(newContents, response.Contents);
+
+        Assert.Null(response.Usage);
+        UsageDetails usageDetails = new();
+        response.Usage = usageDetails;
+        Assert.Same(usageDetails, response.Usage);
     }
 
     [Fact]
@@ -152,6 +158,7 @@ public void JsonSerialization_Roundtrips()
             EndTime = TimeSpan.FromSeconds(2),
             RawRepresentation = new(),
             AdditionalProperties = new() { ["key"] = "value" },
+            Usage = new() { InputTokenCount = 42, OutputTokenCount = 84, TotalTokenCount = 126 },
         };
 
         string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse);
@@ -176,6 +183,11 @@ public void JsonSerialization_Roundtrips()
         Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value));
         Assert.IsType<JsonElement>(value);
         Assert.Equal("value", ((JsonElement)value!).GetString());
+
+        Assert.NotNull(result.Usage);
+        Assert.Equal(42, result.Usage.InputTokenCount);
+        Assert.Equal(84, result.Usage.OutputTokenCount);
+        Assert.Equal(126, result.Usage.TotalTokenCount);
     }
 
     [Fact]
@@ -185,8 +197,10 @@ public void ToString_OutputsText()
         Assert.Equal("This is a test." + Environment.NewLine + "It's multiple lines.", response.ToString());
     }
 
-    [Fact]
-    public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
+    [Theory]
+    [InlineData(false)]
+    [InlineData(true)]
+    public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate(bool withUsage)
     {
         // Arrange: create a response with contents
         SpeechToTextResponse response = new()
@@ -202,6 +216,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
             ResponseId = "12345",
             ModelId = "someModel",
             AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 },
+            Usage = withUsage ? new UsageDetails { InputTokenCount = 100, OutputTokenCount = 200, TotalTokenCount = 300 } : null
         };
 
         // Act: convert to streaming updates
@@ -217,13 +232,21 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
         Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime);
         Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime);
 
-        Assert.Equal(3, update.Contents.Count);
+        Assert.Equal(withUsage ? 4 : 3, update.Contents.Count);
         Assert.Equal("Hello, ", Assert.IsType<TextContent>(update.Contents[0]).Text);
         Assert.Equal("image/png", Assert.IsType<DataContent>(update.Contents[1]).MediaType);
         Assert.Equal("world!", Assert.IsType<TextContent>(update.Contents[2]).Text);
 
         Assert.NotNull(update.AdditionalProperties);
         Assert.Equal("value1", update.AdditionalProperties["key1"]);
         Assert.Equal(42, update.AdditionalProperties["key2"]);
+
+        if (withUsage)
+        {
+            var usage = Assert.IsType<UsageContent>(update.Contents[3]);
+            Assert.Equal(100, usage.Details.InputTokenCount);
+            Assert.Equal(200, usage.Details.OutputTokenCount);
+            Assert.Equal(300, usage.Details.TotalTokenCount);
+        }
     }
 }
@@ -70,6 +70,8 @@ public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsy
         Assert.Equal("d", response.AdditionalProperties["c"]);
 
         Assert.Equal("Hello human, How are You?", response.Text);
+
+        Assert.Null(response.Usage);
     }
 
     [Theory]
@@ -129,6 +131,28 @@ void AddGap()
         }
     }
 
+    // Verifies that UsageContent items carried by streaming updates are folded into
+    // SpeechToTextResponse.Usage instead of being left in the response's Contents.
+    // Runs against both the synchronous and asynchronous conversion paths.
+    [Theory]
+    [InlineData(false)]
+    [InlineData(true)]
+    public async Task ToSpeechToTextResponse_UsageContentExtractedFromContents(bool useAsync)
+    {
+        // Arrange: interleave text and usage content across separate updates.
+        SpeechToTextResponseUpdate[] updates =
+        [
+            new() { Contents = [new TextContent("Hello, ")] },
+            new() { Contents = [new UsageContent(new() { TotalTokenCount = 42 })] },
+            new() { Contents = [new TextContent("world!")] },
+            new() { Contents = [new UsageContent(new() { InputTokenCount = 12, TotalTokenCount = 24 })] },
+        ];
+
+        // Act: combine the updates into a single response.
+        SpeechToTextResponse response = useAsync ?
+            await YieldAsync(updates).ToSpeechToTextResponseAsync() :
+            updates.ToSpeechToTextResponse();
+
+        // Assert
+        Assert.NotNull(response);
+
+        // Usage from both UsageContent items is expected to be accumulated: 42 + 24 = 66 total tokens,
+        // with the input token count coming solely from the second item.
+        Assert.NotNull(response.Usage);
+        Assert.Equal(12, response.Usage.InputTokenCount);
+        Assert.Equal(66, response.Usage.TotalTokenCount);
+
+        // Only the text survives in Contents, coalesced into a single TextContent.
+        Assert.Equal("Hello, world!", Assert.IsType<TextContent>(Assert.Single(response.Contents)).Text);
+    }
+
     private static async IAsyncEnumerable<SpeechToTextResponseUpdate> YieldAsync(IEnumerable<SpeechToTextResponseUpdate> updates)
     {
         foreach (SpeechToTextResponseUpdate update in updates)