diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs
index 24fa20a11ed..63c6c137411 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs
@@ -47,10 +47,10 @@ public SpeechToTextResponse(string? content)
/// Gets or sets the ID of the speech to text response.
public string? ResponseId { get; set; }
- /// Gets or sets the model ID used in the creation of the speech to text completion.
+ /// Gets or sets the model ID used in the creation of the speech to text response.
public string? ModelId { get; set; }
- /// Gets or sets the raw representation of the speech to text completion from an underlying implementation.
+ /// Gets or sets the raw representation of the speech to text response from an underlying implementation.
///
/// If a <see cref="SpeechToTextResponse"/> is created to represent some underlying object from another object
/// model, this property can be used to store that original object. This can be useful for debugging or
@@ -59,7 +59,7 @@ public SpeechToTextResponse(string? content)
[JsonIgnore]
public object? RawRepresentation { get; set; }
- /// Gets or sets any additional properties associated with the speech to text completion.
+ /// Gets or sets any additional properties associated with the speech to text response.
public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }
/// Gets the text of this speech to text response.
@@ -76,9 +76,15 @@ public SpeechToTextResponse(string? content)
/// An array of <see cref="SpeechToTextResponseUpdate"/> instances that may be used to represent this <see cref="SpeechToTextResponse"/>.
public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates()
{
- SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate
+ IList contents = Contents;
+ if (Usage is { } usage)
{
- Contents = Contents,
+ contents = [.. contents, new UsageContent(usage)];
+ }
+
+ SpeechToTextResponseUpdate update = new()
+ {
+ Contents = contents,
AdditionalProperties = AdditionalProperties,
RawRepresentation = RawRepresentation,
StartTime = StartTime,
@@ -98,4 +104,7 @@ public IList Contents
get => _contents ??= [];
set => _contents = value;
}
+
+ /// Gets or sets usage details for the speech to text response.
+ public UsageDetails? Usage { get; set; }
}
diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs
index 230ec838ba3..0f83a7a8bee 100644
--- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs
+++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs
@@ -1,13 +1,14 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
-using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;
+#pragma warning disable S1121 // Assignments should not be made from within sub-expressions
+
namespace Microsoft.Extensions.AI;
///
@@ -25,32 +26,13 @@ public static SpeechToTextResponse ToSpeechToTextResponse(
_ = Throw.IfNull(updates);
SpeechToTextResponse response = new();
- List contents = [];
- string? responseId = null;
- string? modelId = null;
- AdditionalPropertiesDictionary? additionalProperties = null;
- TimeSpan? endTime = null;
foreach (var update in updates)
{
- // Track the first start time provided by the updates
- response.StartTime ??= update.StartTime;
-
- // Track the last end time provided by the updates
- if (update.EndTime is not null)
- {
- endTime = update.EndTime;
- }
-
- ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
+ ProcessUpdate(update, response);
}
- ChatResponseExtensions.CoalesceTextContent(contents);
- response.EndTime = endTime;
- response.Contents = contents;
- response.ResponseId = responseId;
- response.ModelId = modelId;
- response.AdditionalProperties = additionalProperties;
+ ChatResponseExtensions.CoalesceTextContent((List)response.Contents);
return response;
}
@@ -70,33 +52,13 @@ static async Task ToResponseAsync(
IAsyncEnumerable updates, CancellationToken cancellationToken)
{
SpeechToTextResponse response = new();
- List contents = [];
- string? responseId = null;
- string? modelId = null;
- AdditionalPropertiesDictionary? additionalProperties = null;
- TimeSpan? endTime = null;
await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false))
{
- // Track the first start time provided by the updates
- response.StartTime ??= update.StartTime;
-
- // Track the last end time provided by the updates
- if (update.EndTime is not null)
- {
- endTime = update.EndTime;
- }
-
- ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
+ ProcessUpdate(update, response);
}
- ChatResponseExtensions.CoalesceTextContent(contents);
-
- response.EndTime = endTime;
- response.Contents = contents;
- response.ResponseId = responseId;
- response.ModelId = modelId;
- response.AdditionalProperties = additionalProperties;
+ ChatResponseExtensions.CoalesceTextContent((List)response.Contents);
return response;
}
@@ -104,40 +66,59 @@ static async Task ToResponseAsync(
/// Processes the <see cref="SpeechToTextResponseUpdate"/>, incorporating its contents and properties.
/// The update to process.
- /// The list of content items being accumulated.
- /// The response ID to update if the update has one.
- /// The model ID to update if the update has one.
- /// The additional properties to update if the update has any.
+ /// The object that should be updated based on <paramref name="update"/>.
private static void ProcessUpdate(
SpeechToTextResponseUpdate update,
- List contents,
- ref string? responseId,
- ref string? modelId,
- ref AdditionalPropertiesDictionary? additionalProperties)
+ SpeechToTextResponse response)
{
if (update.ResponseId is not null)
{
- responseId = update.ResponseId;
+ response.ResponseId = update.ResponseId;
}
if (update.ModelId is not null)
{
- modelId = update.ModelId;
+ response.ModelId = update.ModelId;
}
- contents.AddRange(update.Contents);
+ if (response.StartTime is null || (update.StartTime is not null && update.StartTime < response.StartTime))
+ {
+ // Track the earliest start time provided by the updates
+ response.StartTime = update.StartTime;
+ }
+
+ if (response.EndTime is null || (update.EndTime is not null && update.EndTime > response.EndTime))
+ {
+ // Track the latest end time provided by the updates
+ response.EndTime = update.EndTime;
+ }
+
+ foreach (var content in update.Contents)
+ {
+ switch (content)
+ {
+ // Usage content is treated specially and propagated to the response's Usage.
+ case UsageContent usage:
+ (response.Usage ??= new()).Add(usage.Details);
+ break;
+
+ default:
+ response.Contents.Add(content);
+ break;
+ }
+ }
if (update.AdditionalProperties is not null)
{
- if (additionalProperties is null)
+ if (response.AdditionalProperties is null)
{
- additionalProperties = new(update.AdditionalProperties);
+ response.AdditionalProperties = new(update.AdditionalProperties);
}
else
{
foreach (var entry in update.AdditionalProperties)
{
- additionalProperties[entry.Key] = entry.Value;
+ response.AdditionalProperties[entry.Key] = entry.Value;
}
}
}
diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs
index 33b27b01291..5c2ff74279e 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs
@@ -31,6 +31,7 @@ public void Constructor_Parameterless_PropsDefaulted()
Assert.Null(response.StartTime);
Assert.Null(response.EndTime);
Assert.Equal(string.Empty, response.ToString());
+ Assert.Null(response.Usage);
}
[Theory]
@@ -132,6 +133,11 @@ public void Properties_Roundtrip()
List newContents = [new TextContent("text1"), new TextContent("text2")];
response.Contents = newContents;
Assert.Same(newContents, response.Contents);
+
+ Assert.Null(response.Usage);
+ UsageDetails usageDetails = new();
+ response.Usage = usageDetails;
+ Assert.Same(usageDetails, response.Usage);
}
[Fact]
@@ -152,6 +158,7 @@ public void JsonSerialization_Roundtrips()
EndTime = TimeSpan.FromSeconds(2),
RawRepresentation = new(),
AdditionalProperties = new() { ["key"] = "value" },
+ Usage = new() { InputTokenCount = 42, OutputTokenCount = 84, TotalTokenCount = 126 },
};
string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse);
@@ -176,6 +183,11 @@ public void JsonSerialization_Roundtrips()
Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value));
Assert.IsType(value);
Assert.Equal("value", ((JsonElement)value!).GetString());
+
+ Assert.NotNull(result.Usage);
+ Assert.Equal(42, result.Usage.InputTokenCount);
+ Assert.Equal(84, result.Usage.OutputTokenCount);
+ Assert.Equal(126, result.Usage.TotalTokenCount);
}
[Fact]
@@ -185,8 +197,10 @@ public void ToString_OutputsText()
Assert.Equal("This is a test." + Environment.NewLine + "It's multiple lines.", response.ToString());
}
- [Fact]
- public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
+ [Theory]
+ [InlineData(false)]
+ [InlineData(true)]
+ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate(bool withUsage)
{
// Arrange: create a response with contents
SpeechToTextResponse response = new()
@@ -202,6 +216,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
ResponseId = "12345",
ModelId = "someModel",
AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 },
+ Usage = withUsage ? new UsageDetails { InputTokenCount = 100, OutputTokenCount = 200, TotalTokenCount = 300 } : null
};
// Act: convert to streaming updates
@@ -217,7 +232,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime);
Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime);
- Assert.Equal(3, update.Contents.Count);
+ Assert.Equal(withUsage ? 4 : 3, update.Contents.Count);
Assert.Equal("Hello, ", Assert.IsType(update.Contents[0]).Text);
Assert.Equal("image/png", Assert.IsType(update.Contents[1]).MediaType);
Assert.Equal("world!", Assert.IsType(update.Contents[2]).Text);
@@ -225,5 +240,13 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
Assert.NotNull(update.AdditionalProperties);
Assert.Equal("value1", update.AdditionalProperties["key1"]);
Assert.Equal(42, update.AdditionalProperties["key2"]);
+
+ if (withUsage)
+ {
+ var usage = Assert.IsType(update.Contents[3]);
+ Assert.Equal(100, usage.Details.InputTokenCount);
+ Assert.Equal(200, usage.Details.OutputTokenCount);
+ Assert.Equal(300, usage.Details.TotalTokenCount);
+ }
}
}
diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs
index f0a2f08ab13..5d5a035bfe8 100644
--- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs
+++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs
@@ -70,6 +70,8 @@ public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsy
Assert.Equal("d", response.AdditionalProperties["c"]);
Assert.Equal("Hello human, How are You?", response.Text);
+
+ Assert.Null(response.Usage);
}
[Theory]
@@ -129,6 +131,28 @@ void AddGap()
}
}
+ [Fact] // Checks that UsageContent items in streamed updates surface on response.Usage instead of response.Contents.
+ public async Task ToSpeechToTextResponse_UsageContentExtractedFromContents()
+ {
+ SpeechToTextResponseUpdate[] updates = // Arrange: text updates interleaved with usage-only updates.
+ {
+ new() { Contents = [new TextContent("Hello, ")] },
+ new() { Contents = [new UsageContent(new() { TotalTokenCount = 42 })] },
+ new() { Contents = [new TextContent("world!")] },
+ new() { Contents = [new UsageContent(new() { InputTokenCount = 12, TotalTokenCount = 24 })] },
+ };
+
+ SpeechToTextResponse response = await YieldAsync(updates).ToSpeechToTextResponseAsync(); // Act: fold the async stream into one response.
+
+ Assert.NotNull(response);
+
+ Assert.NotNull(response.Usage);
+ Assert.Equal(12, response.Usage.InputTokenCount); // Only the second usage item supplies an input token count.
+ Assert.Equal(66, response.Usage.TotalTokenCount); // 42 + 24: the totals of both usage items are expected to be combined.
+
+ Assert.Equal("Hello, world!", Assert.IsType(Assert.Single(response.Contents)).Text); // Exactly one content item is expected, carrying the joined text of both text updates.
+ }
+
private static async IAsyncEnumerable YieldAsync(IEnumerable updates)
{
foreach (SpeechToTextResponseUpdate update in updates)