diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs index 24fa20a11ed..63c6c137411 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponse.cs @@ -47,10 +47,10 @@ public SpeechToTextResponse(string? content) /// Gets or sets the ID of the speech to text response. public string? ResponseId { get; set; } - /// Gets or sets the model ID used in the creation of the speech to text completion. + /// Gets or sets the model ID used in the creation of the speech to text response. public string? ModelId { get; set; } - /// Gets or sets the raw representation of the speech to text completion from an underlying implementation. + /// Gets or sets the raw representation of the speech to text response from an underlying implementation. /// /// If a is created to represent some underlying object from another object /// model, this property can be used to store that original object. This can be useful for debugging or @@ -59,7 +59,7 @@ public SpeechToTextResponse(string? content) [JsonIgnore] public object? RawRepresentation { get; set; } - /// Gets or sets any additional properties associated with the speech to text completion. + /// Gets or sets any additional properties associated with the speech to text response. public AdditionalPropertiesDictionary? AdditionalProperties { get; set; } /// Gets the text of this speech to text response. @@ -76,9 +76,15 @@ public SpeechToTextResponse(string? content) /// An array of instances that may be used to represent this . public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates() { - SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate + IList contents = Contents; + if (Usage is { } usage) { - Contents = Contents, + contents = [.. contents, new UsageContent(usage)]; + } + + SpeechToTextResponseUpdate update = new() + { + Contents = contents, AdditionalProperties = AdditionalProperties, RawRepresentation = RawRepresentation, StartTime = StartTime, @@ -98,4 +104,7 @@ public IList Contents get => _contents ??= []; set => _contents = value; } + + /// Gets or sets usage details for the speech to text response. + public UsageDetails? Usage { get; set; } } diff --git a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs index 230ec838ba3..0f83a7a8bee 100644 --- a/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs +++ b/src/Libraries/Microsoft.Extensions.AI.Abstractions/SpeechToText/SpeechToTextResponseUpdateExtensions.cs @@ -1,13 +1,14 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Threading; using System.Threading.Tasks; using Microsoft.Shared.Diagnostics; +#pragma warning disable S1121 // Assignments should not be made from within sub-expressions + namespace Microsoft.Extensions.AI; /// @@ -25,32 +26,13 @@ public static SpeechToTextResponse ToSpeechToTextResponse( _ = Throw.IfNull(updates); SpeechToTextResponse response = new(); - List contents = []; - string? responseId = null; - string? modelId = null; - AdditionalPropertiesDictionary? additionalProperties = null; - TimeSpan? endTime = null; foreach (var update in updates) { - // Track the first start time provided by the updates - response.StartTime ??= update.StartTime; - - // Track the last end time provided by the updates - if (update.EndTime is not null) - { - endTime = update.EndTime; - } - - ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties); + ProcessUpdate(update, response); } - ChatResponseExtensions.CoalesceTextContent(contents); - response.EndTime = endTime; - response.Contents = contents; - response.ResponseId = responseId; - response.ModelId = modelId; - response.AdditionalProperties = additionalProperties; + ChatResponseExtensions.CoalesceTextContent((List)response.Contents); return response; } @@ -70,33 +52,13 @@ static async Task ToResponseAsync( IAsyncEnumerable updates, CancellationToken cancellationToken) { SpeechToTextResponse response = new(); - List contents = []; - string? responseId = null; - string? modelId = null; - AdditionalPropertiesDictionary? additionalProperties = null; - TimeSpan? endTime = null; await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false)) { - // Track the first start time provided by the updates - response.StartTime ??= update.StartTime; - - // Track the last end time provided by the updates - if (update.EndTime is not null) - { - endTime = update.EndTime; - } - - ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties); + ProcessUpdate(update, response); } - ChatResponseExtensions.CoalesceTextContent(contents); - - response.EndTime = endTime; - response.Contents = contents; - response.ResponseId = responseId; - response.ModelId = modelId; - response.AdditionalProperties = additionalProperties; + ChatResponseExtensions.CoalesceTextContent((List)response.Contents); return response; } @@ -104,40 +66,59 @@ static async Task ToResponseAsync( /// Processes the , incorporating its contents and properties. /// The update to process. - /// The list of content items being accumulated. - /// The response ID to update if the update has one. - /// The model ID to update if the update has one. - /// The additional properties to update if the update has any. + /// The object that should be updated based on . private static void ProcessUpdate( SpeechToTextResponseUpdate update, - List contents, - ref string? responseId, - ref string? modelId, - ref AdditionalPropertiesDictionary? additionalProperties) + SpeechToTextResponse response) { if (update.ResponseId is not null) { - responseId = update.ResponseId; + response.ResponseId = update.ResponseId; } if (update.ModelId is not null) { - modelId = update.ModelId; + response.ModelId = update.ModelId; } - contents.AddRange(update.Contents); + if (response.StartTime is null || (update.StartTime is not null && update.StartTime < response.StartTime)) + { + // Track the first start time provided by the updates + response.StartTime = update.StartTime; + } + + if (response.EndTime is null || (update.EndTime is not null && update.EndTime > response.EndTime)) + { + // Track the last end time provided by the updates + response.EndTime = update.EndTime; + } + + foreach (var content in update.Contents) + { + switch (content) + { + // Usage content is treated specially and propagated to the response's Usage. + case UsageContent usage: + (response.Usage ??= new()).Add(usage.Details); + break; + + default: + response.Contents.Add(content); + break; + } + } if (update.AdditionalProperties is not null) { - if (additionalProperties is null) + if (response.AdditionalProperties is null) { - additionalProperties = new(update.AdditionalProperties); + response.AdditionalProperties = new(update.AdditionalProperties); } else { foreach (var entry in update.AdditionalProperties) { - additionalProperties[entry.Key] = entry.Value; + response.AdditionalProperties[entry.Key] = entry.Value; } } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs index 33b27b01291..5c2ff74279e 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseTests.cs @@ -31,6 +31,7 @@ public void Constructor_Parameterless_PropsDefaulted() Assert.Null(response.StartTime); Assert.Null(response.EndTime); Assert.Equal(string.Empty, response.ToString()); + Assert.Null(response.Usage); } [Theory] @@ -132,6 +133,11 @@ public void Properties_Roundtrip() List newContents = [new TextContent("text1"), new TextContent("text2")]; response.Contents = newContents; Assert.Same(newContents, response.Contents); + + Assert.Null(response.Usage); + UsageDetails usageDetails = new(); + response.Usage = usageDetails; + Assert.Same(usageDetails, response.Usage); } [Fact] @@ -152,6 +158,7 @@ public void JsonSerialization_Roundtrips() EndTime = TimeSpan.FromSeconds(2), RawRepresentation = new(), AdditionalProperties = new() { ["key"] = "value" }, + Usage = new() { InputTokenCount = 42, OutputTokenCount = 84, TotalTokenCount = 126 }, }; string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse); @@ -176,6 +183,11 @@ public void JsonSerialization_Roundtrips() Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value)); Assert.IsType(value); Assert.Equal("value", ((JsonElement)value!).GetString()); + + Assert.NotNull(result.Usage); + Assert.Equal(42, result.Usage.InputTokenCount); + Assert.Equal(84, result.Usage.OutputTokenCount); + Assert.Equal(126, result.Usage.TotalTokenCount); } [Fact] @@ -185,8 +197,10 @@ public void ToString_OutputsText() Assert.Equal("This is a test." + Environment.NewLine + "It's multiple lines.", response.ToString()); } - [Fact] - public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate() + [Theory] + [InlineData(false)] + [InlineData(true)] + public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate(bool withUsage) { // Arrange: create a response with contents SpeechToTextResponse response = new() @@ -202,6 +216,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate() ResponseId = "12345", ModelId = "someModel", AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 }, + Usage = withUsage ? new UsageDetails { InputTokenCount = 100, OutputTokenCount = 200, TotalTokenCount = 300 } : null }; // Act: convert to streaming updates @@ -217,7 +232,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate() Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime); Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime); - Assert.Equal(3, update.Contents.Count); + Assert.Equal(withUsage ? 4 : 3, update.Contents.Count); Assert.Equal("Hello, ", Assert.IsType(update.Contents[0]).Text); Assert.Equal("image/png", Assert.IsType(update.Contents[1]).MediaType); Assert.Equal("world!", Assert.IsType(update.Contents[2]).Text); @@ -225,5 +240,13 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate() Assert.NotNull(update.AdditionalProperties); Assert.Equal("value1", update.AdditionalProperties["key1"]); Assert.Equal(42, update.AdditionalProperties["key2"]); + + if (withUsage) + { + var usage = Assert.IsType(update.Contents[3]); + Assert.Equal(100, usage.Details.InputTokenCount); + Assert.Equal(200, usage.Details.OutputTokenCount); + Assert.Equal(300, usage.Details.TotalTokenCount); + } } } diff --git a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs index f0a2f08ab13..5d5a035bfe8 100644 --- a/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs +++ b/test/Libraries/Microsoft.Extensions.AI.Abstractions.Tests/SpeechToText/SpeechToTextResponseUpdateExtensionsTests.cs @@ -70,6 +70,8 @@ public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsy Assert.Equal("d", response.AdditionalProperties["c"]); Assert.Equal("Hello human, How are You?", response.Text); + + Assert.Null(response.Usage); } [Theory] @@ -129,6 +131,28 @@ void AddGap() } } + [Fact] + public async Task ToSpeechToTextResponse_UsageContentExtractedFromContents() + { + SpeechToTextResponseUpdate[] updates = + { + new() { Contents = [new TextContent("Hello, ")] }, + new() { Contents = [new UsageContent(new() { TotalTokenCount = 42 })] }, + new() { Contents = [new TextContent("world!")] }, + new() { Contents = [new UsageContent(new() { InputTokenCount = 12, TotalTokenCount = 24 })] }, + }; + + SpeechToTextResponse response = await YieldAsync(updates).ToSpeechToTextResponseAsync(); + + Assert.NotNull(response); + + Assert.NotNull(response.Usage); + Assert.Equal(12, response.Usage.InputTokenCount); + Assert.Equal(66, response.Usage.TotalTokenCount); + + Assert.Equal("Hello, world!", Assert.IsType(Assert.Single(response.Contents)).Text); + } + private static async IAsyncEnumerable YieldAsync(IEnumerable updates) { foreach (SpeechToTextResponseUpdate update in updates)