Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ public SpeechToTextResponse(string? content)
/// <summary>Gets or sets the ID of the speech to text response.</summary>
public string? ResponseId { get; set; }

/// <summary>Gets or sets the model ID used in the creation of the speech to text completion.</summary>
/// <summary>Gets or sets the model ID used in the creation of the speech to text response.</summary>
public string? ModelId { get; set; }

/// <summary>Gets or sets the raw representation of the speech to text completion from an underlying implementation.</summary>
/// <summary>Gets or sets the raw representation of the speech to text response from an underlying implementation.</summary>
/// <remarks>
/// If a <see cref="SpeechToTextResponse"/> is created to represent some underlying object from another object
/// model, this property can be used to store that original object. This can be useful for debugging or
Expand All @@ -59,7 +59,7 @@ public SpeechToTextResponse(string? content)
[JsonIgnore]
public object? RawRepresentation { get; set; }

/// <summary>Gets or sets any additional properties associated with the speech to text completion.</summary>
/// <summary>Gets or sets any additional properties associated with the speech to text response.</summary>
public AdditionalPropertiesDictionary? AdditionalProperties { get; set; }

/// <summary>Gets the text of this speech to text response.</summary>
Expand All @@ -76,9 +76,15 @@ public SpeechToTextResponse(string? content)
/// <returns>An array of <see cref="SpeechToTextResponseUpdate" /> instances that may be used to represent this <see cref="SpeechToTextResponse" />.</returns>
public SpeechToTextResponseUpdate[] ToSpeechToTextResponseUpdates()
{
SpeechToTextResponseUpdate update = new SpeechToTextResponseUpdate
IList<AIContent> contents = Contents;
if (Usage is { } usage)
{
Contents = Contents,
contents = [.. contents, new UsageContent(usage)];
}

SpeechToTextResponseUpdate update = new()
{
Contents = contents,
AdditionalProperties = AdditionalProperties,
RawRepresentation = RawRepresentation,
StartTime = StartTime,
Expand All @@ -98,4 +104,7 @@ public IList<AIContent> Contents
get => _contents ??= [];
set => _contents = value;
}

/// <summary>Gets or sets usage details for the speech to text response.</summary>
/// <remarks>
/// When this property is non-null, <see cref="ToSpeechToTextResponseUpdates"/> appends a
/// <see cref="UsageContent"/> wrapping it to the contents of the produced update.
/// Conversely, when a response is assembled from updates, any <see cref="UsageContent"/>
/// items found in the updates are accumulated into this property instead of being added
/// to <see cref="Contents"/>.
/// </remarks>
public UsageDetails? Usage { get; set; }
}
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;

#pragma warning disable S1121 // Assignments should not be made from within sub-expressions

namespace Microsoft.Extensions.AI;

/// <summary>
Expand All @@ -25,32 +26,13 @@ public static SpeechToTextResponse ToSpeechToTextResponse(
_ = Throw.IfNull(updates);

SpeechToTextResponse response = new();
List<AIContent> contents = [];
string? responseId = null;
string? modelId = null;
AdditionalPropertiesDictionary? additionalProperties = null;

TimeSpan? endTime = null;
foreach (var update in updates)
{
// Track the first start time provided by the updates
response.StartTime ??= update.StartTime;

// Track the last end time provided by the updates
if (update.EndTime is not null)
{
endTime = update.EndTime;
}

ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
ProcessUpdate(update, response);
}

ChatResponseExtensions.CoalesceTextContent(contents);
response.EndTime = endTime;
response.Contents = contents;
response.ResponseId = responseId;
response.ModelId = modelId;
response.AdditionalProperties = additionalProperties;
ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);

return response;
}
Expand All @@ -70,74 +52,73 @@ static async Task<SpeechToTextResponse> ToResponseAsync(
IAsyncEnumerable<SpeechToTextResponseUpdate> updates, CancellationToken cancellationToken)
{
SpeechToTextResponse response = new();
List<AIContent> contents = [];
string? responseId = null;
string? modelId = null;
AdditionalPropertiesDictionary? additionalProperties = null;

TimeSpan? endTime = null;
await foreach (var update in updates.WithCancellation(cancellationToken).ConfigureAwait(false))
{
// Track the first start time provided by the updates
response.StartTime ??= update.StartTime;

// Track the last end time provided by the updates
if (update.EndTime is not null)
{
endTime = update.EndTime;
}

ProcessUpdate(update, contents, ref responseId, ref modelId, ref additionalProperties);
ProcessUpdate(update, response);
}

ChatResponseExtensions.CoalesceTextContent(contents);

response.EndTime = endTime;
response.Contents = contents;
response.ResponseId = responseId;
response.ModelId = modelId;
response.AdditionalProperties = additionalProperties;
ChatResponseExtensions.CoalesceTextContent((List<AIContent>)response.Contents);

return response;
}
}

/// <summary>Processes the <see cref="SpeechToTextResponseUpdate"/>, incorporating its contents and properties.</summary>
/// <param name="update">The update to process.</param>
/// <param name="contents">The list of content items being accumulated.</param>
/// <param name="responseId">The response ID to update if the update has one.</param>
/// <param name="modelId">The model ID to update if the update has one.</param>
/// <param name="additionalProperties">The additional properties to update if the update has any.</param>
/// <param name="response">The <see cref="SpeechToTextResponse"/> object that should be updated based on <paramref name="update"/>.</param>
private static void ProcessUpdate(
SpeechToTextResponseUpdate update,
List<AIContent> contents,
ref string? responseId,
ref string? modelId,
ref AdditionalPropertiesDictionary? additionalProperties)
SpeechToTextResponse response)
{
if (update.ResponseId is not null)
{
responseId = update.ResponseId;
response.ResponseId = update.ResponseId;
}

if (update.ModelId is not null)
{
modelId = update.ModelId;
response.ModelId = update.ModelId;
}

contents.AddRange(update.Contents);
if (response.StartTime is null || (update.StartTime is not null && update.StartTime < response.StartTime))
{
// Track the earliest start time provided by the updates
response.StartTime = update.StartTime;
}

if (response.EndTime is null || (update.EndTime is not null && update.EndTime > response.EndTime))
{
// Track the latest end time provided by the updates
response.EndTime = update.EndTime;
}

foreach (var content in update.Contents)
{
switch (content)
{
// Usage content is treated specially and propagated to the response's Usage.
case UsageContent usage:
(response.Usage ??= new()).Add(usage.Details);
break;

default:
response.Contents.Add(content);
break;
}
}

if (update.AdditionalProperties is not null)
{
if (additionalProperties is null)
if (response.AdditionalProperties is null)
{
additionalProperties = new(update.AdditionalProperties);
response.AdditionalProperties = new(update.AdditionalProperties);
}
else
{
foreach (var entry in update.AdditionalProperties)
{
additionalProperties[entry.Key] = entry.Value;
response.AdditionalProperties[entry.Key] = entry.Value;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public void Constructor_Parameterless_PropsDefaulted()
Assert.Null(response.StartTime);
Assert.Null(response.EndTime);
Assert.Equal(string.Empty, response.ToString());
Assert.Null(response.Usage);
}

[Theory]
Expand Down Expand Up @@ -132,6 +133,11 @@ public void Properties_Roundtrip()
List<AIContent> newContents = [new TextContent("text1"), new TextContent("text2")];
response.Contents = newContents;
Assert.Same(newContents, response.Contents);

Assert.Null(response.Usage);
UsageDetails usageDetails = new();
response.Usage = usageDetails;
Assert.Same(usageDetails, response.Usage);
}

[Fact]
Expand All @@ -152,6 +158,7 @@ public void JsonSerialization_Roundtrips()
EndTime = TimeSpan.FromSeconds(2),
RawRepresentation = new(),
AdditionalProperties = new() { ["key"] = "value" },
Usage = new() { InputTokenCount = 42, OutputTokenCount = 84, TotalTokenCount = 126 },
};

string json = JsonSerializer.Serialize(original, TestJsonSerializerContext.Default.SpeechToTextResponse);
Expand All @@ -176,6 +183,11 @@ public void JsonSerialization_Roundtrips()
Assert.True(result.AdditionalProperties.TryGetValue("key", out object? value));
Assert.IsType<JsonElement>(value);
Assert.Equal("value", ((JsonElement)value!).GetString());

Assert.NotNull(result.Usage);
Assert.Equal(42, result.Usage.InputTokenCount);
Assert.Equal(84, result.Usage.OutputTokenCount);
Assert.Equal(126, result.Usage.TotalTokenCount);
}

[Fact]
Expand All @@ -185,8 +197,10 @@ public void ToString_OutputsText()
Assert.Equal("This is a test." + Environment.NewLine + "It's multiple lines.", response.ToString());
}

[Fact]
public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
[Theory]
[InlineData(false)]
[InlineData(true)]
public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate(bool withUsage)
{
// Arrange: create a response with contents
SpeechToTextResponse response = new()
Expand All @@ -202,6 +216,7 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
ResponseId = "12345",
ModelId = "someModel",
AdditionalProperties = new() { ["key1"] = "value1", ["key2"] = 42 },
Usage = withUsage ? new UsageDetails { InputTokenCount = 100, OutputTokenCount = 200, TotalTokenCount = 300 } : null
};

// Act: convert to streaming updates
Expand All @@ -217,13 +232,21 @@ public void ToSpeechToTextResponseUpdates_ReturnsExpectedUpdate()
Assert.Equal(TimeSpan.FromSeconds(1), update.StartTime);
Assert.Equal(TimeSpan.FromSeconds(2), update.EndTime);

Assert.Equal(3, update.Contents.Count);
Assert.Equal(withUsage ? 4 : 3, update.Contents.Count);
Assert.Equal("Hello, ", Assert.IsType<TextContent>(update.Contents[0]).Text);
Assert.Equal("image/png", Assert.IsType<DataContent>(update.Contents[1]).MediaType);
Assert.Equal("world!", Assert.IsType<TextContent>(update.Contents[2]).Text);

Assert.NotNull(update.AdditionalProperties);
Assert.Equal("value1", update.AdditionalProperties["key1"]);
Assert.Equal(42, update.AdditionalProperties["key2"]);

if (withUsage)
{
var usage = Assert.IsType<UsageContent>(update.Contents[3]);
Assert.Equal(100, usage.Details.InputTokenCount);
Assert.Equal(200, usage.Details.OutputTokenCount);
Assert.Equal(300, usage.Details.TotalTokenCount);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ public async Task ToSpeechToTextResponse_SuccessfullyCreatesResponse(bool useAsy
Assert.Equal("d", response.AdditionalProperties["c"]);

Assert.Equal("Hello human, How are You?", response.Text);

Assert.Null(response.Usage);
}

[Theory]
Expand Down Expand Up @@ -129,6 +131,28 @@ void AddGap()
}
}

[Fact]
public async Task ToSpeechToTextResponse_UsageContentExtractedFromContents()
{
    // Arrange: text updates interleaved with updates that carry only usage information.
    SpeechToTextResponseUpdate[] streamedUpdates =
    [
        new() { Contents = [new TextContent("Hello, ")] },
        new() { Contents = [new UsageContent(new() { TotalTokenCount = 42 })] },
        new() { Contents = [new TextContent("world!")] },
        new() { Contents = [new UsageContent(new() { InputTokenCount = 12, TotalTokenCount = 24 })] },
    ];

    // Act: combine the asynchronous stream of updates into a single response.
    SpeechToTextResponse combined = await YieldAsync(streamedUpdates).ToSpeechToTextResponseAsync();

    // Assert: the usage items were summed into Usage...
    Assert.NotNull(combined);
    Assert.NotNull(combined.Usage);
    Assert.Equal(12, combined.Usage.InputTokenCount);
    Assert.Equal(66, combined.Usage.TotalTokenCount);

    // ...and only the coalesced text remains in the contents.
    AIContent remaining = Assert.Single(combined.Contents);
    TextContent text = Assert.IsType<TextContent>(remaining);
    Assert.Equal("Hello, world!", text.Text);
}

private static async IAsyncEnumerable<SpeechToTextResponseUpdate> YieldAsync(IEnumerable<SpeechToTextResponseUpdate> updates)
{
foreach (SpeechToTextResponseUpdate update in updates)
Expand Down
Loading