diff --git a/packages/ai/src/prompt/convert-to-language-model-prompt.test.ts b/packages/ai/src/prompt/convert-to-language-model-prompt.test.ts
index 80b1ce7608ff..d34044049239 100644
--- a/packages/ai/src/prompt/convert-to-language-model-prompt.test.ts
+++ b/packages/ai/src/prompt/convert-to-language-model-prompt.test.ts
@@ -1426,5 +1426,127 @@ describe('convertToLanguageModelMessage', () => {
         }
       `);
     });
+
+    it('should convert URL in tool result content to base64', async () => {
+      const result = await convertToLanguageModelPrompt({
+        prompt: {
+          messages: [
+            {
+              role: 'tool',
+              content: [
+                {
+                  type: 'tool-result',
+                  toolName: 'screenshot',
+                  toolCallId: 'call-123',
+                  output: {
+                    type: 'content',
+                    value: [
+                      { type: 'text', text: 'Screenshot captured' },
+                      {
+                        type: 'media',
+                        data: 'https://example.com/screenshot.png',
+                        mediaType: 'image/png',
+                      },
+                    ],
+                  },
+                },
+              ],
+            },
+          ],
+        },
+        supportedUrls: {},
+        downloadImplementation: async ({ url }) => {
+          expect(url).toEqual(new URL('https://example.com/screenshot.png'));
+          return {
+            data: new Uint8Array([137, 80, 78, 71]), // PNG magic bytes
+            mediaType: 'image/png',
+          };
+        },
+      });
+
+      expect(result).toEqual([
+        {
+          role: 'tool',
+          content: [
+            {
+              type: 'tool-result',
+              toolCallId: 'call-123',
+              toolName: 'screenshot',
+              output: {
+                type: 'content',
+                value: [
+                  { type: 'text', text: 'Screenshot captured' },
+                  {
+                    type: 'media',
+                    data: 'iVBORw==', // base64 of [137, 80, 78, 71]
+                    mediaType: 'image/png',
+                  },
+                ],
+              },
+              providerOptions: undefined,
+            },
+          ],
+          providerOptions: undefined,
+        },
+      ]);
+    });
+
+    it('should not download base64 media in tool result content', async () => {
+      const result = await convertToLanguageModelPrompt({
+        prompt: {
+          messages: [
+            {
+              role: 'tool',
+              content: [
+                {
+                  type: 'tool-result',
+                  toolName: 'screenshot',
+                  toolCallId: 'call-123',
+                  output: {
+                    type: 'content',
+                    value: [
+                      {
+                        type: 'media',
+                        data: 'iVBORw==', // base64, not a URL
+                        mediaType: 'image/png',
+                      },
+                    ],
+                  },
+                },
+              ],
+            },
+          ],
+        },
+        supportedUrls: {},
+        downloadImplementation: async () => {
+          throw new Error('download should not be called');
+        },
+      });
+
+      expect(result).toEqual([
+        {
+          role: 'tool',
+          content: [
+            {
+              type: 'tool-result',
+              toolCallId: 'call-123',
+              toolName: 'screenshot',
+              output: {
+                type: 'content',
+                value: [
+                  {
+                    type: 'media',
+                    data: 'iVBORw==',
+                    mediaType: 'image/png',
+                  },
+                ],
+              },
+              providerOptions: undefined,
+            },
+          ],
+          providerOptions: undefined,
+        },
+      ]);
+    });
   });
 });
diff --git a/packages/ai/src/prompt/convert-to-language-model-prompt.ts b/packages/ai/src/prompt/convert-to-language-model-prompt.ts
index f08152a4c3d4..13ddda99aab9 100644
--- a/packages/ai/src/prompt/convert-to-language-model-prompt.ts
+++ b/packages/ai/src/prompt/convert-to-language-model-prompt.ts
@@ -3,8 +3,10 @@ import {
   LanguageModelV2Message,
   LanguageModelV2Prompt,
   LanguageModelV2TextPart,
+  LanguageModelV2ToolResultOutput,
 } from '@ai-sdk/provider';
 import {
+  convertToBase64,
   DataContent,
   FilePart,
   ImagePart,
@@ -156,7 +158,10 @@ export function convertToLanguageModelMessage({
                   type: 'tool-result' as const,
                   toolCallId: part.toolCallId,
                   toolName: part.toolName,
-                  output: part.output,
+                  output: convertOutputToLanguageModelOutput(
+                    part.output,
+                    downloadedAssets,
+                  ),
                   providerOptions,
                 };
               }
@@ -173,7 +178,10 @@ export function convertToLanguageModelMessage({
           type: 'tool-result' as const,
           toolCallId: part.toolCallId,
           toolName: part.toolName,
-          output: part.output,
+          output: convertOutputToLanguageModelOutput(
+            part.output,
+            downloadedAssets,
+          ),
           providerOptions: part.providerOptions,
         })),
         providerOptions: message.providerOptions,
@@ -237,8 +245,45 @@ async function downloadAssets(
         }),
     }));
 
+  // Collect the URLs of media parts in tool results so that they are
+  // downloaded together with the other assets.
+  const toolUrls = messages
+    .filter(message => message.role === 'tool')
+    .map(message => message.content)
+    .flat()
+    .filter(item => item.type === 'tool-result')
+    .flatMap(item => (item.output.type === 'content' ? item.output.value : []))
+    .flatMap(result => {
+      if (result.type !== 'media' || !result.data || !result.mediaType) {
+        return [];
+      }
+
+      // Media data is usually base64 and `new URL(...)` throws on anything
+      // that is not a URL, so use the parser that
+      // convertOutputToLanguageModelOutput uses for the lookup later on.
+      const { data } = convertToLanguageModelV2DataContent(result.data);
+      return data instanceof URL
+        ? [{ url: data, mediaType: result.mediaType }]
+        : [];
+    })
+    .filter(
+      ({ url, mediaType }) =>
+        !isUrlSupported({ url: url.toString(), mediaType, supportedUrls }),
+    )
+    .map(({ url }) => url);
+
+  // NOTE(review): the line replaced below called `download(plannedDownloads)`;
+  // confirm that `urls` and `downloadImplementation` exist in this scope and
+  // that the code after this hunk accepts `{ url, data }` entries.
+  const allUrls = [...urls, ...toolUrls];
+
   // download in parallel:
-  const downloadedFiles = await download(plannedDownloads);
+  const downloadedFiles = await Promise.all(
+    allUrls.map(async url => ({
+      url,
+      data: await downloadImplementation({ url }),
+    })),
+  );
 
   return Object.fromEntries(
     downloadedFiles
@@ -347,3 +392,51 @@ function convertPartToLanguageModelPart(
     }
   }
 }
+
+/**
+ * Converts a tool result output into the output that is sent to the model.
+ *
+ * Only `content` outputs are touched: a `media` part whose data is a URL is
+ * replaced by the base64-encoded download for that URL when one is present in
+ * `downloadedAssets`. All other outputs and parts are returned unchanged.
+ *
+ * @param output - The tool result output to convert.
+ * @param downloadedAssets - Downloaded files, keyed by URL string.
+ * @returns The output with downloaded media inlined as base64.
+ */
+function convertOutputToLanguageModelOutput(
+  output: LanguageModelV2ToolResultOutput,
+  downloadedAssets: Record<
+    string,
+    { mediaType: string | undefined; data: Uint8Array }
+  >,
+): LanguageModelV2ToolResultOutput {
+  if (output.type !== 'content') {
+    return output;
+  }
+
+  return {
+    type: 'content' as const,
+    value: output.value.map(part => {
+      if (part.type !== 'media' || !part.data || !part.mediaType) {
+        return part;
+      }
+
+      const { data, mediaType } = convertToLanguageModelV2DataContent(
+        part.data,
+      );
+      const downloadedFile =
+        data instanceof URL ? downloadedAssets[data.toString()] : undefined;
+      if (downloadedFile == null) {
+        // not a URL, or no download exists for it: pass the part through
+        return part;
+      }
+
+      return {
+        type: 'media' as const,
+        data: convertToBase64(downloadedFile.data),
+        mediaType: downloadedFile.mediaType ?? mediaType ?? part.mediaType,
+      };
+    }),
+  };
+}