Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/ProjectTemplates/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package-lock.json
*/src/**/*.sln
*/src/**/NuGet.config
*/src/**/Directory.Build.targets
*/src/**/Directory.Build.props
*/src/**/ingestioncache.*

# launchSettings.json files are required for the templates.
Expand Down
7 changes: 5 additions & 2 deletions src/ProjectTemplates/GeneratedContent.targets
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
<TemplatePackageVersion_AzureSearchDocuments>11.6.0</TemplatePackageVersion_AzureSearchDocuments>
<TemplatePackageVersion_CommunityToolkitAspire>9.4.1-beta.277</TemplatePackageVersion_CommunityToolkitAspire>
<TemplatePackageVersion_MicrosoftExtensionsServiceDiscovery>9.2.0</TemplatePackageVersion_MicrosoftExtensionsServiceDiscovery>
<TemplatePackageVersion_MicrosoftSemanticKernel>1.50.0</TemplatePackageVersion_MicrosoftSemanticKernel>
<TemplatePackageVersion_MicrosoftSemanticKernel_Preview>1.50.0-preview</TemplatePackageVersion_MicrosoftSemanticKernel_Preview>
<TemplatePackageVersion_MicrosoftSemanticKernel>1.52.1</TemplatePackageVersion_MicrosoftSemanticKernel>
<TemplatePackageVersion_MicrosoftSemanticKernel_Preview>1.52.1-preview</TemplatePackageVersion_MicrosoftSemanticKernel_Preview>
<TemplatePackageVersion_OllamaSharp>5.1.16</TemplatePackageVersion_OllamaSharp>
<TemplatePackageVersion_OpenTelemetry>1.9.0</TemplatePackageVersion_OpenTelemetry>
<TemplatePackageVersion_PdfPig>0.1.10</TemplatePackageVersion_PdfPig>
Expand Down Expand Up @@ -85,6 +85,9 @@
<GeneratedContent
Include="$(_ChatWithCustomDataContentRoot)Directory.Build.targets.in"
OutputPath="$(_ChatWithCustomDataContentRoot)Directory.Build.targets" />
<GeneratedContent
Include="$(_ChatWithCustomDataContentRoot)Directory.Build.props.in"
OutputPath="$(_ChatWithCustomDataContentRoot)Directory.Build.props" />
<GeneratedContent
Include="$(_ChatWithCustomDataContentRoot)ChatWithCustomData-CSharp.Web\ChatWithCustomData-CSharp.Web.csproj.in"
OutputPath="$(_ChatWithCustomDataContentRoot)ChatWithCustomData-CSharp.Web\ChatWithCustomData-CSharp.Web.csproj" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
</ItemGroup>

<ItemGroup>
<!-- Keep the exclude patterns below in sync with those in AichatwebTemplatesTests.cs -->
<!-- Keep the exclude patterns below in sync with those in AIChatWebSnapshotTests.cs -->
<Content
Include="src\ChatWithCustomData\**\*"
Exclude="
Expand All @@ -59,7 +59,8 @@
**\package-lock.json;
**\ingestioncache.*;
**\NuGet.config;
**\Directory.Build.targets;" />
**\Directory.Build.targets;
**\Directory.Build.props;" />
<None Include="THIRD-PARTY-NOTICES.TXT" Pack="true" PackagePath="." />
<Compile Remove="**\*" />
</ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
#elif (UseQdrant)-->
<PackageReference Include="Aspire.Qdrant.Client" Version="${TemplatePackageVersion_Aspire}" />
<PackageReference Include="Microsoft.SemanticKernel.Connectors.Qdrant" Version="${TemplatePackageVersion_MicrosoftSemanticKernel_Preview}" />
<!--#elif (UseLocalVectorStore)-->
<PackageReference Include="Microsoft.SemanticKernel.Connectors.SqliteVec" Version="${TemplatePackageVersion_MicrosoftSemanticKernel_Preview}" />
<!--#endif -->
</ItemGroup>
<!--#if (IsAspire) -->
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
using Microsoft.Extensions.AI;
using Microsoft.Extensions.VectorData;
using ChatWithCustomData_CSharp.Web.Components;
using ChatWithCustomData_CSharp.Web.Services;
using ChatWithCustomData_CSharp.Web.Services.Ingestion;
#if (IsOllama)
#else // IsAzureOpenAI || IsOpenAI || IsGHModels
using OpenAI;
#endif
#if (UseAzureAISearch)
using Microsoft.SemanticKernel.Connectors.AzureAISearch;
#elif (UseQdrant)
using Microsoft.SemanticKernel.Connectors.Qdrant;
#endif

var builder = WebApplication.CreateBuilder(args);
builder.AddServiceDefaults();
Expand Down Expand Up @@ -41,15 +35,17 @@

#if (UseAzureAISearch)
builder.AddAzureSearchClient("azureAISearch");

builder.Services.AddSingleton<IVectorStore, AzureAISearchVectorStore>();
builder.Services.AddAzureAISearchCollection<IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
builder.Services.AddAzureAISearchCollection<IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
#elif (UseQdrant)
builder.AddQdrantClient("vectordb");

builder.Services.AddSingleton<IVectorStore, QdrantVectorStore>();
builder.Services.AddQdrantCollection<Guid, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
builder.Services.AddQdrantCollection<Guid, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
#else // UseLocalVectorStore
var vectorStore = new JsonVectorStore(Path.Combine(AppContext.BaseDirectory, "vector-store"));
builder.Services.AddSingleton<IVectorStore>(vectorStore);
var vectorStorePath = Path.Combine(AppContext.BaseDirectory, "vector-store.db");
var vectorStoreConnectionString = $"Data Source={vectorStorePath}";
builder.Services.AddSqliteCollection<string, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks", vectorStoreConnectionString);
builder.Services.AddSqliteCollection<string, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents", vectorStoreConnectionString);
#endif
builder.Services.AddScoped<DataIngestor>();
builder.Services.AddSingleton<SemanticSearch>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using Microsoft.Extensions.AI;
using Microsoft.Extensions.VectorData;
using ChatWithCustomData_CSharp.Web.Components;
using ChatWithCustomData_CSharp.Web.Services;
using ChatWithCustomData_CSharp.Web.Services.Ingestion;
Expand All @@ -18,10 +17,6 @@
using Azure.AI.OpenAI;
using System.ClientModel;
#endif
#if (UseAzureAISearch)
using Azure.Search.Documents.Indexes;
using Microsoft.SemanticKernel.Connectors.AzureAISearch;
#endif

var builder = WebApplication.CreateBuilder(args);
builder.Services.AddRazorComponents().AddInteractiveServerComponents();
Expand Down Expand Up @@ -83,19 +78,23 @@
#if (!UseManagedIdentity)
// dotnet user-secrets set AzureAISearch:Key YOUR-API-KEY
#endif
var vectorStore = new AzureAISearchVectorStore(
new SearchIndexClient(
new Uri(builder.Configuration["AzureAISearch:Endpoint"] ?? throw new InvalidOperationException("Missing configuration: AzureAISearch:Endpoint. See the README for details.")),
var azureAISearchEndpoint = new Uri(builder.Configuration["AzureAISearch:Endpoint"]
?? throw new InvalidOperationException("Missing configuration: AzureAISearch:Endpoint. See the README for details."));
#if (UseManagedIdentity)
new DefaultAzureCredential()));
var azureAISearchCredential = new DefaultAzureCredential();
#else
new AzureKeyCredential(builder.Configuration["AzureAISearch:Key"] ?? throw new InvalidOperationException("Missing configuration: AzureAISearch:Key. See the README for details."))));
var azureAISearchCredential = new AzureKeyCredential(builder.Configuration["AzureAISearch:Key"]
?? throw new InvalidOperationException("Missing configuration: AzureAISearch:Key. See the README for details."));
#endif
builder.Services.AddAzureAISearchCollection<IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks", azureAISearchEndpoint, azureAISearchCredential);
builder.Services.AddAzureAISearchCollection<IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents", azureAISearchEndpoint, azureAISearchCredential);
#else // UseLocalVectorStore
var vectorStore = new JsonVectorStore(Path.Combine(AppContext.BaseDirectory, "vector-store"));
var vectorStorePath = Path.Combine(AppContext.BaseDirectory, "vector-store.db");
var vectorStoreConnectionString = $"Data Source={vectorStorePath}";
builder.Services.AddSqliteCollection<string, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks", vectorStoreConnectionString);
builder.Services.AddSqliteCollection<string, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents", vectorStoreConnectionString);
#endif

builder.Services.AddSingleton<IVectorStore>(vectorStore);
builder.Services.AddScoped<DataIngestor>();
builder.Services.AddSingleton<SemanticSearch>();
builder.Services.AddChatClient(chatClient).UseFunctionInvocation().UseLogging();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,33 @@ namespace ChatWithCustomData_CSharp.Web.Services;

public class IngestedChunk
{
[VectorStoreRecordKey]
#if (IsOllama)
private const int VectorDimensions = 384; // 384 is the default vector size for the all-minilm embedding model
#else
private const int VectorDimensions = 1536; // 1536 is the default vector size for the OpenAI text-embedding-3-small model
#endif
#if (UseAzureAISearch || UseQdrant)
private const string VectorDistanceFunction = DistanceFunction.CosineSimilarity;
#else
private const string VectorDistanceFunction = DistanceFunction.CosineDistance;
#endif

[VectorStoreKey]
#if (UseQdrant)
public required Guid Key { get; set; }
#else
public required string Key { get; set; }
#endif

[VectorStoreRecordData(IsIndexed = true)]
[VectorStoreData(IsIndexed = true)]
public required string DocumentId { get; set; }

[VectorStoreRecordData]
[VectorStoreData]
public int PageNumber { get; set; }

[VectorStoreRecordData]
[VectorStoreData]
public required string Text { get; set; }

#if (IsOllama)
[VectorStoreRecordVector(384, DistanceFunction = DistanceFunction.CosineSimilarity)] // 384 is the default vector size for the all-minilm embedding model
#else
[VectorStoreRecordVector(1536, DistanceFunction = DistanceFunction.CosineSimilarity)] // 1536 is the default vector size for the OpenAI text-embedding-3-small model
#endif
public ReadOnlyMemory<float> Vector { get; set; }
[VectorStoreVector(VectorDimensions, DistanceFunction = VectorDistanceFunction)]
public string? Vector => Text;
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,30 @@ namespace ChatWithCustomData_CSharp.Web.Services;

public class IngestedDocument
{
[VectorStoreRecordKey]
private const int VectorDimensions = 2;
#if (UseAzureAISearch || UseQdrant)
private const string VectorDistanceFunction = DistanceFunction.CosineSimilarity;
#else
private const string VectorDistanceFunction = DistanceFunction.CosineDistance;
#endif

[VectorStoreKey]
#if (UseQdrant)
public required Guid Key { get; set; }
#else
public required string Key { get; set; }
#endif

[VectorStoreRecordData(IsIndexed = true)]
[VectorStoreData(IsIndexed = true)]
public required string SourceId { get; set; }

[VectorStoreRecordData]
[VectorStoreData]
public required string DocumentId { get; set; }

[VectorStoreRecordData]
[VectorStoreData]
public required string DocumentVersion { get; set; }

// The vector is not used but required for some vector databases
[VectorStoreRecordVector(2, DistanceFunction = DistanceFunction.CosineSimilarity)]
[VectorStoreVector(VectorDimensions, DistanceFunction = VectorDistanceFunction)]
public ReadOnlyMemory<float> Vector { get; set; } = new ReadOnlyMemory<float>([0, 0]);
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ namespace ChatWithCustomData_CSharp.Web.Services.Ingestion;

public class DataIngestor(
ILogger<DataIngestor> logger,
IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator,
IVectorStore vectorStore)
#if (UseQdrant)
VectorStoreCollection<Guid, IngestedChunk> chunksCollection,
VectorStoreCollection<Guid, IngestedDocument> documentsCollection)
#else
VectorStoreCollection<string, IngestedChunk> chunksCollection,
VectorStoreCollection<string, IngestedDocument> documentsCollection)
#endif
{
public static async Task IngestDataAsync(IServiceProvider services, IIngestionSource source)
{
Expand All @@ -17,15 +22,8 @@ public static async Task IngestDataAsync(IServiceProvider services, IIngestionSo

public async Task IngestDataAsync(IIngestionSource source)
{
#if (UseQdrant)
var chunksCollection = vectorStore.GetCollection<Guid, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
var documentsCollection = vectorStore.GetCollection<Guid, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
#else
var chunksCollection = vectorStore.GetCollection<string, IngestedChunk>("data-ChatWithCustomData-CSharp.Web-chunks");
var documentsCollection = vectorStore.GetCollection<string, IngestedDocument>("data-ChatWithCustomData-CSharp.Web-documents");
#endif
await chunksCollection.CreateCollectionIfNotExistsAsync();
await documentsCollection.CreateCollectionIfNotExistsAsync();
await chunksCollection.EnsureCollectionExistsAsync();
await documentsCollection.EnsureCollectionExistsAsync();

var sourceId = source.SourceId;
var documentsForSource = await documentsCollection.GetAsync(doc => doc.SourceId == sourceId, top: int.MaxValue).ToListAsync();
Expand All @@ -46,7 +44,7 @@ public async Task IngestDataAsync(IIngestionSource source)

await documentsCollection.UpsertAsync(modifiedDocument);

var newRecords = await source.CreateChunksForDocumentAsync(embeddingGenerator, modifiedDocument);
var newRecords = await source.CreateChunksForDocumentAsync(modifiedDocument);
await chunksCollection.UpsertAsync(newRecords);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
using Microsoft.Extensions.AI;

namespace ChatWithCustomData_CSharp.Web.Services.Ingestion;
namespace ChatWithCustomData_CSharp.Web.Services.Ingestion;

public interface IIngestionSource
{
Expand All @@ -10,5 +8,5 @@ public interface IIngestionSource

Task<IEnumerable<IngestedDocument>> GetDeletedDocumentsAsync(IReadOnlyList<IngestedDocument> existingDocuments);

Task<IEnumerable<IngestedChunk>> CreateChunksForDocumentAsync(IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, IngestedDocument document);
Task<IEnumerable<IngestedChunk>> CreateChunksForDocumentAsync(IngestedDocument document);
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using Microsoft.Extensions.AI;
using Microsoft.SemanticKernel.Text;
using Microsoft.SemanticKernel.Text;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
Expand Down Expand Up @@ -46,25 +45,22 @@ public Task<IEnumerable<IngestedDocument>> GetDeletedDocumentsAsync(IReadOnlyLis
return Task.FromResult(deletedDocuments);
}

public async Task<IEnumerable<IngestedChunk>> CreateChunksForDocumentAsync(IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator, IngestedDocument document)
public Task<IEnumerable<IngestedChunk>> CreateChunksForDocumentAsync(IngestedDocument document)
{
using var pdf = PdfDocument.Open(Path.Combine(sourceDirectory, document.DocumentId));
var paragraphs = pdf.GetPages().SelectMany(GetPageParagraphs).ToList();

var embeddings = await embeddingGenerator.GenerateAsync(paragraphs.Select(c => c.Text));

return paragraphs.Zip(embeddings).Select(pair => new IngestedChunk
return Task.FromResult(paragraphs.Select(p => new IngestedChunk
{
#if (UseQdrant)
Key = Guid.CreateVersion7(),
#else
Key = Guid.CreateVersion7().ToString(),
#endif
DocumentId = document.DocumentId,
PageNumber = pair.First.PageNumber,
Text = pair.First.Text,
Vector = pair.Second.Vector,
});
PageNumber = p.PageNumber,
Text = p.Text,
}));
}

private static IEnumerable<(int PageNumber, int IndexOnPage, string Text)> GetPageParagraphs(Page pdfPage)
Expand Down
Loading
Loading