Skip to content

Commit bf6fab5

Browse files
feat: Update embedding logic to bulk (#1037)
## Description

Describe your changes here.

Fixes #Issue_Number (if available)

### Ensure that your pull request has followed all the steps below:

- [ ] Code compilation
- [ ] Created tests which fail without the change (if possible)
- [ ] All tests passing
- [ ] Extended the README / documentation, if necessary

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent de6d616 commit bf6fab5

9 files changed

Lines changed: 277 additions & 71 deletions

File tree

EssentialCSharp.Chat.Shared/Models/BookContentChunk.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,24 @@ public sealed class BookContentChunk
3232
public string ChunkText { get; set; } = string.Empty;
3333

3434
/// <summary>
35-
/// Chapter number extracted from filename (e.g., "Chapter01.md" -> 1)
35+
/// Chapter number extracted from filename (e.g., "Chapter01.md" -> 1).
36+
/// Null for files that do not follow the ChapterNN naming pattern.
3637
/// </summary>
3738
[VectorStoreData]
3839
public int? ChapterNumber { get; set; }
3940

41+
/// <summary>
42+
/// Zero-based ordinal of this chunk within its source file.
43+
/// Together with FileName, forms the basis for the deterministic Id.
44+
/// </summary>
45+
/// <remarks>
46+
/// This column was added as part of the bulk-embedding refactor. Existing vector-store
47+
/// collections created before this change must be rebuilt (via the staging-swap upload
48+
/// command) before reads against the live table will succeed with this schema.
49+
/// </remarks>
50+
[VectorStoreData]
51+
public int ChunkIndex { get; set; }
52+
4053
/// <summary>
4154
/// SHA256 hash of the chunk content for change detection
4255
/// </summary>

EssentialCSharp.Chat.Shared/Services/AISearchService.cs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
using System.Diagnostics;
1+
using System.Diagnostics;
22
using EssentialCSharp.Chat.Common.Models;
33
using Microsoft.Extensions.Logging;
44
using Microsoft.Extensions.VectorData;
@@ -35,11 +35,29 @@ public async Task<IReadOnlyList<VectorSearchResult<BookContentChunk>>> ExecuteVe
3535
{
3636
try
3737
{
38-
var results = new List<VectorSearchResult<BookContentChunk>>();
39-
await foreach (var result in collection.SearchAsync(searchVector, options: vectorSearchOptions, top: top, cancellationToken: cancellationToken))
38+
// Fetch more candidates than needed so we can deduplicate by heading.
39+
// Multiple chunks from the same section share the same Heading; without dedup
40+
// all top-N results could come from one long section, reducing context diversity.
41+
int candidates = top * 3;
42+
43+
var candidatesList = new List<VectorSearchResult<BookContentChunk>>();
44+
await foreach (var result in collection.SearchAsync(searchVector, options: vectorSearchOptions, top: candidates, cancellationToken: cancellationToken))
4045
{
41-
results.Add(result);
46+
candidatesList.Add(result);
4247
}
48+
49+
// Keep only the highest-scoring chunk per unique heading, then take the globally
50+
// top-N by score. GroupBy on a materialized list preserves insertion (score desc)
51+
// order, but we make the ordering explicit via OrderByDescending so the result
52+
// is correct regardless of provider sort guarantees.
53+
// MaxBy on a non-empty IGrouping never returns null; ! asserts this invariant.
54+
var results = candidatesList
55+
.GroupBy(r => r.Record.Heading)
56+
.Select(g => g.MaxBy(r => r.Score)!)
57+
.OrderByDescending(r => r.Score)
58+
.Take(top)
59+
.ToList();
60+
4361
return results;
4462
}
4563
catch (PostgresException ex) when (ex.SqlState == "28000" && attempt == 0)

EssentialCSharp.Chat.Shared/Services/ChunkingResultExtensions.cs

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,35 @@
11
using System.Security.Cryptography;
22
using System.Text;
3-
using System.Linq;
43
using EssentialCSharp.Chat.Common.Models;
54

65
namespace EssentialCSharp.Chat.Common.Services;
76

87
public static partial class ChunkingResultExtensions
98
{
9+
/// <summary>
10+
/// Converts a <see cref="FileChunkingResult"/> into a list of <see cref="BookContentChunk"/> records
11+
/// ready for embedding and vector store upload.
12+
/// </summary>
13+
/// <remarks>
14+
/// <see cref="BookContentChunk.ChapterNumber"/> is set to null for files that do not match
15+
/// the <c>ChapterNN</c> naming pattern (e.g. appendix or non-chapter markdown files).
16+
/// </remarks>
1017
public static List<BookContentChunk> ToBookContentChunks(this FileChunkingResult result)
1118
{
1219
int? chapterNumber = ExtractChapterNumber(result.FileName);
1320

1421
var chunks = result.Chunks
15-
.Select(chunkText =>
22+
.Select((markdownChunk, index) =>
1623
{
17-
var contentHash = ComputeSha256Hash(chunkText);
24+
var contentHash = ComputeSha256Hash(markdownChunk.ChunkText);
1825
return new BookContentChunk
1926
{
20-
Id = Guid.NewGuid().ToString(),
27+
Id = $"{result.FileName}_{index}",
2128
FileName = result.FileName,
22-
Heading = ExtractHeading(chunkText),
23-
ChunkText = chunkText,
29+
Heading = markdownChunk.Heading,
30+
ChunkText = markdownChunk.ChunkText,
2431
ChapterNumber = chapterNumber,
32+
ChunkIndex = index,
2533
ContentHash = contentHash
2634
};
2735
})
@@ -30,25 +38,13 @@ public static List<BookContentChunk> ToBookContentChunks(this FileChunkingResult
3038
return chunks;
3139
}
3240

33-
private static string ExtractHeading(string chunkText)
41+
private static int? ExtractChapterNumber(string fileName)
3442
{
35-
// get characters until the first " - " or newline
36-
var firstLine = chunkText.Split(["\r\n", "\r", "\n"], StringSplitOptions.None)[0];
37-
var headingParts = firstLine.Split([" - "], StringSplitOptions.None);
38-
return headingParts.Length > 0 ? headingParts[0].Trim() : string.Empty;
39-
}
40-
41-
private static int ExtractChapterNumber(string fileName)
42-
{
43-
// Example: "Chapter01.md" -> 1
44-
// Regex: Chapter(?<ChapterNumber>[0-9]{2})
43+
// Example: "Chapter01.md" -> 1; non-chapter files return null.
4544
var match = ChapterNumberRegex().Match(fileName);
4645
if (match.Success && int.TryParse(match.Groups["ChapterNumber"].Value, out int chapterNumber))
47-
48-
{
4946
return chapterNumber;
50-
}
51-
throw new InvalidOperationException($"File name '{fileName}' does not contain a valid chapter number in the expected format.");
47+
return null;
5248
}
5349

5450
private static string ComputeSha256Hash(string text)
Lines changed: 139 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,170 @@
1+
using System.Text.RegularExpressions;
12
using EssentialCSharp.Chat.Common.Models;
23
using Microsoft.Extensions.AI;
34
using Microsoft.Extensions.VectorData;
5+
using Npgsql;
46

57
namespace EssentialCSharp.Chat.Common.Services;
68

79
/// <summary>
8-
/// Service for generating embeddings for markdown chunks using Azure OpenAI
10+
/// Service for generating embeddings for markdown chunks using Azure OpenAI and uploading
11+
/// them to a PostgreSQL vector store via a staging-then-swap pattern to avoid downtime.
912
/// </summary>
10-
public class EmbeddingService(VectorStore vectorStore, IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator)
13+
public class EmbeddingService(
14+
VectorStore vectorStore,
15+
IEmbeddingGenerator<string, Embedding<float>> embeddingGenerator,
16+
NpgsqlDataSource? dataSource = null)
1117
{
1218
public static string CollectionName { get; } = "markdown_chunks";
1319

20+
/// <summary>
21+
/// Maximum number of inputs per Azure OpenAI embedding batch call.
22+
/// </summary>
23+
private const int EmbeddingBatchSize = 2048;
24+
25+
// Only allow simple identifiers: letters, digits, and underscores, starting with a letter or underscore.
26+
private static readonly Regex _safeIdentifierRegex = new(@"^[a-zA-Z_][a-zA-Z0-9_]*$", RegexOptions.Compiled);
27+
1428
/// <summary>
1529
/// Generate an embedding for the given text.
1630
/// </summary>
17-
/// <param name="text">The text to generate an embedding for.</param>
18-
/// <param name="cancellationToken">The cancellation token.</param>
19-
/// <returns>A search vector as ReadOnlyMemory&lt;float&gt;.</returns>
2031
public async Task<ReadOnlyMemory<float>> GenerateEmbeddingAsync(string text, CancellationToken cancellationToken = default)
2132
{
2233
var embedding = await embeddingGenerator.GenerateAsync(text, cancellationToken: cancellationToken);
2334
return embedding.Vector;
2435
}
2536

2637
/// <summary>
27-
/// Generate an embedding for each text paragraph and upload it to the specified collection.
38+
/// Generate embeddings for all chunks in batches and upload them to the vector store
39+
/// using a staging-then-atomic-swap pattern so the live collection stays queryable
40+
/// throughout the rebuild.
41+
///
42+
/// Steps:
43+
/// 1. Create a staging collection ({collectionName}_staging).
44+
/// 2. For each batch of <see cref="EmbeddingBatchSize"/> chunks: embed the batch
45+
/// and immediately upsert it into staging, keeping peak memory bounded.
46+
/// 3. Atomically swap tables in a single transaction using two SQL RENAME operations
47+
/// (live → old, staging → live). PostgreSQL ALTER TABLE acquires
48+
/// AccessExclusiveLock automatically; no explicit LOCK TABLE is needed. The
49+
/// transaction ensures no reader sees an intermediate state.
50+
/// 4. Drop the old live backup table with DROP TABLE.
51+
///
52+
/// If an error occurs before the swap, only the staging table is affected — the live
53+
/// collection is untouched.
2854
/// </summary>
29-
/// <param name="collectionName">The name of the collection to upload the text paragraphs to.</param>
30-
/// <returns>An async task.</returns>
31-
public async Task GenerateBookContentEmbeddingsAndUploadToVectorStore(IEnumerable<BookContentChunk> bookContents, CancellationToken cancellationToken, string? collectionName = null)
55+
public async Task GenerateBookContentEmbeddingsAndUploadToVectorStore(
56+
IEnumerable<BookContentChunk> bookContents,
57+
CancellationToken cancellationToken,
58+
string? collectionName = null)
3259
{
3360
collectionName ??= CollectionName;
3461

35-
var collection = vectorStore.GetCollection<string, BookContentChunk>(collectionName);
36-
await collection.EnsureCollectionDeletedAsync(cancellationToken);
37-
await collection.EnsureCollectionExistsAsync(cancellationToken);
62+
if (dataSource is null)
63+
throw new InvalidOperationException(
64+
$"{nameof(NpgsqlDataSource)} must be provided to upload embeddings. Ensure it is registered in DI.");
3865

39-
int uploadedCount = 0;
66+
if (!_safeIdentifierRegex.IsMatch(collectionName))
67+
throw new ArgumentException(
68+
$"Collection name '{collectionName}' contains unsafe characters. Use only letters, digits, and underscores.",
69+
nameof(collectionName));
4070

41-
foreach (var chunk in bookContents)
71+
string stagingName = $"{collectionName}_staging";
72+
string oldName = $"{collectionName}_old";
73+
74+
// ── Step 1: Prepare staging collection ────────────────────────────────────────
75+
var staging = vectorStore.GetCollection<string, BookContentChunk>(stagingName);
76+
await staging.EnsureCollectionDeletedAsync(cancellationToken);
77+
await staging.EnsureCollectionExistsAsync(cancellationToken);
78+
79+
// ── Step 2: Batch-embed and immediately upsert each batch ─────────────────────
80+
// Azure OpenAI supports at most EmbeddingBatchSize inputs per GenerateAsync call.
81+
// bookContents is streamed in fixed-size batches without full upfront materialization,
82+
// keeping peak memory bounded to one batch of chunk objects and their embeddings at a time.
83+
// The staging-swap (Step 3) is safe because it only runs after all batches have
84+
// been successfully upserted.
85+
var buffer = new List<BookContentChunk>(EmbeddingBatchSize);
86+
int totalCount = 0;
87+
88+
async Task EmbedAndUpsertBatchAsync()
4289
{
43-
cancellationToken.ThrowIfCancellationRequested();
44-
chunk.TextEmbedding = await GenerateEmbeddingAsync(chunk.ChunkText, cancellationToken);
45-
await collection.UpsertAsync(chunk, cancellationToken);
46-
Console.WriteLine($"Uploaded chunk '{chunk.Id}' to collection '{collectionName}' for file '{chunk.FileName}' with heading '{chunk.Heading}'.");
47-
uploadedCount++;
90+
var batchEmbeddings = await embeddingGenerator.GenerateAsync(
91+
buffer.Select(c => c.ChunkText), cancellationToken: cancellationToken);
92+
93+
if (batchEmbeddings.Count != buffer.Count)
94+
throw new InvalidOperationException(
95+
$"Embedding count mismatch: expected {buffer.Count}, got {batchEmbeddings.Count}.");
96+
97+
for (int i = 0; i < buffer.Count; i++)
98+
buffer[i].TextEmbedding = batchEmbeddings[i].Vector;
99+
100+
await staging.UpsertAsync(buffer, cancellationToken);
101+
totalCount += buffer.Count;
102+
buffer.Clear();
103+
}
104+
105+
try
106+
{
107+
foreach (var chunk in bookContents)
108+
{
109+
buffer.Add(chunk);
110+
if (buffer.Count == EmbeddingBatchSize)
111+
await EmbedAndUpsertBatchAsync();
112+
}
113+
114+
if (buffer.Count > 0)
115+
await EmbedAndUpsertBatchAsync();
116+
117+
Console.WriteLine($"Uploaded {totalCount} chunks to staging collection '{stagingName}'.");
48118
}
49-
Console.WriteLine($"Successfully generated embeddings and uploaded {uploadedCount} chunks to collection '{collectionName}'.");
119+
catch
120+
{
121+
// Best-effort cleanup: drop the partially-populated staging table so the
122+
// next run starts clean. Do not let this secondary failure mask the original.
123+
try
124+
{
125+
await staging.EnsureCollectionDeletedAsync(cancellationToken);
126+
}
127+
catch (Exception cleanupEx) when (cleanupEx is not OperationCanceledException)
128+
{
129+
Console.Error.WriteLine($"Warning: failed to clean up staging collection '{stagingName}' after upsert failure: {cleanupEx.Message}");
130+
}
131+
throw;
132+
}
133+
134+
// ── Step 3: Atomic swap — staging → live ──────────────────────────────────────
135+
// Two ALTER TABLE RENAME operations in one transaction (live → old, staging → live).
136+
// Each RENAME auto-acquires AccessExclusiveLock on its table; the transaction
137+
// guarantees both renames are visible atomically to other sessions.
138+
await using var conn = await dataSource.OpenConnectionAsync(cancellationToken);
139+
await using var tx = await conn.BeginTransactionAsync(cancellationToken);
140+
141+
await using (var cmd = conn.CreateCommand())
142+
{
143+
cmd.Transaction = tx;
144+
145+
// Drop any leftover backup from a previous run
146+
cmd.CommandText = $"DROP TABLE IF EXISTS \"{oldName}\"";
147+
await cmd.ExecuteNonQueryAsync(cancellationToken);
148+
149+
// Rename live → old. IF EXISTS is a no-op on first run when no live table exists.
150+
cmd.CommandText = $"ALTER TABLE IF EXISTS \"{collectionName}\" RENAME TO \"{oldName}\"";
151+
await cmd.ExecuteNonQueryAsync(cancellationToken);
152+
153+
// Rename staging → live
154+
cmd.CommandText = $"ALTER TABLE \"{stagingName}\" RENAME TO \"{collectionName}\"";
155+
await cmd.ExecuteNonQueryAsync(cancellationToken);
156+
}
157+
158+
await tx.CommitAsync(cancellationToken);
159+
Console.WriteLine($"Swapped '{stagingName}' → '{collectionName}' atomically.");
160+
161+
// ── Step 4: Drop the old backup ───────────────────────────────────────────────
162+
await using (var cmd = conn.CreateCommand())
163+
{
164+
cmd.CommandText = $"DROP TABLE IF EXISTS \"{oldName}\"";
165+
await cmd.ExecuteNonQueryAsync(cancellationToken);
166+
}
167+
168+
Console.WriteLine($"Successfully generated embeddings and uploaded {totalCount} chunks to collection '{collectionName}'.");
50169
}
51170
}

EssentialCSharp.Chat.Shared/Services/FileChunkingResult.cs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
namespace EssentialCSharp.Chat.Common.Services;
22

3+
/// <summary>
4+
/// A single chunk from a markdown file, paired with the section heading it belongs to.
5+
/// </summary>
6+
/// <param name="Heading">Full breadcrumb heading for the section (e.g. "Chapter: 1: Intro: Summary").</param>
7+
/// <param name="ChunkText">The raw chunk text, including the "Heading - " prefix prepended by TextChunker.</param>
8+
public record MarkdownChunk(string Heading, string ChunkText);
9+
310
/// <summary>
411
/// Data structure to hold chunking results for a single file
512
/// </summary>
@@ -9,6 +16,6 @@ public class FileChunkingResult
916
public string FilePath { get; set; } = string.Empty;
1017
public int OriginalCharCount { get; set; }
1118
public int ChunkCount { get; set; }
12-
public List<string> Chunks { get; set; } = [];
19+
public List<MarkdownChunk> Chunks { get; set; } = [];
1320
public int TotalChunkCharacters { get; set; }
1421
}

0 commit comments

Comments
 (0)