Skip to content

Commit ab9d8e5

Browse files
committed
cache-test-refactoring
1 parent db35df6 commit ab9d8e5

33 files changed

Lines changed: 3697 additions & 54 deletions

src/MarkdownLd.Kb/Extraction/Cache/FileKnowledgeExtractionCache.cs

Lines changed: 49 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,15 @@
1+
using System.Security.Cryptography;
2+
using System.Text;
13
using System.Text.Json;
24
using static ManagedCode.MarkdownLd.Kb.Pipeline.PipelineConstants;
35

46
namespace ManagedCode.MarkdownLd.Kb.Pipeline;
57

68
public sealed class FileKnowledgeExtractionCache : IKnowledgeExtractionCache
79
{
10+
private const string TemporaryFileSuffix = ".tmp-";
11+
private const int CacheKeyHashLength = 24;
12+
813
private readonly string _directoryPath;
914
private readonly JsonSerializerOptions _serializerOptions;
1015

@@ -34,10 +39,18 @@ public FileKnowledgeExtractionCache(
3439
}
3540

3641
await using var stream = File.OpenRead(path);
37-
var entry = await JsonSerializer.DeserializeAsync<KnowledgeExtractionCacheEntry>(
38-
stream,
39-
_serializerOptions,
40-
cancellationToken).ConfigureAwait(false) ?? throw new InvalidDataException(CacheEntryMissingMessage);
42+
KnowledgeExtractionCacheEntry entry;
43+
try
44+
{
45+
entry = await JsonSerializer.DeserializeAsync<KnowledgeExtractionCacheEntry>(
46+
stream,
47+
_serializerOptions,
48+
cancellationToken).ConfigureAwait(false) ?? throw new InvalidDataException(CacheEntryMissingMessage);
49+
}
50+
catch (JsonException exception)
51+
{
52+
throw new InvalidDataException(CacheEntryMissingMessage, exception);
53+
}
4154

4255
return entry.Key.Matches(key) ? entry : null;
4356
}
@@ -51,8 +64,25 @@ public async Task SetAsync(
5164
Directory.CreateDirectory(_directoryPath);
5265

5366
var path = GetCacheFilePath(entry.Key);
54-
await using var stream = File.Create(path);
55-
await JsonSerializer.SerializeAsync(stream, entry, _serializerOptions, cancellationToken).ConfigureAwait(false);
67+
var temporaryPath = path + TemporaryFileSuffix + Guid.NewGuid().ToString("N");
68+
69+
try
70+
{
71+
await using (var stream = new FileStream(temporaryPath, FileMode.CreateNew, FileAccess.Write, FileShare.None))
72+
{
73+
await JsonSerializer.SerializeAsync(stream, entry, _serializerOptions, cancellationToken).ConfigureAwait(false);
74+
await stream.FlushAsync(cancellationToken).ConfigureAwait(false);
75+
}
76+
77+
File.Move(temporaryPath, path, overwrite: true);
78+
}
79+
finally
80+
{
81+
if (File.Exists(temporaryPath))
82+
{
83+
File.Delete(temporaryPath);
84+
}
85+
}
5686
}
5787

5888
private string GetCacheFilePath(KnowledgeExtractionCacheKey key)
@@ -61,6 +91,7 @@ private string GetCacheFilePath(KnowledgeExtractionCacheKey key)
6191
var chunker = KnowledgeNaming.Slugify(key.ChunkerProfileId);
6292
var prompt = KnowledgeNaming.Slugify(key.PromptVersion);
6393
var model = KnowledgeNaming.Slugify(key.ModelId);
94+
var keyHash = CreateCacheKeyHash(key);
6495
var fileName = string.Concat(
6596
slug,
6697
DotSeparator,
@@ -71,8 +102,20 @@ private string GetCacheFilePath(KnowledgeExtractionCacheKey key)
71102
prompt,
72103
DotSeparator,
73104
model,
105+
DotSeparator,
106+
keyHash,
74107
CacheFileExtension);
75108

76109
return Path.Combine(_directoryPath, fileName);
77110
}
111+
112+
/// <summary>
/// Produces a short, deterministic hex token for a cache key by hashing the key's JSON form.
/// </summary>
/// <param name="key">Cache key that is serialized with this cache's serializer options and hashed.</param>
/// <returns>
/// The leading <c>CacheKeyHashLength</c> characters of the hex-encoded SHA-256 digest.
/// </returns>
private string CreateCacheKeyHash(KnowledgeExtractionCacheKey key)
{
    // The key is hashed via its UTF-8 encoded JSON text.
    var serializedKey = JsonSerializer.Serialize(key, _serializerOptions);
    var digest = SHA256.HashData(Encoding.UTF8.GetBytes(serializedKey));

    // Only a prefix of the hex digest is kept.
    return Convert.ToHexString(digest).Substring(0, CacheKeyHashLength);
}
78121
}

src/MarkdownLd.Kb/Extraction/Processing/KnowledgeFactMerger.cs

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,19 @@ namespace ManagedCode.MarkdownLd.Kb.Pipeline;
44

55
public sealed class KnowledgeFactMerger(Uri? baseUri = null)
66
{
7+
// Priority value per schema type text, matched case-insensitively.
// TypePriority reads this table and falls back to 0 for any type not listed.
private static readonly IReadOnlyDictionary<string, int> TypePriorities =
    new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase)
    {
        // Priority 5.
        [SchemaPersonTypeText] = 5,
        [SchemaOrganizationTypeText] = 5,
        [SchemaSoftwareApplicationTypeText] = 5,

        // Priority 4.
        [SchemaTechArticleTypeText] = 4,
        [SchemaScholarlyArticleTypeText] = 4,
        [SchemaBlogPostingTypeText] = 4,
        [SchemaCreativeWorkTypeText] = 4,
        [SchemaArticleTypeText] = 4,

        // Priority 1.
        [SchemaThingTypeText] = 1,
    };
19+
720
private readonly Uri _baseUri = KnowledgeNaming.NormalizeBaseUri(baseUri ?? new Uri(DefaultBaseUriText, UriKind.Absolute));
821

922
public KnowledgeExtractionResult Merge(params KnowledgeExtractionResult[] results)
@@ -206,18 +219,6 @@ private static string PreferHigherPriority(string left, string right)
206219

207220
private static int TypePriority(string type)
208221
{
209-
return type switch
210-
{
211-
SchemaPersonTypeText => 5,
212-
SchemaOrganizationTypeText => 5,
213-
SchemaSoftwareApplicationTypeText => 5,
214-
SchemaTechArticleTypeText => 4,
215-
SchemaScholarlyArticleTypeText => 4,
216-
SchemaBlogPostingTypeText => 4,
217-
SchemaCreativeWorkTypeText => 4,
218-
SchemaArticleTypeText => 4,
219-
SchemaThingTypeText => 1,
220-
_ => 0,
221-
};
222+
return TypePriorities.TryGetValue(type, out var priority) ? priority : 0;
222223
}
223224
}

src/MarkdownLd.Kb/Graph/Runtime/KnowledgeGraph.cs

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ public sealed partial class KnowledgeGraph
1313
{
1414
private readonly Graph _graph;
1515
private readonly ReaderWriterLockSlim _graphLock = new();
16-
private readonly TokenizedKnowledgeIndex? _tokenIndex;
16+
private TokenizedKnowledgeIndex? _tokenIndex;
1717

1818
internal KnowledgeGraph(Graph graph, TokenizedKnowledgeIndex? tokenIndex = null)
1919
{
@@ -190,6 +190,7 @@ private void MergeSnapshot(Graph graph, CancellationToken cancellationToken)
190190
{
191191
cancellationToken.ThrowIfCancellationRequested();
192192
_graph.Merge(graph);
193+
_tokenIndex = null;
193194
}
194195
finally
195196
{

src/MarkdownLd.Kb/Query/NaturalLanguage/ChatClientNaturalLanguageSparqlTranslator.cs

Lines changed: 28 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,18 @@ private static string NormalizeQuery(string text)
133133
throw new InvalidOperationException(EmptyTranslationMessage);
134134
}
135135

136+
var fenced = ExtractFencedQuery(trimmed, SparqlFence, StringComparison.OrdinalIgnoreCase);
137+
if (fenced is not null)
138+
{
139+
return fenced;
140+
}
141+
142+
fenced = ExtractFencedQuery(trimmed, CodeFence, StringComparison.Ordinal);
143+
if (fenced is not null)
144+
{
145+
return fenced;
146+
}
147+
136148
if (trimmed.StartsWith(SparqlFence, StringComparison.OrdinalIgnoreCase))
137149
{
138150
trimmed = trimmed[SparqlFence.Length..].Trim();
@@ -142,12 +154,25 @@ private static string NormalizeQuery(string text)
142154
trimmed = trimmed[CodeFence.Length..].Trim();
143155
}
144156

145-
if (trimmed.EndsWith(CodeFence, StringComparison.Ordinal))
157+
return trimmed.EndsWith(CodeFence, StringComparison.Ordinal)
158+
? trimmed[..^CodeFence.Length].Trim()
159+
: trimmed;
160+
}
161+
162+
private static string? ExtractFencedQuery(string text, string openingFence, StringComparison comparison)
163+
{
164+
var start = text.IndexOf(openingFence, comparison);
165+
if (start < 0)
146166
{
147-
trimmed = trimmed[..^CodeFence.Length].Trim();
167+
return null;
148168
}
149169

150-
return trimmed;
170+
var contentStart = start + openingFence.Length;
171+
var content = text[contentStart..].Trim();
172+
var end = content.IndexOf(CodeFence, StringComparison.Ordinal);
173+
return end >= 0
174+
? content[..end].Trim()
175+
: content;
151176
}
152177

153178
private static NaturalLanguageSparqlQueryKind ResolveQueryKind(string queryText)

src/MarkdownLd.Kb/Query/Sparql/SparqlSafety.cs

Lines changed: 110 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,11 @@ public static class SparqlSafety
1717
private const char SingleQuoteCharacter = '\'';
1818
private const char EscapeCharacter = '\\';
1919
private const char MaskCharacter = ' ';
20+
private const char CommentCharacter = '#';
21+
private const char IriStartCharacter = '<';
22+
private const char IriEndCharacter = '>';
23+
private const char LineFeedCharacter = '\n';
24+
private const char CarriageReturnCharacter = '\r';
2025

2126
private static readonly SparqlQueryParser Parser = new();
2227
private static readonly Regex MutatingKeywordRegex = new(MutatingKeywordPattern, RegexOptions.Compiled | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);
@@ -29,10 +34,6 @@ public static SparqlSafetyResult EnforceReadOnly(string query, int defaultLimit
2934
}
3035

3136
var trimmed = query.Trim();
32-
if (TryGetMutatingKeywordOutsideString(trimmed, out _))
33-
{
34-
return new(false, query, OnlySelectAndAskQueriesAllowedMessage);
35-
}
3637

3738
SparqlQuery parsed;
3839
try
@@ -41,6 +42,11 @@ public static SparqlSafetyResult EnforceReadOnly(string query, int defaultLimit
4142
}
4243
catch (Exception ex)
4344
{
45+
if (TryGetMutatingKeywordOutsideString(trimmed, out _))
46+
{
47+
return new(false, query, OnlySelectAndAskQueriesAllowedMessage);
48+
}
49+
4450
return new(false, query, ex.Message);
4551
}
4652

@@ -72,39 +78,119 @@ or SparqlQueryType.SelectAllDistinct
7278
internal static bool TryGetMutatingKeywordOutsideString(string query, out string? keyword)
7379
{
7480
var masked = query.ToCharArray();
75-
var inString = false;
81+
var state = SparqlMaskState.None;
7682
var quote = MaskCharacter;
7783

7884
for (var index = 0; index < masked.Length; index++)
7985
{
8086
var current = masked[index];
81-
if (inString)
87+
if (TryMaskCurrentCharacter(masked, ref index, current, ref state, ref quote))
8288
{
83-
masked[index] = MaskCharacter;
84-
if (current == EscapeCharacter && index + 1 < masked.Length)
85-
{
86-
masked[++index] = MaskCharacter;
87-
continue;
88-
}
89-
90-
if (current == quote)
91-
{
92-
inString = false;
93-
}
94-
9589
continue;
9690
}
9791

98-
if (current is DoubleQuoteCharacter or SingleQuoteCharacter)
99-
{
100-
inString = true;
101-
quote = current;
102-
masked[index] = MaskCharacter;
103-
}
92+
StartMaskingIfNeeded(masked, index, current, ref state, ref quote);
10493
}
10594

10695
var match = MutatingKeywordRegex.Match(new string(masked));
10796
keyword = match.Success ? match.Value : null;
10897
return match.Success;
10998
}
99+
100+
/// <summary>
/// Hands the current character to the handler of whichever masked region
/// (comment, IRI or string) is currently active.
/// </summary>
/// <param name="masked">Working copy of the query whose characters are blanked in place.</param>
/// <param name="index">Position of <paramref name="current"/>; the string handler may advance it.</param>
/// <param name="current">Character read at <paramref name="index"/> before masking.</param>
/// <param name="state">Active masking state; handlers reset it when their region ends.</param>
/// <param name="quote">Quote character that opened the active string, if any.</param>
/// <returns>
/// <c>true</c> when a region handler processed the character;
/// <c>false</c> when no masked region is active.
/// </returns>
private static bool TryMaskCurrentCharacter(
    char[] masked,
    ref int index,
    char current,
    ref SparqlMaskState state,
    ref char quote)
{
    switch (state)
    {
        case SparqlMaskState.Comment:
            return MaskComment(masked, index, current, ref state);

        case SparqlMaskState.Iri:
            return MaskIri(masked, index, current, ref state);

        case SparqlMaskState.String:
            return MaskString(masked, ref index, current, ref state, ref quote);

        default:
            // Not inside any masked region: the caller decides whether one starts here.
            return false;
    }
}
115+
116+
/// <summary>
/// Inspects a character found outside any masked region and, when it opens a comment,
/// an IRI reference or a string literal, blanks it and switches to the matching state.
/// </summary>
/// <param name="masked">Working copy of the query whose characters are blanked in place.</param>
/// <param name="index">Position of <paramref name="current"/> in <paramref name="masked"/>.</param>
/// <param name="current">Character read at <paramref name="index"/>.</param>
/// <param name="state">Set to the region that starts at this character, if any.</param>
/// <param name="quote">Set to the opening quote character when a string starts.</param>
private static void StartMaskingIfNeeded(
    char[] masked,
    int index,
    char current,
    ref SparqlMaskState state,
    ref char quote)
{
    switch (current)
    {
        case CommentCharacter:
            state = SparqlMaskState.Comment;
            masked[index] = MaskCharacter;
            break;

        // '<' is also the less-than operator (for example FILTER(?x < 5)). Entering the
        // IRI state for an operator would blank everything up to the next '>' - or to the
        // end of the query - and hide keywords from the scan, so only a '<' that opens a
        // well-formed IRI reference starts masking.
        case IriStartCharacter when IsIriReferenceStart(masked, index):
            state = SparqlMaskState.Iri;
            masked[index] = MaskCharacter;
            break;

        case DoubleQuoteCharacter:
        case SingleQuoteCharacter:
            state = SparqlMaskState.String;
            quote = current;
            masked[index] = MaskCharacter;
            break;
    }
}

/// <summary>
/// Checks whether the '&lt;' at <paramref name="start"/> opens an IRI reference, i.e. a closing
/// '&gt;' is reached before any character that the SPARQL IRIREF production forbids.
/// </summary>
/// <param name="masked">Query characters; positions after <paramref name="start"/> are still unmasked.</param>
/// <param name="start">Position of the candidate opening '&lt;'.</param>
/// <returns><c>true</c> when a terminated IRI reference follows; otherwise <c>false</c>.</returns>
private static bool IsIriReferenceStart(char[] masked, int start)
{
    for (var position = start + 1; position < masked.Length; position++)
    {
        var candidate = masked[position];
        if (candidate == IriEndCharacter)
        {
            return true;
        }

        // IRIREF ::= '<' ([^<>"{}|^`\]-[#x00-#x20])* '>'
        if (candidate <= ' '
            || candidate is IriStartCharacter or DoubleQuoteCharacter or EscapeCharacter
                or '{' or '}' or '|' or '^' or '`')
        {
            return false;
        }
    }

    // Ran off the end without a closing '>': treat the '<' as an operator.
    return false;
}
143+
144+
/// <summary>
/// Blanks one character of a comment and leaves the comment state at a line break.
/// </summary>
/// <param name="masked">Working copy of the query whose characters are blanked in place.</param>
/// <param name="index">Position of the character to blank.</param>
/// <param name="current">Character read at <paramref name="index"/> before masking.</param>
/// <param name="state">Reset to <c>None</c> when <paramref name="current"/> is a line feed or carriage return.</param>
/// <returns>Always <c>true</c>.</returns>
private static bool MaskComment(char[] masked, int index, char current, ref SparqlMaskState state)
{
    var endsLine = current == LineFeedCharacter || current == CarriageReturnCharacter;

    // The line break itself is blanked too.
    masked[index] = MaskCharacter;
    if (endsLine)
    {
        state = SparqlMaskState.None;
    }

    return true;
}
154+
155+
/// <summary>
/// Blanks one character of an IRI reference and leaves the IRI state at the closing bracket.
/// </summary>
/// <param name="masked">Working copy of the query whose characters are blanked in place.</param>
/// <param name="index">Position of the character to blank.</param>
/// <param name="current">Character read at <paramref name="index"/> before masking.</param>
/// <param name="state">Reset to <c>None</c> when <paramref name="current"/> is the IRI end character.</param>
/// <returns>Always <c>true</c>.</returns>
private static bool MaskIri(char[] masked, int index, char current, ref SparqlMaskState state)
{
    var closesIri = current == IriEndCharacter;

    // The closing bracket is blanked along with the IRI content.
    masked[index] = MaskCharacter;
    if (closesIri)
    {
        state = SparqlMaskState.None;
    }

    return true;
}
165+
166+
/// <summary>
/// Blanks one character of a string literal, skipping over an escaped character
/// and leaving the string state at the matching closing quote.
/// </summary>
/// <param name="masked">Working copy of the query whose characters are blanked in place.</param>
/// <param name="index">Position of the character to blank; advanced by one when an escape pair is consumed.</param>
/// <param name="current">Character read at <paramref name="index"/> before masking.</param>
/// <param name="state">Reset to <c>None</c> when the closing quote is reached.</param>
/// <param name="quote">Quote that opened the string; reset to the mask character when the string closes.</param>
/// <returns>Always <c>true</c>.</returns>
private static bool MaskString(
    char[] masked,
    ref int index,
    char current,
    ref SparqlMaskState state,
    ref char quote)
{
    masked[index] = MaskCharacter;

    var hasFollowingCharacter = index + 1 < masked.Length;
    if (current == EscapeCharacter && hasFollowingCharacter)
    {
        // Consume the escaped character as well so an escaped quote cannot end the string.
        index++;
        masked[index] = MaskCharacter;
    }
    else if (current == quote)
    {
        state = SparqlMaskState.None;
        quote = MaskCharacter;
    }

    return true;
}
188+
189+
/// <summary>
/// Region of the query text that the keyword-masking scan is currently inside.
/// </summary>
private enum SparqlMaskState
{
    /// <summary>No masked region is active.</summary>
    None = 0,

    /// <summary>Inside a quoted string literal.</summary>
    String = 1,

    /// <summary>Inside a <c>#</c> comment, up to the next line break.</summary>
    Comment = 2,

    /// <summary>Inside an IRI reference, up to the closing <c>&gt;</c>.</summary>
    Iri = 3,
}
110196
}

0 commit comments

Comments
 (0)