Skip to content

Commit 8def203

Browse files
committed
update graph
1 parent 6aa31e2 commit 8def203

13 files changed

Lines changed: 655 additions & 17 deletions

Directory.Packages.props

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
<ItemGroup>
77
<PackageVersion Include="dotNetRdf" Version="3.5.1" />
8+
<PackageVersion Include="dotNetRdf.Shacl" Version="3.5.1" />
89
<PackageVersion Include="DotNet.ReproducibleBuilds" Version="1.2.25" />
910
<PackageVersion Include="Markdig" Version="1.1.2" />
1011
<PackageVersion Include="Microsoft.Extensions.AI" Version="10.4.1" />

docs/ADR/ADR-0001-rdf-sparql-library.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Related Features: `docs/Architecture.md`
1111
- [x] Analyze upstream graph stack and .NET options.
1212
- [x] Choose the RDF/SPARQL dependency.
1313
- [x] Add dotNetRDF to the production project.
14+
- [x] Add `dotNetRdf.Shacl` now that graph validation is a first-class library boundary.
1415
- [x] Add flow tests that query generated graphs through SPARQL.
1516
- [x] Run build, test, format, and coverage commands.
1617
- [x] Update `docs/Architecture.md` if dependency boundaries change.
@@ -41,12 +42,13 @@ Non-goals:
4142

4243
## Decision
4344

44-
Use dotNetRDF as the RDF graph, serialization, and SPARQL engine for the first .NET implementation slice.
45+
Use dotNetRDF as the RDF graph, serialization, SPARQL, and SHACL validation engine for the .NET implementation.
4546

4647
Key points:
4748

4849
- dotNetRDF replaces Python RDFLib for the C# port.
4950
- The selected package supports RDF/SPARQL in .NET and the user guide documents in-memory RDF data and in-memory SPARQL querying, which matches the no-server core runtime boundary.
51+
- The `dotNetRdf.Shacl` package provides a SHACL processor over in-memory RDF graphs, which keeps validation standards-based and local.
5052
- Markdig and YamlDotNet will handle Markdown/front matter parsing separately.
5153
- AI extraction remains behind an extraction port that uses `Microsoft.Extensions.AI.IChatClient`; provider/orchestration packages are not part of this RDF dependency decision.
5254

@@ -64,6 +66,7 @@ flowchart LR
6466
None --> Builder
6567
Builder --> DotNetRdf["dotNetRDF graph"]
6668
DotNetRdf --> Sparql["Local SPARQL execution"]
69+
DotNetRdf --> Shacl["Local SHACL validation"]
6770
DotNetRdf --> Turtle["Turtle writer"]
6871
DotNetRdf --> JsonLd["JSON-LD writer"]
6972
```
@@ -99,13 +102,15 @@ flowchart LR
99102
### Negative / risks
100103

101104
- The core library takes a dependency on dotNetRDF APIs.
105+
- SHACL validation uses dotNetRDF report objects internally, but public results stay in repository-owned models.
102106
- JSON-LD support may require a specific package shape or writer availability in the selected version.
103107
- Performance characteristics are inherited from dotNetRDF and must be measured before promising large-scale query throughput.
104108

105109
Mitigations:
106110

107111
- Hide dependency details behind `KnowledgeGraph` query methods, `KnowledgeSearchService`, and serialization methods where practical.
108112
- Add tests for serialization and SPARQL query paths.
113+
- Add tests for SHACL conformance and violation report paths.
109114
- Keep remote/federated SPARQL out of the first slice.
110115

111116
## Impact
@@ -151,6 +156,7 @@ Mitigations:
151156
- Serialize the graph and parse/inspect the output.
152157
- Negative flows:
153158
- Reject mutating SPARQL operations.
159+
- Detect malformed graph metadata through SHACL validation reports.
154160
- Default no-extractor mode.
155161
- Explicit Tiktoken token-distance mode.
156162
- Edge flows:
@@ -183,6 +189,7 @@ Mitigations:
183189
| TST-003 | Mutating SPARQL query | Integration | Rejected before execution | Safety |
184190
| TST-004 | Empty input | Integration | Empty graph and empty search results | Edge |
185191
| TST-005 | Malformed assertion syntax | Integration | Ignored without graph corruption | Negative |
192+
| TST-006 | SHACL validation | Integration | Valid graphs conform and invalid graph metadata produces SHACL violations | Standards validation |
186193

187194
## Rollout and migration
188195

@@ -196,3 +203,4 @@ No migration exists yet. This is the initial implementation decision.
196203
- `external/lqdev-markdown-ld-kb/.ai-memex/blog-post-zero-cost-knowledge-graph-from-markdown.md`
197204
- dotNetRDF upstream repository: `https://github.com/dotnetrdf/dotnetrdf`
198205
- dotNetRDF user guide: `https://dotnetrdf.org/docs/stable/user_guide/index.html`
206+
- SHACL validation feature: `docs/Features/GraphShaclValidation.md`

docs/Architecture.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Markdown-LD Knowledge Bank Architecture
22

3-
Date: 2026-04-11
3+
Date: 2026-04-15
44

55
## Purpose
66

@@ -32,6 +32,7 @@ flowchart LR
3232
Graph --> Sparql["In-memory SPARQL executor API"]
3333
Graph --> Search["In-memory graph search API"]
3434
Graph --> Focused["Focused graph search API"]
35+
Graph --> Shacl["SHACL validation API"]
3536
Graph --> Serializers["Turtle and JSON-LD serializers"]
3637
Graph --> Merge["Thread-safe graph merge API"]
3738
IChatClient["Microsoft.Extensions.AI IChatClient"] --> ChatExtractor
@@ -50,6 +51,7 @@ sequenceDiagram
5051
participant Chat as IChatClient
5152
participant Tokenizer as Tiktoken tokenizer
5253
participant Graph as KnowledgeGraphBuilder
54+
participant BuiltGraph as KnowledgeGraph
5355
participant Query as InMemorySparqlExecutor
5456
5557
Caller->>Pipeline: BuildAsync(documents, options)
@@ -69,6 +71,9 @@ sequenceDiagram
6971
Router-->>Pipeline: Entities, assertions, optional token index
7072
Pipeline->>Graph: Add facts as RDF triples
7173
Graph-->>Pipeline: In-memory KnowledgeGraph
74+
Pipeline-->>Caller: MarkdownKnowledgeBuildResult
75+
Caller->>BuiltGraph: ValidateShacl(optional shapes)
76+
BuiltGraph-->>Caller: SHACL validation report
7277
Caller->>Query: ExecuteSelect(graph, sparql)
7378
Query-->>Caller: SPARQL bindings
7479
```
@@ -84,6 +89,7 @@ flowchart TB
8489
Tokens["Tiktoken: subword TF-IDF vectors, keyphrase topics, explicit entity hints, and token-distance search"]
8590
Rules["Capability rules: graph_entities, graph_edges, graph_groups, graph_related, graph_next_steps"]
8691
Rdf["RDF: graph construction, namespaces, serialization"]
92+
Shacl["SHACL: default shapes, validation reports, assertion metadata"]
8793
Query["Query: SPARQL and graph search"]
8894
end
8995
@@ -99,6 +105,7 @@ flowchart TB
99105
FlowTests --> Tokens
100106
FlowTests --> Rules
101107
FlowTests --> Rdf
108+
FlowTests --> Shacl
102109
FlowTests --> Query
103110
```
104111

@@ -138,6 +145,7 @@ flowchart LR
138145

139146
- Parsing depends on Markdig and YamlDotNet.
140147
- RDF graph building and SPARQL execution depend on dotNetRDF.
148+
- SHACL validation depends on `dotNetRdf.Shacl` and runs against the in-memory graph through `VDS.RDF.Shacl.ShapesGraph`.
141149
- LLM extraction depends on `Microsoft.Extensions.AI.Abstractions` and accepts `IChatClient`.
142150
- Tiktoken extraction depends on `Microsoft.ML.Tokenizers` and the O200k data package. It uses tokenizer IDs and Unicode word n-gram keyphrase candidates only, and does not add an embedding provider. The default vector weighting is subword TF-IDF fitted over the current build corpus.
143151
- Embeddings are not required for the core graph build/query flow.
@@ -154,6 +162,7 @@ Required first-slice scenarios:
154162
- Empty Markdown input produces an empty graph without throwing.
155163
- Explicit Tiktoken mode builds section/segment/topic/entity-hint nodes plus `schema:hasPart`, `schema:about`, `schema:mentions`, and token-distance `kb:relatedTo` edges without network access.
156164
- Capability graph rules build `kb:memberOf`, `kb:relatedTo`, and `kb:nextStep` workflow edges from Markdown front matter or caller options, and focused search returns primary, related, and next-step result groups.
165+
- SHACL validation uses default Markdown-LD Knowledge Bank shapes or caller-supplied shapes, and assertion confidence/provenance metadata is represented as RDF statements so validation remains RDF-native.
157166
- English, Ukrainian, French, and German queries over same-language token graphs produce a higher hit rate than cross-language translated-topic queries.
158167
- Term frequency, binary presence, and subword TF-IDF token weighting modes are covered by focused and flow tests.
159168
- SPARQL mutating queries are rejected before execution.
@@ -183,3 +192,4 @@ Coverage requirement: 95%+ line coverage for changed production code.
183192
- LLM extraction dependency decision: `docs/ADR/ADR-0002-llm-extraction-ichatclient.md`
184193
- Capability graph rules decision: `docs/ADR/ADR-0004-capability-graph-rules.md`
185194
- Capability graph rules feature: `docs/Features/CapabilityGraphRules.md`
195+
- SHACL validation feature: `docs/Features/GraphShaclValidation.md`
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Graph SHACL Validation
2+
3+
Date: 2026-04-15
4+
5+
## Purpose
6+
7+
Markdown-LD Knowledge Bank validates built RDF graphs with SHACL so callers can detect malformed graph construction through a standards-based report instead of custom post-processing.
8+
9+
The feature uses `dotNetRdf.Shacl` over the in-memory `KnowledgeGraph`. It does not add a server, database, cache, provider SDK, or Python runtime.
10+
11+
## Flow
12+
13+
```mermaid
14+
flowchart LR
15+
Markdown["Markdown documents"] --> Pipeline["MarkdownKnowledgePipeline"]
16+
Pipeline --> Facts["Merged entities and assertions"]
17+
Facts --> Reification["Direct RDF edges plus rdf:Statement metadata"]
18+
Reification --> Graph["KnowledgeGraph"]
19+
Shapes["Default or caller SHACL shapes"] --> Validator["dotNetRDF ShapesGraph"]
20+
Graph --> Validator
21+
Validator --> Report["KnowledgeGraphShaclValidationReport"]
22+
Report --> Caller["Caller"]
23+
```
24+
25+
## Default Shapes
26+
27+
The built-in shapes graph validates:
28+
29+
- `schema:Article` nodes have `schema:name` and IRI provenance.
30+
- common entity classes have `schema:name`.
31+
- `schema:sameAs` values are IRIs.
32+
- `prov:wasDerivedFrom` values are IRIs.
33+
- reified `rdf:Statement` assertion metadata has one IRI each for `rdf:subject`, `rdf:predicate`, and `rdf:object`, plus a decimal `kb:confidence` from 0 through 1.
34+
35+
Callers can pass custom Turtle SHACL shapes to `KnowledgeGraph.ValidateShacl(shapesTurtle)` or `MarkdownKnowledgeBuildResult.ValidateShacl(shapesTurtle)`.
36+
37+
## Assertion Metadata
38+
39+
Graph assertions remain direct RDF edges for existing SPARQL/search callers. Each assertion also receives RDF reification metadata:
40+
41+
```mermaid
42+
flowchart TB
43+
Subject["subject IRI"] -->|"direct predicate"| Object["object IRI"]
44+
Statement["blank rdf:Statement"] -->|"rdf:subject"| Subject
45+
Statement -->|"rdf:predicate"| Predicate["predicate IRI"]
46+
Statement -->|"rdf:object"| Object
47+
Statement -->|"kb:confidence"| Confidence["xsd:decimal"]
48+
Statement -->|"prov:wasDerivedFrom"| Source["source IRI or invalid literal"]
49+
```
50+
51+
Invalid caller-authored `sameAs` and provenance values are kept in the graph as literals rather than silently dropped, so SHACL can report them as node-kind violations.
52+
53+
## Testing Methodology
54+
55+
Flow tests cover:
56+
57+
- valid Markdown and configured graph rules conform to the default shapes;
58+
- invalid `schema:sameAs`, provenance, and assertion confidence produce SHACL results;
59+
- caller-supplied shapes validate the same built graph;
60+
- sameAs-first entity merge rewrites assertion endpoints before validation.
61+
62+
Verification commands:
63+
64+
- `dotnet build MarkdownLd.Kb.slnx --no-restore`
65+
- `dotnet test --solution MarkdownLd.Kb.slnx --configuration Release`
66+
- `dotnet format MarkdownLd.Kb.slnx --verify-no-changes`
67+
- coverage command from the root `AGENTS.md`

src/MarkdownLd.Kb/MarkdownLd.Kb.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
<ItemGroup>
1414
<PackageReference Include="dotNetRdf" />
15+
<PackageReference Include="dotNetRdf.Shacl" />
1516
<PackageReference Include="Markdig" />
1617
<PackageReference Include="Microsoft.Extensions.AI" />
1718
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" />

src/MarkdownLd.Kb/Pipeline/KnowledgeFactMerger.cs

Lines changed: 80 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,28 @@ public KnowledgeExtractionResult Merge(params KnowledgeExtractionResult[] result
1212

1313
var entities = new Dictionary<string, KnowledgeEntityFact>(StringComparer.OrdinalIgnoreCase);
1414
var assertions = new Dictionary<string, KnowledgeAssertionFact>(StringComparer.OrdinalIgnoreCase);
15+
var entityAliases = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
16+
var sameAsAliases = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
17+
var pendingAssertions = new List<KnowledgeAssertionFact>();
1518

1619
foreach (var result in results)
1720
{
1821
ArgumentNullException.ThrowIfNull(result);
1922

2023
foreach (var entity in result.Entities)
2124
{
22-
UpsertEntity(entities, CanonicalizeEntity(entity));
25+
UpsertEntity(entities, entityAliases, sameAsAliases, CanonicalizeEntity(entity));
2326
}
2427

25-
foreach (var assertion in result.Assertions)
28+
pendingAssertions.AddRange(result.Assertions);
29+
}
30+
31+
foreach (var assertion in pendingAssertions)
32+
{
33+
var canonical = RewriteAssertionAliases(CanonicalizeAssertion(assertion), entityAliases);
34+
if (IsValidAssertion(canonical))
2635
{
27-
var canonical = CanonicalizeAssertion(assertion);
28-
if (IsValidAssertion(canonical))
29-
{
30-
UpsertAssertion(assertions, canonical);
31-
}
36+
UpsertAssertion(assertions, canonical);
3237
}
3338
}
3439

@@ -94,12 +99,42 @@ private static bool IsValidAssertion(KnowledgeAssertionFact assertion)
9499
!string.IsNullOrWhiteSpace(assertion.ObjectId);
95100
}
96101

97-
private static void UpsertEntity(IDictionary<string, KnowledgeEntityFact> entities, KnowledgeEntityFact entity)
102+
private static KnowledgeAssertionFact RewriteAssertionAliases(
103+
KnowledgeAssertionFact assertion,
104+
IReadOnlyDictionary<string, string> entityAliases)
105+
{
106+
return assertion with
107+
{
108+
SubjectId = ResolveEntityAlias(assertion.SubjectId, entityAliases),
109+
ObjectId = ShouldRewriteObject(assertion)
110+
? ResolveEntityAlias(assertion.ObjectId, entityAliases)
111+
: assertion.ObjectId,
112+
};
113+
}
114+
115+
private static bool ShouldRewriteObject(KnowledgeAssertionFact assertion)
116+
{
117+
return !assertion.Predicate.Equals(ExpectedSchemaSameAs, StringComparison.OrdinalIgnoreCase);
118+
}
119+
120+
private static string ResolveEntityAlias(string nodeId, IReadOnlyDictionary<string, string> entityAliases)
98121
{
99-
var key = entity.Id ?? entity.Label;
122+
return entityAliases.TryGetValue(nodeId, out var canonicalId)
123+
? canonicalId
124+
: nodeId;
125+
}
126+
127+
private static void UpsertEntity(
128+
IDictionary<string, KnowledgeEntityFact> entities,
129+
IDictionary<string, string> entityAliases,
130+
IDictionary<string, string> sameAsAliases,
131+
KnowledgeEntityFact entity)
132+
{
133+
var key = ResolveEntityKey(sameAsAliases, entity);
100134
if (!entities.TryGetValue(key, out var existing))
101135
{
102-
entities[key] = entity;
136+
entities[key] = entity with { Id = key };
137+
IndexEntityAliases(entityAliases, sameAsAliases, key, entity);
103138
return;
104139
}
105140

@@ -111,6 +146,41 @@ private static void UpsertEntity(IDictionary<string, KnowledgeEntityFact> entiti
111146
Confidence = Math.Max(existing.Confidence, entity.Confidence),
112147
Source = string.IsNullOrWhiteSpace(existing.Source) ? entity.Source : existing.Source,
113148
};
149+
IndexEntityAliases(entityAliases, sameAsAliases, key, entities[key]);
150+
IndexEntityAliases(entityAliases, sameAsAliases, key, entity);
151+
}
152+
153+
private static string ResolveEntityKey(
154+
IDictionary<string, string> sameAsAliases,
155+
KnowledgeEntityFact entity)
156+
{
157+
foreach (var sameAs in entity.SameAs)
158+
{
159+
if (sameAsAliases.TryGetValue(sameAs, out var existingKey))
160+
{
161+
return existingKey;
162+
}
163+
}
164+
165+
return entity.Id ?? entity.Label;
166+
}
167+
168+
private static void IndexEntityAliases(
169+
IDictionary<string, string> entityAliases,
170+
IDictionary<string, string> sameAsAliases,
171+
string key,
172+
KnowledgeEntityFact entity)
173+
{
174+
if (!string.IsNullOrWhiteSpace(entity.Id))
175+
{
176+
entityAliases[entity.Id] = key;
177+
}
178+
179+
foreach (var sameAs in entity.SameAs)
180+
{
181+
entityAliases[sameAs] = key;
182+
sameAsAliases[sameAs] = key;
183+
}
114184
}
115185

116186
private static void UpsertAssertion(IDictionary<string, KnowledgeAssertionFact> assertions, KnowledgeAssertionFact assertion)

0 commit comments

Comments
 (0)