Skip to content

Commit 0109b41

Browse files
committed
Add heuristic ingestion and community detection upgrades
1 parent f02f5c6 commit 0109b41

13 files changed

+938
-46
lines changed

src/ManagedCode.GraphRag/Community/CommunityBuilder.cs

Lines changed: 91 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -23,53 +23,12 @@ public static IReadOnlyList<CommunityRecord> Build(
2323
return Array.Empty<CommunityRecord>();
2424
}
2525

26-
var adjacency = BuildAdjacency(entities, relationships);
2726
var titleLookup = entities.ToDictionary(entity => entity.Title, StringComparer.OrdinalIgnoreCase);
28-
var random = new Random(config.Seed);
29-
30-
var orderedTitles = titleLookup.Keys
31-
.OrderBy(_ => random.Next())
32-
.ToList();
33-
34-
var visited = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
35-
var components = new List<List<string>>();
36-
37-
foreach (var title in orderedTitles)
27+
var components = config.Algorithm switch
3828
{
39-
if (!visited.Add(title))
40-
{
41-
continue;
42-
}
43-
44-
var component = new List<string>();
45-
var queue = new Queue<string>();
46-
queue.Enqueue(title);
47-
48-
while (queue.Count > 0)
49-
{
50-
var current = queue.Dequeue();
51-
component.Add(current);
52-
53-
if (!adjacency.TryGetValue(current, out var neighbors) || neighbors.Count == 0)
54-
{
55-
continue;
56-
}
57-
58-
var orderedNeighbors = neighbors
59-
.OrderBy(_ => random.Next())
60-
.ToList();
61-
62-
foreach (var neighbor in orderedNeighbors)
63-
{
64-
if (visited.Add(neighbor))
65-
{
66-
queue.Enqueue(neighbor);
67-
}
68-
}
69-
}
70-
71-
components.Add(component);
72-
}
29+
CommunityDetectionAlgorithm.FastLabelPropagation => BuildUsingLabelPropagation(entities, relationships, config),
30+
_ => BuildUsingConnectedComponents(entities, relationships, config)
31+
};
7332

7433
if (config.UseLargestConnectedComponent && components.Count > 0)
7534
{
@@ -183,6 +142,93 @@ public static IReadOnlyList<CommunityRecord> Build(
183142
return communityRecords;
184143
}
185144

145+
/// <summary>
/// Splits the graph into connected components using a breadth-first traversal whose
/// visiting order is randomised by a generator seeded with <see cref="ClusterGraphConfig.Seed"/>.
/// </summary>
/// <param name="entities">Entities handed to <c>BuildAdjacency</c> to form the graph nodes.</param>
/// <param name="relationships">Relationships handed to <c>BuildAdjacency</c> to form the graph edges.</param>
/// <param name="config">Clustering configuration; only the seed is read here.</param>
/// <returns>
/// One list per component, each holding node titles in the order the traversal reached them.
/// </returns>
private static List<List<string>> BuildUsingConnectedComponents(
    IReadOnlyList<EntityRecord> entities,
    IReadOnlyList<RelationshipRecord> relationships,
    ClusterGraphConfig config)
{
    var graph = BuildAdjacency(entities, relationships);
    var rng = new Random(config.Seed);

    // Start nodes are shuffled once up front, before any traversal begins.
    var startNodes = Shuffle(graph.Keys);

    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var result = new List<List<string>>();

    foreach (var start in startNodes)
    {
        // Nodes already absorbed into an earlier component are not traversal roots.
        if (seen.Add(start))
        {
            result.Add(Traverse(start));
        }
    }

    return result;

    // Materialises eagerly so all random draws for a shuffle happen at the call site.
    List<string> Shuffle(IEnumerable<string> items)
    {
        return items
            .OrderBy(_ => rng.Next())
            .ToList();
    }

    // Breadth-first walk from an already-marked start node; returns everything it reaches.
    List<string> Traverse(string start)
    {
        var members = new List<string>();
        var frontier = new Queue<string>();
        frontier.Enqueue(start);

        while (frontier.TryDequeue(out var current))
        {
            members.Add(current);

            if (graph.TryGetValue(current, out var linked) && linked.Count > 0)
            {
                foreach (var neighbor in Shuffle(linked))
                {
                    if (seen.Add(neighbor))
                    {
                        frontier.Enqueue(neighbor);
                    }
                }
            }
        }

        return members;
    }
}
198+
199+
/// <summary>
/// Builds communities by grouping titles that share a label in the assignment returned by
/// <see cref="FastLabelPropagationCommunityDetector.AssignLabels"/>.
/// </summary>
/// <param name="entities">Entities forwarded to the label propagation detector.</param>
/// <param name="relationships">Relationships forwarded to the label propagation detector.</param>
/// <param name="config">Clustering configuration forwarded to the label propagation detector.</param>
/// <returns>
/// One list per label; members are de-duplicated and sorted case-insensitively. An empty
/// list is returned when the detector yields no assignments.
/// </returns>
private static List<List<string>> BuildUsingLabelPropagation(
    IReadOnlyList<EntityRecord> entities,
    IReadOnlyList<RelationshipRecord> relationships,
    ClusterGraphConfig config)
{
    var comparer = StringComparer.OrdinalIgnoreCase;
    var labelByTitle = FastLabelPropagationCommunityDetector.AssignLabels(entities, relationships, config);
    var communities = new List<List<string>>();

    if (labelByTitle.Count == 0)
    {
        return communities;
    }

    // Bucket every title under the label it ended up with.
    var membersByLabel = new Dictionary<string, List<string>>(comparer);
    foreach (var (title, label) in labelByTitle)
    {
        if (membersByLabel.TryGetValue(label, out var bucket))
        {
            bucket.Add(title);
        }
        else
        {
            membersByLabel[label] = new List<string> { title };
        }
    }

    // Normalise each bucket: unique titles, sorted case-insensitively.
    foreach (var bucket in membersByLabel.Values)
    {
        var sorted = bucket
            .Distinct(comparer)
            .OrderBy(name => name, comparer)
            .ToList();

        if (sorted.Count > 0)
        {
            communities.Add(sorted);
        }
    }

    return communities;
}
231+
186232
private static Dictionary<string, HashSet<string>> BuildAdjacency(
187233
IReadOnlyList<EntityRecord> entities,
188234
IReadOnlyList<RelationshipRecord> relationships)
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
using GraphRag.Config;
2+
using GraphRag.Entities;
3+
using GraphRag.Relationships;
4+
5+
namespace GraphRag.Community;
6+
7+
/// <summary>
/// Assigns community labels to graph nodes with weighted label propagation: every node
/// starts with its own title as label and repeatedly adopts the label carrying the largest
/// total edge weight among its neighbours.
/// </summary>
internal static class FastLabelPropagationCommunityDetector
{
    /// <summary>
    /// Absolute tolerance below which two accumulated label weights are treated as tied.
    /// </summary>
    private const double TieTolerance = 1e-6;

    /// <summary>
    /// Runs label propagation over the graph formed by <paramref name="entities"/> and
    /// <paramref name="relationships"/>.
    /// </summary>
    /// <param name="entities">Entities whose titles seed the node set.</param>
    /// <param name="relationships">
    /// Edges of the graph. Endpoints that are not entity titles are added as nodes. Weights
    /// that are not greater than zero (including NaN) count as 1.
    /// </param>
    /// <param name="config">
    /// Supplies the random seed and the iteration cap; at least one iteration always runs.
    /// </param>
    /// <returns>
    /// A case-insensitive map from node title to its final label. Empty when the graph has
    /// no nodes.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static IReadOnlyDictionary<string, string> AssignLabels(
        IReadOnlyList<EntityRecord> entities,
        IReadOnlyList<RelationshipRecord> relationships,
        ClusterGraphConfig config)
    {
        ArgumentNullException.ThrowIfNull(entities);
        ArgumentNullException.ThrowIfNull(relationships);
        ArgumentNullException.ThrowIfNull(config);

        var adjacency = BuildAdjacency(entities, relationships);
        if (adjacency.Count == 0)
        {
            return new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        }

        var random = new Random(config.Seed);
        var labels = adjacency.Keys.ToDictionary(node => node, node => node, StringComparer.OrdinalIgnoreCase);
        var nodes = adjacency.Keys.ToList();
        var maxIterations = Math.Max(1, config.MaxIterations);

        for (var iteration = 0; iteration < maxIterations; iteration++)
        {
            // Visit nodes in a fresh seeded-random order on every pass.
            var shuffled = nodes.OrderBy(_ => random.Next()).ToList();
            var changed = false;

            foreach (var node in shuffled)
            {
                var neighbors = adjacency[node];
                if (neighbors.Count == 0)
                {
                    // Isolated nodes keep their own label.
                    continue;
                }

                // Sum edge weights per neighbouring label. Labels are updated in place, so
                // nodes visited later in a pass observe changes made earlier in that pass.
                var labelWeights = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
                foreach (var (neighbor, weight) in neighbors)
                {
                    if (!labels.TryGetValue(neighbor, out var neighborLabel))
                    {
                        continue;
                    }

                    labelWeights[neighborLabel] = labelWeights.GetValueOrDefault(neighborLabel) + (weight > 0 ? weight : 1);
                }

                if (labelWeights.Count == 0)
                {
                    continue;
                }

                var maxWeight = labelWeights.Values.Max();

                // Exact equality is tested first: if a weight is +Infinity the subtraction
                // below yields NaN, which would otherwise leave the candidate list empty
                // and make the indexer further down throw.
                var candidates = labelWeights
                    .Where(pair => pair.Value == maxWeight || Math.Abs(pair.Value - maxWeight) < TieTolerance)
                    .Select(pair => pair.Key)
                    .ToList();

                // Ties are broken with the seeded generator; a single winner draws nothing.
                var chosen = candidates.Count == 1
                    ? candidates[0]
                    : candidates[random.Next(candidates.Count)];

                if (!string.Equals(labels[node], chosen, StringComparison.OrdinalIgnoreCase))
                {
                    labels[node] = chosen;
                    changed = true;
                }
            }

            if (!changed)
            {
                // A full pass without any relabelling means the assignment is stable.
                break;
            }
        }

        return labels;
    }

    /// <summary>
    /// Builds an undirected, weighted adjacency list keyed case-insensitively by title.
    /// Each relationship contributes one entry to both of its endpoints.
    /// </summary>
    private static Dictionary<string, List<(string Neighbor, double Weight)>> BuildAdjacency(
        IReadOnlyList<EntityRecord> entities,
        IReadOnlyList<RelationshipRecord> relationships)
    {
        var adjacency = entities
            .ToDictionary(entity => entity.Title, _ => new List<(string, double)>(), StringComparer.OrdinalIgnoreCase);

        foreach (var relationship in relationships)
        {
            // Endpoints missing from the entity list still become nodes.
            if (!adjacency.TryGetValue(relationship.Source, out var sourceNeighbors))
            {
                sourceNeighbors = new List<(string, double)>();
                adjacency[relationship.Source] = sourceNeighbors;
            }

            if (!adjacency.TryGetValue(relationship.Target, out var targetNeighbors))
            {
                targetNeighbors = new List<(string, double)>();
                adjacency[relationship.Target] = targetNeighbors;
            }

            sourceNeighbors.Add((relationship.Target, relationship.Weight));
            targetNeighbors.Add((relationship.Source, relationship.Weight));
        }

        return adjacency;
    }
}

src/ManagedCode.GraphRag/Config/ClusterGraphConfig.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,17 @@ public sealed class ClusterGraphConfig
2222
/// results deterministic across runs.
2323
/// </summary>
2424
public int Seed { get; set; } = unchecked((int)0xDEADBEEF);
25+
26+
/// <summary>
27+
/// Gets or sets the maximum number of label propagation iterations when the
28+
/// <see cref="CommunityDetectionAlgorithm.FastLabelPropagation"/> algorithm is used.
29+
/// </summary>
30+
public int MaxIterations { get; set; } = 20;
31+
32+
/// <summary>
33+
/// Gets or sets the community detection algorithm. The fast label propagation
34+
/// implementation mirrors the in-process routine provided by GraphRag.Net.
35+
/// </summary>
36+
public CommunityDetectionAlgorithm Algorithm { get; set; }
37+
= CommunityDetectionAlgorithm.FastLabelPropagation;
2538
}

src/ManagedCode.GraphRag/Config/Enums.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,9 @@ public enum ModularityMetric
6363
Lcc,
6464
WeightedComponents
6565
}
66+
67+
/// <summary>
/// Identifies the strategy used to partition the entity graph into communities.
/// </summary>
public enum CommunityDetectionAlgorithm
{
    /// <summary>
    /// Communities are derived from label propagation. This is the zero value of the enum.
    /// </summary>
    FastLabelPropagation = 0,

    /// <summary>
    /// Communities are the connected components of the graph.
    /// </summary>
    ConnectedComponents = 1
}

src/ManagedCode.GraphRag/Config/GraphRagConfig.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ public sealed class GraphRagConfig
4242

4343
public ClusterGraphConfig ClusterGraph { get; set; } = new();
4444

45+
public HeuristicMaintenanceConfig Heuristics { get; set; } = new();
46+
4547
public CommunityReportsConfig CommunityReports { get; set; } = new();
4648

4749
public PromptTuningConfig PromptTuning { get; set; } = new();
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
namespace GraphRag.Config;
2+
3+
/// <summary>
4+
/// Represents heuristic controls that fine-tune ingestion and graph maintenance behaviour.
5+
/// The defaults mirror the semantics implemented in the GraphRag.Net demo service where
6+
/// ingestion aggressively deduplicates, trims by token budgets, and repairs sparse graphs.
7+
/// </summary>
8+
public sealed class HeuristicMaintenanceConfig
{
    /// <summary>
    /// Gets or sets whether text units produced during ingestion are semantically
    /// deduplicated. When on, near-duplicate chunks are merged so that downstream LLM
    /// prompts do not spend context on redundant text.
    /// </summary>
    public bool EnableSemanticDeduplication { get; set; } = true;

    /// <summary>
    /// Gets or sets the cosine similarity at or above which text units are merged as
    /// duplicates. Expected range is [0,1]; a higher value makes deduplication stricter.
    /// </summary>
    public double SemanticDeduplicationThreshold { get; set; } = 0.92;

    /// <summary>
    /// Gets or sets the token ceiling for a single text unit. Chunks above the ceiling are
    /// discarded so prompts stay within model context limits.
    /// </summary>
    public int MaxTokensPerTextUnit { get; set; } = 1_400;

    /// <summary>
    /// Gets or sets the cumulative token budget granted to each document during ingestion.
    /// A value of zero or less switches document-level trimming off.
    /// </summary>
    public int MaxDocumentTokenBudget { get; set; } = 6_000;

    /// <summary>
    /// Gets or sets how many text units may stay attached to a relationship when graph
    /// edges are persisted. Surplus associations are trimmed to keep follow-up prompts
    /// compact.
    /// </summary>
    public int MaxTextUnitsPerRelationship { get; set; } = 6;

    /// <summary>
    /// Gets or sets the minimum overlap ratio required before orphan entities are linked.
    /// The ratio measures shared text units against the smaller of the participating
    /// entity sets.
    /// </summary>
    public double OrphanLinkMinimumOverlap { get; set; } = 0.2;

    /// <summary>
    /// Gets or sets the weight given by default to synthetic orphan relationships.
    /// </summary>
    public double OrphanLinkWeight { get; set; } = 0.35;

    /// <summary>
    /// Gets or sets whether relationship heuristics normalise, validate, and enhance
    /// extracted edges.
    /// </summary>
    public bool EnhanceRelationships { get; set; } = true;

    /// <summary>
    /// Gets or sets the weight floor applied when relationship heuristics run. Extracted
    /// relationships that land below it after normalisation are raised to the floor so
    /// they remain queryable.
    /// </summary>
    public double RelationshipConfidenceFloor { get; set; } = 0.35;

    /// <summary>
    /// Gets or sets the minimum overlap, in tokens, required when source documents are
    /// chunked.
    /// </summary>
    public int MinimumChunkOverlap { get; set; } = 80;

    /// <summary>
    /// Gets or sets an optional keyed model id for resolving a text embedding generator.
    /// When left unset, the pipeline falls back to <see cref="TextEmbeddingConfig.ModelId"/>.
    /// </summary>
    public string? EmbeddingModelId { get; set; }

    /// <summary>
    /// Gets or sets whether orphan entities are linked back into the graph through
    /// co-occurrence heuristics.
    /// </summary>
    public bool LinkOrphanEntities { get; set; } = true;
}

0 commit comments

Comments
 (0)