Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 91 additions & 45 deletions src/ManagedCode.GraphRag/Community/CommunityBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,53 +23,12 @@ public static IReadOnlyList<CommunityRecord> Build(
return Array.Empty<CommunityRecord>();
}

var adjacency = BuildAdjacency(entities, relationships);
var titleLookup = entities.ToDictionary(entity => entity.Title, StringComparer.OrdinalIgnoreCase);
var random = new Random(config.Seed);

var orderedTitles = titleLookup.Keys
.OrderBy(_ => random.Next())
.ToList();

var visited = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var components = new List<List<string>>();

foreach (var title in orderedTitles)
var components = config.Algorithm switch
{
if (!visited.Add(title))
{
continue;
}

var component = new List<string>();
var queue = new Queue<string>();
queue.Enqueue(title);

while (queue.Count > 0)
{
var current = queue.Dequeue();
component.Add(current);

if (!adjacency.TryGetValue(current, out var neighbors) || neighbors.Count == 0)
{
continue;
}

var orderedNeighbors = neighbors
.OrderBy(_ => random.Next())
.ToList();

foreach (var neighbor in orderedNeighbors)
{
if (visited.Add(neighbor))
{
queue.Enqueue(neighbor);
}
}
}

components.Add(component);
}
CommunityDetectionAlgorithm.FastLabelPropagation => BuildUsingLabelPropagation(entities, relationships, config),
_ => BuildUsingConnectedComponents(entities, relationships, config)
};

if (config.UseLargestConnectedComponent && components.Count > 0)
{
Expand Down Expand Up @@ -183,6 +142,93 @@ public static IReadOnlyList<CommunityRecord> Build(
return communityRecords;
}

private static List<List<string>> BuildUsingConnectedComponents(
IReadOnlyList<EntityRecord> entities,
IReadOnlyList<RelationshipRecord> relationships,
ClusterGraphConfig config)
{
var adjacency = BuildAdjacency(entities, relationships);
var random = new Random(config.Seed);
var orderedTitles = adjacency.Keys
.OrderBy(_ => random.Next())
.ToList();

var visited = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
var components = new List<List<string>>();

foreach (var title in orderedTitles)
{
if (!visited.Add(title))
{
continue;
}

var component = new List<string>();
var queue = new Queue<string>();
queue.Enqueue(title);

while (queue.Count > 0)
{
var current = queue.Dequeue();
component.Add(current);

if (!adjacency.TryGetValue(current, out var neighbors) || neighbors.Count == 0)
{
continue;
}

var orderedNeighbors = neighbors
.OrderBy(_ => random.Next())
.ToList();

foreach (var neighbor in orderedNeighbors)
{
if (visited.Add(neighbor))
{
queue.Enqueue(neighbor);
}
Copy link

Copilot AI Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This foreach loop implicitly filters its target sequence - consider filtering the sequence explicitly using '.Where(...)'.

Suggested change
foreach (var neighbor in orderedNeighbors)
{
if (visited.Add(neighbor))
{
queue.Enqueue(neighbor);
}
foreach (var neighbor in orderedNeighbors.Where(neighbor => visited.Add(neighbor)))
{
queue.Enqueue(neighbor);

Copilot uses AI. Check for mistakes.
}
}

components.Add(component);
}

return components;
}

/// <summary>
/// Builds communities by grouping node titles that received the same label from
/// <see cref="FastLabelPropagationCommunityDetector.AssignLabels"/>.
/// </summary>
/// <param name="entities">Entities forwarded to the label detector.</param>
/// <param name="relationships">Relationships forwarded to the label detector.</param>
/// <param name="config">Clustering configuration forwarded to the label detector.</param>
/// <returns>
/// One list of titles per distinct label (labels compared case-insensitively), each list
/// de-duplicated and sorted case-insensitively; an empty list when no labels were assigned.
/// </returns>
private static List<List<string>> BuildUsingLabelPropagation(
    IReadOnlyList<EntityRecord> entities,
    IReadOnlyList<RelationshipRecord> relationships,
    ClusterGraphConfig config)
{
    var labelByTitle = FastLabelPropagationCommunityDetector.AssignLabels(entities, relationships, config);
    var communities = new List<List<string>>();
    if (labelByTitle.Count == 0)
    {
        return communities;
    }

    // Bucket every title under the label it ended up with.
    var membersByLabel = new Dictionary<string, List<string>>(StringComparer.OrdinalIgnoreCase);
    foreach (var (nodeTitle, label) in labelByTitle)
    {
        if (membersByLabel.TryGetValue(label, out var existing))
        {
            existing.Add(nodeTitle);
        }
        else
        {
            membersByLabel[label] = new List<string> { nodeTitle };
        }
    }

    // Normalise each bucket: drop case-insensitive duplicates, then sort by title.
    foreach (var group in membersByLabel.Values)
    {
        var sortedMembers = group
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .OrderBy(member => member, StringComparer.OrdinalIgnoreCase)
            .ToList();

        if (sortedMembers.Count > 0)
        {
            communities.Add(sortedMembers);
        }
    }

    return communities;
}

private static Dictionary<string, HashSet<string>> BuildAdjacency(
IReadOnlyList<EntityRecord> entities,
IReadOnlyList<RelationshipRecord> relationships)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
using GraphRag.Config;
using GraphRag.Entities;
using GraphRag.Relationships;

namespace GraphRag.Community;

/// <summary>
/// Assigns community labels to graph nodes using weighted label propagation: every node
/// starts with its own title as its label and repeatedly adopts the label that carries the
/// greatest total edge weight among its neighbours.
/// </summary>
internal static class FastLabelPropagationCommunityDetector
{
    /// <summary>
    /// Largest difference at which two accumulated label weights are treated as tied.
    /// </summary>
    private const double TieTolerance = 1e-6;

    /// <summary>
    /// Runs label propagation over the graph described by <paramref name="entities"/> and
    /// <paramref name="relationships"/>.
    /// </summary>
    /// <param name="entities">Entities whose titles become graph nodes.</param>
    /// <param name="relationships">Edges; each is treated as undirected.</param>
    /// <param name="config">
    /// Supplies <c>Seed</c> for the random generator and <c>MaxIterations</c> for the pass
    /// limit (at least one pass is always performed).
    /// </param>
    /// <returns>
    /// A case-insensitive map from node title to its final label; empty when the graph has
    /// no nodes.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public static IReadOnlyDictionary<string, string> AssignLabels(
        IReadOnlyList<EntityRecord> entities,
        IReadOnlyList<RelationshipRecord> relationships,
        ClusterGraphConfig config)
    {
        ArgumentNullException.ThrowIfNull(entities);
        ArgumentNullException.ThrowIfNull(relationships);
        ArgumentNullException.ThrowIfNull(config);

        var adjacency = BuildAdjacency(entities, relationships);
        if (adjacency.Count == 0)
        {
            return new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        }

        var random = new Random(config.Seed);

        // Every node initially forms its own community, labelled with its own title.
        var labels = adjacency.Keys.ToDictionary(node => node, node => node, StringComparer.OrdinalIgnoreCase);
        var nodes = adjacency.Keys.ToList();
        var maxIterations = Math.Max(1, config.MaxIterations);

        for (var iteration = 0; iteration < maxIterations; iteration++)
        {
            // Visit nodes in a fresh seeded-random order on every pass.
            var shuffled = nodes.OrderBy(_ => random.Next()).ToList();
            var changed = false;

            foreach (var node in shuffled)
            {
                var neighbors = adjacency[node];
                if (neighbors.Count == 0)
                {
                    // Isolated nodes keep their own label.
                    continue;
                }

                // Sum the edge weight behind each label currently held by a neighbour.
                var labelWeights = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
                foreach (var (neighbor, weight) in neighbors)
                {
                    if (!labels.TryGetValue(neighbor, out var neighborLabel))
                    {
                        continue;
                    }

                    // Weights that are not strictly positive (including NaN) count as 1.
                    labelWeights[neighborLabel] = labelWeights.GetValueOrDefault(neighborLabel) + (weight > 0 ? weight : 1);
                }

                if (labelWeights.Count == 0)
                {
                    continue;
                }

                var maxWeight = labelWeights.Values.Max();
                var candidates = labelWeights
                    .Where(pair => IsTiedWithMax(pair.Value, maxWeight))
                    .Select(pair => pair.Key)
                    .ToList();

                // Only draw from the random generator when there is a real tie, so the
                // random sequence is not consumed by unambiguous choices.
                var chosen = candidates.Count == 1
                    ? candidates[0]
                    : candidates[random.Next(candidates.Count)];

                if (!string.Equals(labels[node], chosen, StringComparison.OrdinalIgnoreCase))
                {
                    // Updated in place: nodes visited later in this pass see the new label.
                    labels[node] = chosen;
                    changed = true;
                }
            }

            if (!changed)
            {
                // No label moved during a full pass; further passes cannot change anything
                // unless a tie is re-drawn, so stop here.
                break;
            }
        }

        return labels;
    }

    /// <summary>
    /// Determines whether <paramref name="value"/> is within <see cref="TieTolerance"/> of
    /// <paramref name="maxWeight"/>.
    /// </summary>
    /// <remarks>
    /// Exact equality is tested first because <c>Infinity - Infinity</c> is NaN, which fails
    /// every comparison; without it an infinite maximum would match no candidate at all.
    /// </remarks>
    private static bool IsTiedWithMax(double value, double maxWeight)
        => value.Equals(maxWeight) || maxWeight - value < TieTolerance;

    /// <summary>
    /// Builds an undirected, weighted adjacency list keyed by title (case-insensitive).
    /// </summary>
    /// <remarks>
    /// Every entity title gets an entry even when it has no edges, and relationship
    /// endpoints that are not among the entities are added as nodes. A relationship whose
    /// source and target are the same node adds that node to its own list twice.
    /// <c>ToDictionary</c> throws <see cref="ArgumentException"/> when two entities share a
    /// title under case-insensitive comparison.
    /// </remarks>
    private static Dictionary<string, List<(string Neighbor, double Weight)>> BuildAdjacency(
        IReadOnlyList<EntityRecord> entities,
        IReadOnlyList<RelationshipRecord> relationships)
    {
        var adjacency = entities
            .ToDictionary(entity => entity.Title, _ => new List<(string, double)>(), StringComparer.OrdinalIgnoreCase);

        foreach (var relationship in relationships)
        {
            if (!adjacency.TryGetValue(relationship.Source, out var sourceNeighbors))
            {
                sourceNeighbors = new List<(string, double)>();
                adjacency[relationship.Source] = sourceNeighbors;
            }

            if (!adjacency.TryGetValue(relationship.Target, out var targetNeighbors))
            {
                targetNeighbors = new List<(string, double)>();
                adjacency[relationship.Target] = targetNeighbors;
            }

            // Record the edge in both directions.
            sourceNeighbors.Add((relationship.Target, relationship.Weight));
            targetNeighbors.Add((relationship.Source, relationship.Weight));
        }

        return adjacency;
    }
}
13 changes: 13 additions & 0 deletions src/ManagedCode.GraphRag/Config/ClusterGraphConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,17 @@ public sealed class ClusterGraphConfig
/// results deterministic across runs.
/// </summary>
public int Seed { get; set; } = unchecked((int)0xDEADBEEF);

/// <summary>
/// Gets or sets the maximum number of label propagation iterations when the
/// <see cref="CommunityDetectionAlgorithm.FastLabelPropagation"/> algorithm is used.
/// </summary>
public int MaxIterations { get; set; } = 20;

/// <summary>
/// Gets or sets the community detection algorithm. The fast label propagation
/// implementation mirrors the in-process routine provided by GraphRag.Net.
/// </summary>
public CommunityDetectionAlgorithm Algorithm { get; set; }
= CommunityDetectionAlgorithm.FastLabelPropagation;
}
6 changes: 6 additions & 0 deletions src/ManagedCode.GraphRag/Config/Enums.cs
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,9 @@ public enum ModularityMetric
Lcc,
WeightedComponents
}

/// <summary>
/// Identifies the strategy used by <c>CommunityBuilder.Build</c> to group entities into
/// communities.
/// </summary>
public enum CommunityDetectionAlgorithm
{
/// <summary>
/// Groups nodes by the labels produced by <c>FastLabelPropagationCommunityDetector</c>.
/// As the first member this is the enum's zero value, and it is also the default of
/// <c>ClusterGraphConfig.Algorithm</c>.
/// </summary>
FastLabelPropagation,
/// <summary>
/// Groups nodes by breadth-first traversal of the adjacency graph. <c>CommunityBuilder.Build</c>
/// also uses this path for any value other than <see cref="FastLabelPropagation"/>.
/// </summary>
ConnectedComponents
}
2 changes: 2 additions & 0 deletions src/ManagedCode.GraphRag/Config/GraphRagConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ public sealed class GraphRagConfig

public ClusterGraphConfig ClusterGraph { get; set; } = new();

public HeuristicMaintenanceConfig Heuristics { get; set; } = new();

public CommunityReportsConfig CommunityReports { get; set; } = new();

public PromptTuningConfig PromptTuning { get; set; } = new();
Expand Down
84 changes: 84 additions & 0 deletions src/ManagedCode.GraphRag/Config/HeuristicMaintenanceConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
namespace GraphRag.Config;

/// <summary>
/// Represents heuristic controls that fine-tune ingestion and graph maintenance behaviour.
Copy link

Copilot AI Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected spelling of 'behaviour' to 'behavior'.

Suggested change
/// Represents heuristic controls that fine-tune ingestion and graph maintenance behaviour.
/// Represents heuristic controls that fine-tune ingestion and graph maintenance behavior.

Copilot uses AI. Check for mistakes.
/// The defaults mirror the semantics implemented in the GraphRag.Net demo service where
/// ingestion aggressively deduplicates, trims by token budgets, and repairs sparse graphs.
/// </summary>
public sealed class HeuristicMaintenanceConfig
{
/// <summary>
/// Gets or sets a value indicating whether semantic deduplication should be applied
/// to text units produced during ingestion. When enabled, text chunks that are deemed
/// near-duplicates are merged so downstream LLM prompts are not wasted on redundant
/// context. Defaults to <c>true</c>.
/// </summary>
public bool EnableSemanticDeduplication { get; set; } = true;

/// <summary>
/// Gets or sets the cosine similarity threshold used when merging duplicate text units.
/// Values should fall within [0,1]; higher values keep the deduplication stricter.
/// Defaults to <c>0.92</c>.
/// </summary>
public double SemanticDeduplicationThreshold { get; set; } = 0.92;

/// <summary>
/// Gets or sets the maximum number of tokens permitted within a single text unit.
/// Oversized chunks are discarded to keep prompts within model context limits.
/// Defaults to <c>1400</c>.
/// </summary>
public int MaxTokensPerTextUnit { get; set; } = 1400;

/// <summary>
/// Gets or sets the maximum cumulative token budget allocated to each document during
/// ingestion. Set to a value less than or equal to zero to disable document level trimming.
/// Defaults to <c>6000</c>.
/// </summary>
public int MaxDocumentTokenBudget { get; set; } = 6000;

/// <summary>
/// Gets or sets the maximum number of text units that should remain attached to a
/// relationship when persisting graph edges. Excess associations are trimmed to keep
/// follow-up prompts compact. Defaults to <c>6</c>.
/// </summary>
public int MaxTextUnitsPerRelationship { get; set; } = 6;

/// <summary>
/// Gets or sets the minimum amount of overlap (expressed as a ratio) required when linking
/// orphan entities. The ratio compares shared text units against the smaller of the
/// participating entity sets. Defaults to <c>0.2</c>.
/// </summary>
public double OrphanLinkMinimumOverlap { get; set; } = 0.2;

/// <summary>
/// Gets or sets the default weight assigned to synthetic orphan relationships.
/// Defaults to <c>0.35</c>.
/// </summary>
public double OrphanLinkWeight { get; set; } = 0.35;

/// <summary>
/// Gets or sets a value indicating whether relationship heuristics should normalize,
/// validate, and enhance extracted edges. Defaults to <c>true</c>.
/// </summary>
public bool EnhanceRelationships { get; set; } = true;

/// <summary>
/// Gets or sets the minimum weight enforced when relationship heuristics run. Extracted
/// relationships that fall below this floor (after normalization) are bumped up so they
/// remain queryable. Defaults to <c>0.35</c>.
/// </summary>
public double RelationshipConfidenceFloor { get; set; } = 0.35;

/// <summary>
/// Gets or sets the minimum overlap (in tokens) required when chunking source documents.
/// Defaults to <c>80</c>.
/// </summary>
public int MinimumChunkOverlap { get; set; } = 80;

/// <summary>
/// Gets or sets an optional keyed model id used to resolve a text embedding generator.
/// When not supplied, the pipeline falls back to <see cref="TextEmbeddingConfig.ModelId"/>.
/// Defaults to <c>null</c>.
/// </summary>
public string? EmbeddingModelId { get; set; }

/// <summary>
/// Gets or sets a value indicating whether orphan entities should be linked back into the
/// graph using co-occurrence heuristics. Defaults to <c>true</c>.
/// </summary>
public bool LinkOrphanEntities { get; set; } = true;
}
Loading
Loading