Skip to content

Commit 3cc9ea2

Browse files
committed
Fix Louvain community detection O(N^2) bottleneck
Replace the naive modularityGain, which scanned every node on each call, with per-community commSumTot accumulators that are updated incrementally as nodes move between communities. Each iteration now costs O(m) instead of O(N^2). Django benchmark: the communities pass drops from 4m40s to 0.94s (297x faster), and total indexing time drops from 5m38s to 24s.
1 parent 5fe2b50 commit 3cc9ea2

1 file changed

Lines changed: 48 additions & 32 deletions

File tree

internal/pipeline/communities.go

Lines changed: 48 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -45,83 +45,99 @@ func (p *Pipeline) passCommunities() {
4545
slog.Info("pass.communities.done", "communities", communityCount, "member_of", memberOfCount)
4646
}
4747

48-
// louvainCommunities implements a simplified Louvain algorithm for community detection.
48+
// louvainCommunities implements the Louvain algorithm for community detection.
49+
// Uses per-community degree accumulators for O(m) per iteration instead of O(N^2).
4950
// Returns a map of community_id → []node_id.
5051
func louvainCommunities(adj map[int64]map[int64]bool, allNodes map[int64]bool) map[int][]int64 {
5152
nodeCommunity := make(map[int64]int, len(allNodes))
52-
communityID := 0
53+
commID := 0
5354
for nodeID := range allNodes {
54-
nodeCommunity[nodeID] = communityID
55-
communityID++
55+
nodeCommunity[nodeID] = commID
56+
commID++
5657
}
5758

59+
// Pre-compute node degrees
60+
nodeDegree := make(map[int64]float64, len(allNodes))
5861
totalEdges := 0
59-
for _, neighbors := range adj {
62+
for nodeID, neighbors := range adj {
63+
nodeDegree[nodeID] = float64(len(neighbors))
6064
totalEdges += len(neighbors)
6165
}
6266
m := float64(totalEdges) / 2.0
6367
if m == 0 {
6468
m = 1
6569
}
6670

71+
// Per-community accumulator: sum of degrees of all members.
72+
// Updated incrementally when nodes move between communities.
73+
commSumTot := make(map[int]float64, len(allNodes))
74+
for nodeID, comm := range nodeCommunity {
75+
commSumTot[comm] = nodeDegree[nodeID]
76+
}
77+
6778
improved := true
6879
for iteration := 0; improved && iteration < 50; iteration++ {
69-
improved = louvainIteration(adj, allNodes, nodeCommunity, m)
80+
improved = louvainIteration(adj, nodeCommunity, nodeDegree, commSumTot, m)
7081
}
7182

7283
return groupAndFilter(nodeCommunity)
7384
}
7485

7586
// louvainIteration runs one pass of greedy modularity optimization.
76-
// Returns true if any node changed community.
77-
func louvainIteration(adj map[int64]map[int64]bool, allNodes map[int64]bool, nodeCommunity map[int64]int, m float64) bool {
87+
// For each node, computes modularity gain for neighboring communities in O(degree)
88+
// using pre-maintained commSumTot accumulators. Returns true if any node moved.
89+
func louvainIteration(
90+
adj map[int64]map[int64]bool,
91+
nodeCommunity map[int64]int,
92+
nodeDegree map[int64]float64,
93+
commSumTot map[int]float64,
94+
m float64,
95+
) bool {
7896
improved := false
79-
for nodeID := range allNodes {
97+
m2 := 2.0 * m * m
98+
99+
for nodeID, neighbors := range adj {
80100
currentComm := nodeCommunity[nodeID]
101+
ki := nodeDegree[nodeID]
81102

82-
neighborComms := make(map[int]bool)
83-
for neighborID := range adj[nodeID] {
84-
neighborComms[nodeCommunity[neighborID]] = true
103+
// Aggregate edges to each neighboring community: O(degree)
104+
edgesToComm := make(map[int]float64, len(neighbors))
105+
for neighborID := range neighbors {
106+
edgesToComm[nodeCommunity[neighborID]]++
85107
}
86108

87-
bestComm, bestGain := currentComm, 0.0
88-
ki := float64(len(adj[nodeID]))
109+
// Remove self from current community for fair comparison
110+
commSumTot[currentComm] -= ki
111+
kiInCurrent := edgesToComm[currentComm]
112+
removeCost := kiInCurrent/m - ki*commSumTot[currentComm]/m2
113+
114+
bestComm := currentComm
115+
bestGain := 0.0
89116

90-
for comm := range neighborComms {
117+
for comm, kiIn := range edgesToComm {
91118
if comm == currentComm {
92119
continue
93120
}
94-
gain := modularityGain(nodeID, comm, adj, nodeCommunity, ki, m)
121+
gain := kiIn/m - ki*commSumTot[comm]/m2 - removeCost
95122
if gain > bestGain {
96123
bestGain = gain
97124
bestComm = comm
98125
}
99126
}
100127

128+
// Restore / update accumulator
101129
if bestComm != currentComm && bestGain > 1e-10 {
102130
nodeCommunity[nodeID] = bestComm
131+
commSumTot[bestComm] += ki
132+
// currentComm already had ki subtracted
103133
improved = true
134+
} else {
135+
commSumTot[currentComm] += ki // restore
104136
}
105137
}
106138
return improved
107139
}
108140

109-
// modularityGain calculates the gain from moving nodeID to targetComm.
110-
func modularityGain(nodeID int64, targetComm int, adj map[int64]map[int64]bool, nodeCommunity map[int64]int, ki, m float64) float64 {
111-
kiIn := 0.0
112-
sumTot := 0.0
113-
for otherID, otherComm := range nodeCommunity {
114-
if otherComm != targetComm {
115-
continue
116-
}
117-
if adj[nodeID][otherID] {
118-
kiIn++
119-
}
120-
sumTot += float64(len(adj[otherID]))
121-
}
122-
return kiIn/m - ki*sumTot/(2*m*m)
123-
}
124-
125141
// groupAndFilter groups nodes by community and filters out singletons.
126142
func groupAndFilter(nodeCommunity map[int64]int) map[int][]int64 {
127143
communities := make(map[int][]int64)

0 commit comments

Comments
 (0)