Skip to content

Commit f40009d

Browse files
committed
Tighten canonical-repo match in queryTargetsDependingOnModules
On MODULE.bazel-touching commits the impacted-targets path fans out one `bazel query rdeps(//..., @@<repo>//...)` subprocess per canonical repo whose label contains the changed module's name as a substring. A module named "cpp" pulled in every `abseil-cpp+//...` label; in workspaces with many bzlmod module extensions the fan-out reached thousands of serial subprocesses per `get-impacted-targets` call. Replace the `contains` scan with a two-tier predicate that matches on the parsed canonical repo name: 1. Tier A: consult `bazel mod dump_repo_mapping ""` (already implemented in BazelQueryService.discoverRepoMapping; made public for reuse). If the mapping resolves `module.apparentName` to canonical C, any repo equal to C or starting with "C+"/"C~" belongs to that module. 2. Tier B: prefix-match `module.name + "+"` / `module.name + "~"` for transitive modules that root's repo mapping does not include and when discoverRepoMapping fails. Extension-created forms (`name++ext+repo`, `name~~ext~repo`) are subsumed by these prefixes. Canonical repos to query are collected across all changed modules before running rdeps, which avoids duplicate queries when findChangedModules reports a version bump as both an add and a remove (two Module objects sharing the same canonical). Also fix a pre-existing bug in the same function: the trailing `computeSimpleImpactedTargets(emptyMap(), allTargets)` call returned `allTargets.keys`, silently unioning every target back into the rdeps result. Thread `from` through the call chain so the union uses the real hash diff. Without this, the rdeps filtering added here would be cost-only — the impacted set would still equal the full workspace. Tests: new CalculateImpactedTargetsInteractorModuleQueryTest covers canonical-plus, canonical-tilde, extension-created-via-mapping, extension-created-via-name-fallback, the "cpp must not match abseil-cpp" regression, transitive-only-via-name-fallback, ghost-module (Tier C skip), multiple-disjoint-modules, discoverRepoMapping failure, and the executeWithDistances module-query path. Output assertions guard against regression to the `emptyMap()` union bug.
1 parent e8b2f80 commit f40009d

5 files changed

Lines changed: 533 additions & 61 deletions

File tree

cli/BUILD

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,12 @@ kt_jvm_test(
7272
runtime_deps = [":cli-test-lib"],
7373
)
7474

75+
kt_jvm_test(
76+
name = "CalculateImpactedTargetsInteractorModuleQueryTest",
77+
test_class = "com.bazel_diff.interactor.CalculateImpactedTargetsInteractorModuleQueryTest",
78+
runtime_deps = [":cli-test-lib"],
79+
)
80+
7581
kt_jvm_test(
7682
name = "NormalisingPathConverterTest",
7783
test_class = "com.bazel_diff.cli.converter.NormalisingPathConverterTest",

cli/src/main/kotlin/com/bazel_diff/bazel/BazelQueryService.kt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,9 +349,13 @@ class BazelQueryService(
349349
* `bazel mod dump_repo_mapping ""`. Returns a map of apparent name → canonical name.
350350
* Filters out internal repos (bazel_tools, _builtins, local_config_*) that aren't
351351
* relevant for dependency hashing.
352+
*
353+
* Used by both `queryBzlmodRepos` (for synthetic //external:* target generation) and
354+
* `CalculateImpactedTargetsInteractor.queryTargetsDependingOnModules` (to resolve a
355+
* changed module's canonical repo prefix without substring matching `allTargets.keys`).
352356
*/
353357
@OptIn(ExperimentalCoroutinesApi::class)
354-
private suspend fun discoverRepoMapping(): Map<String, String> {
358+
suspend fun discoverRepoMapping(): Map<String, String> {
355359
val cmd: MutableList<String> =
356360
ArrayList<String>().apply {
357361
add(bazelPath.toString())

cli/src/main/kotlin/com/bazel_diff/interactor/CalculateImpactedTargetsInteractor.kt

Lines changed: 140 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package com.bazel_diff.interactor
22

33
import com.bazel_diff.bazel.BazelQueryService
4+
import com.bazel_diff.bazel.Module
45
import com.bazel_diff.bazel.ModuleGraphParser
56
import com.bazel_diff.hash.TargetHash
67
import com.bazel_diff.log.Logger
@@ -53,7 +54,7 @@ class CalculateImpactedTargetsInteractor : KoinComponent {
5354

5455
val impactedTargets = if (changedModules.isNotEmpty()) {
5556
logger.i { "Module changes detected - querying for targets that depend on changed modules" }
56-
queryTargetsDependingOnModules(changedModules, to)
57+
queryTargetsDependingOnModules(changedModules, from, to)
5758
} else {
5859
computeSimpleImpactedTargets(from, to)
5960
}
@@ -106,7 +107,7 @@ class CalculateImpactedTargetsInteractor : KoinComponent {
106107

107108
val impactedTargets = if (changedModules.isNotEmpty()) {
108109
logger.i { "Module changes detected - querying for targets that depend on changed modules" }
109-
val moduleImpactedTargets = queryTargetsDependingOnModules(changedModules, to)
110+
val moduleImpactedTargets = queryTargetsDependingOnModules(changedModules, from, to)
110111
// Mark module-impacted targets with distance 0, then compute distances from there
111112
val moduleImpactedHashes = from.filterKeys { !moduleImpactedTargets.contains(it) }
112113
computeAllDistances(moduleImpactedHashes, to, depEdges)
@@ -239,56 +240,75 @@ class CalculateImpactedTargetsInteractor : KoinComponent {
239240
}
240241

241242
/**
242-
* Detects module changes by comparing module graphs and returns changed module keys.
243+
* Detects module changes by comparing module graphs and returns the changed Modules.
243244
*
244-
* This method:
245-
* 1. Parses the from and to module graphs
246-
* 2. Identifies which modules changed (added, removed, or version changed)
247-
* 3. Logs the changes for visibility
248-
* 4. Returns the set of changed module keys
245+
* Resolves each changed key against the "to" graph first (the state we will query
246+
* against), falling back to the "from" graph for modules that were removed.
249247
*
250-
* @param fromModuleGraphJson JSON from `bazel mod graph --output=json` for starting revision
251-
* @param toModuleGraphJson JSON from `bazel mod graph --output=json` for final revision
252-
* @return Set of changed module keys, empty if no changes
248+
* @param fromModuleGraphJson JSON from `bazel mod graph --output=json` for the starting revision
249+
* @param toModuleGraphJson JSON from `bazel mod graph --output=json` for the final revision
250+
* @return Set of changed Modules, empty if no changes
253251
*/
254252
private fun detectChangedModules(
255253
fromModuleGraphJson: String?,
256254
toModuleGraphJson: String?
257-
): Set<String> {
258-
// If either module graph is missing, assume no changes
255+
): Set<Module> {
259256
if (fromModuleGraphJson == null || toModuleGraphJson == null) {
260257
return emptySet()
261258
}
262259

263-
// Parse module graphs
264260
val fromGraph = moduleGraphParser.parseModuleGraph(fromModuleGraphJson)
265261
val toGraph = moduleGraphParser.parseModuleGraph(toModuleGraphJson)
262+
val changedKeys = moduleGraphParser.findChangedModules(fromGraph, toGraph)
266263

267-
// Find changed modules
268-
val changedModules = moduleGraphParser.findChangedModules(fromGraph, toGraph)
269-
270-
if (changedModules.isEmpty()) {
264+
if (changedKeys.isEmpty()) {
271265
logger.i { "No module changes detected" }
272-
} else {
273-
logger.i { "Detected ${changedModules.size} module changes: ${changedModules.joinToString(", ")}" }
266+
return emptySet()
274267
}
275268

269+
val changedModules = changedKeys.mapNotNull { key -> toGraph[key] ?: fromGraph[key] }.toSet()
270+
logger.i { "Detected ${changedModules.size} module changes: ${changedModules.joinToString(", ") { it.key }}" }
276271
return changedModules
277272
}
278273

279274
/**
280275
* Queries Bazel to find all workspace targets that depend on any changed module.
281276
*
282-
* Maps every changed module to its matching bzlmod canonical repos, then issues a
283-
* single `rdeps(//..., @@a//... + @@b//... + ...)` query. Bazel executes the union
284-
* in one analysis pass, avoiding per-repo subprocess fan-out.
277+
* For each changed module M we compute the set of canonical repos in `allTargets`
278+
* that belong to M, then issue a single unioned
279+
* `rdeps(//..., @@<repo1>//... + @@<repo2>//... + ...)` query. Bazel executes
280+
* the union in one analysis pass, avoiding per-repo subprocess fan-out.
281+
*
282+
* Repo ownership is decided by two ordered predicates:
283+
*
284+
* 1. Tier A — root repo mapping. If `bazel mod dump_repo_mapping ""` maps
285+
* `M.apparentName` to canonical C, any repo R with R == C or R starts with
286+
* "C+" / "C~" belongs to M. Covers extension-created children (`C++ext+repo`,
287+
* `C~~ext~repo`) whose parent canonical lives in M.
288+
* 2. Tier B — name-prefix fallback. R belongs to M if R starts with
289+
* "{M.name}+" or "{M.name}~". Extension-created forms (`name++ext+repo`,
290+
* `name~~ext~repo`) are already covered by these prefixes. Handles
291+
* transitive modules absent from root's mapping and the case where
292+
* `discoverRepoMapping` failed.
285293
*
286-
* @param changedModuleKeys Set of changed module keys (e.g., "abseil-cpp@20240722.0")
287-
* @param allTargets Map of all targets from the final revision
294+
* Modules that match nothing (Tier C) are logged and skipped — a module with no
295+
* materialised repos in `allTargets.keys` cannot impact any hashed target, and
296+
* `computeSimpleImpactedTargets` still runs below to catch direct source changes.
297+
*
298+
* The key invariant vs. the previous implementation: we match on the parsed
299+
* canonical repo name (`label.substring(2).substringBefore("//")`), not on a
300+
* `contains` substring of the full label, so a module named "cpp" no longer
301+
* matches canonical `abseil-cpp+`.
302+
*
303+
* @param changedModules Modules identified as changed between the two graphs
304+
* @param from Starting-revision target hashes; used only to pick up labels whose
305+
* content changed independently of any module bump
306+
* @param allTargets Final-revision target hashes (the set we can query against)
288307
* @return Set of target labels that are impacted by module changes
289308
*/
290309
private fun queryTargetsDependingOnModules(
291-
changedModuleKeys: Set<String>,
310+
changedModules: Set<Module>,
311+
from: Map<String, TargetHash>,
292312
allTargets: Map<String, TargetHash>
293313
): Set<String> {
294314
val queryService: BazelQueryService? = try {
@@ -302,48 +322,108 @@ class CalculateImpactedTargetsInteractor : KoinComponent {
302322
return allTargets.keys
303323
}
304324

305-
// Map every changed module to its matching bzlmod canonical repos. A single module
306-
// name can match multiple canonical repos (e.g. rules_jvm_external matches
307-
// rules_jvm_external~~maven~maven, rules_jvm_external~~toolchains~...). Log per
308-
// module so an operator can attribute a pathologically large impacted set back to
309-
// a specific module bump.
310-
val moduleRepos = mutableSetOf<String>()
311-
for (moduleKey in changedModuleKeys) {
312-
val moduleName = moduleKey.substringBefore("@")
313-
val matched = allTargets.keys
314-
.filter { it.startsWith("@@") && it.contains(moduleName) }
315-
.map { it.substring(2).substringBefore("//") }
316-
if (matched.isEmpty()) {
317-
logger.w { "No external repository matched module $moduleKey" }
318-
} else {
319-
logger.i { "Module $moduleKey matched ${matched.size} repos: ${matched.joinToString(", ")}" }
320-
moduleRepos.addAll(matched)
325+
val repoMapping: Map<String, String> =
326+
try {
327+
runBlocking { queryService.discoverRepoMapping() }
328+
} catch (e: Exception) {
329+
logger.w { "discoverRepoMapping failed, falling back to module-name matching: ${e.message}" }
330+
emptyMap()
331+
}
332+
// Log size so operators can distinguish "Tier A had nothing to match
333+
// against" from "Tier A matched but the module wasn't in root mapping".
334+
// `discoverRepoMapping` returns an empty map on subprocess exit != 0
335+
// without throwing, so we cannot rely on the catch block above to
336+
// surface that case.
337+
logger.i { "Discovered ${repoMapping.size} root repo mapping entries" }
338+
339+
// Parse `allTargets` into the set of canonical repo names once, instead of
340+
// rescanning every label per changed module.
341+
val canonicalRepos: Set<String> = allTargets.keys.asSequence()
342+
.filter { it.startsWith("@@") }
343+
.map { it.substring(2).substringBefore("//") }
344+
.filter { it.isNotEmpty() }
345+
.toSet()
346+
347+
// Collect the canonical repos to query once — multiple "changed modules"
348+
// often collapse to the same canonical name (e.g. `findChangedModules`
349+
// reports both `foo@1.0` removed and `foo@2.0` added, or two modules have
350+
// overlapping extension-created children). Running `rdeps` per distinct
351+
// canonical name — not per module iteration — avoids the redundant work.
352+
val reposToQuery = mutableSetOf<String>()
353+
for (module in changedModules) {
354+
logger.i { "Resolving repos for changed module: ${module.name} (key: ${module.key})" }
355+
val moduleRepos = reposOwnedBy(module, canonicalRepos, repoMapping)
356+
if (moduleRepos.isEmpty()) {
357+
logger.w { "No external repository found for module ${module.name}" }
358+
continue
321359
}
360+
logger.i { "Found ${moduleRepos.size} repositories for module ${module.name}: ${moduleRepos.joinToString(", ")}" }
361+
reposToQuery.addAll(moduleRepos)
322362
}
323363

324-
if (moduleRepos.isEmpty()) {
325-
logger.i { "No external repositories matched any changed module" }
326-
return computeSimpleImpactedTargets(emptyMap(), allTargets)
327-
}
328-
329-
logger.i { "Querying rdeps for ${moduleRepos.size} repositories across ${changedModuleKeys.size} changed modules" }
330-
331364
val impactedTargets = mutableSetOf<String>()
332-
try {
333-
// Single unioned rdeps query: bazel executes the union in one analysis pass.
334-
val queryExpression = "rdeps(//..., ${moduleRepos.joinToString(" + ") { "@@$it//..." }})"
335-
val rdeps = runBlocking { queryService.query(queryExpression, useCquery = false) }
336-
val rdepLabels = rdeps.map { it.name }.filter { !it.startsWith("@@") }
337-
logger.i { "Found ${rdepLabels.size} workspace targets depending on changed modules" }
338-
impactedTargets.addAll(rdepLabels)
339-
} catch (e: Exception) {
340-
logger.e(e) { "Unioned rdeps query failed - conservatively marking all workspace targets impacted" }
341-
impactedTargets.addAll(allTargets.keys.filter { !it.startsWith("@@") })
365+
if (reposToQuery.isNotEmpty()) {
366+
try {
367+
// Single unioned rdeps query: bazel executes the union in one analysis pass,
368+
// collapsing N × (startup + analysis) into 1 × (startup + N × analysis).
369+
val queryExpression = "rdeps(//..., ${reposToQuery.joinToString(" + ") { "@@$it//..." }})"
370+
logger.i { "Executing unioned rdeps query across ${reposToQuery.size} repositories" }
371+
val rdeps = runBlocking { queryService.query(queryExpression, useCquery = false) }
372+
val rdepLabels = rdeps.map { it.name }.filter { !it.startsWith("@@") }
373+
logger.i { "Found ${rdepLabels.size} workspace targets depending on changed modules" }
374+
impactedTargets.addAll(rdepLabels)
375+
} catch (e: Exception) {
376+
logger.e(e) { "Unioned rdeps query failed - conservatively marking all workspace targets impacted" }
377+
impactedTargets.addAll(allTargets.keys.filter { !it.startsWith("@@") })
378+
}
342379
}
343380

344-
impactedTargets.addAll(computeSimpleImpactedTargets(emptyMap(), allTargets))
381+
// Union with hash-diff results so we still surface labels whose content changed
382+
// independently of any module version bump (e.g. a source file in `//app:app`
383+
// edited in the same commit as a MODULE.bazel update). The earlier
384+
// `computeSimpleImpactedTargets(emptyMap(), allTargets)` form returned every
385+
// key in `allTargets`, which silently defeated the rdeps filtering above.
386+
val directlyChanged = computeSimpleImpactedTargets(from, allTargets)
387+
impactedTargets.addAll(directlyChanged)
345388

346389
logger.i { "Total targets impacted by module changes: ${impactedTargets.size}" }
347390
return impactedTargets
348391
}
392+
393+
/**
394+
* Canonical repos in `canonicalRepos` that belong to `module`, using Tier A
395+
* (root repo mapping) plus Tier B (name-prefix fallback). See
396+
* [queryTargetsDependingOnModules] for the full contract.
397+
*
398+
* @param module Changed module whose owned repos we want to resolve
399+
* @param canonicalRepos Set of canonical repo names parsed from `allTargets.keys`
400+
* @param repoMapping Root module's apparent→canonical repo mapping (may be empty)
401+
* @return Canonical repos belonging to `module`, or empty if none matched (Tier C)
402+
*/
403+
private fun reposOwnedBy(
404+
module: Module,
405+
canonicalRepos: Set<String>,
406+
repoMapping: Map<String, String>
407+
): Set<String> {
408+
val prefixes = mutableListOf<String>()
409+
val exactMatches = mutableSetOf<String>()
410+
411+
val mappedCanonical = repoMapping[module.apparentName]
412+
if (!mappedCanonical.isNullOrEmpty()) {
413+
exactMatches.add(mappedCanonical)
414+
prefixes.add("$mappedCanonical+")
415+
prefixes.add("$mappedCanonical~")
416+
}
417+
418+
// Tier B prefixes always apply — they cover transitive modules that root's
419+
// `dump_repo_mapping` does not include, and also act as the sole source when
420+
// `discoverRepoMapping` returned empty. `++`/`~~` are subsumed by `+`/`~`
421+
// (extension-created repos start with `name+` or `name~` by definition).
422+
prefixes.add("${module.name}+")
423+
prefixes.add("${module.name}~")
424+
425+
return canonicalRepos.filter { repo ->
426+
repo in exactMatches || prefixes.any { repo.startsWith(it) }
427+
}.toSet()
428+
}
349429
}

0 commit comments

Comments
 (0)