Skip to content

Commit 2994efd

Browse files
perf(tags): pk-based _sync_inheritance_for_ids; skip full-row fetch
Replaces `_sync_inheritance_for_qs(queryset, target_tag_names_per_child=callable)` with `_sync_inheritance_for_ids(model_class, child_ids, target_tag_names)`. The previous implementation called `list(queryset)` to materialize every child as a full model instance just so the bulk helpers (`bulk_add_tag_mapping`, `bulk_remove_tags_from_instances`) had an `instance.pk` and `instance.__class__` to work with. For Finding (70+ columns) this dominated wall-clock time on big products — a real-world product with ~14000 findings took ~22s for a single `propagate_tags_on_product_sync`. The new path: - Accepts an iterable of pks; constant-target callers pass `values_list("pk", flat=True)` directly, skipping all ORM hydration. - Builds bare `model_class(pk=pid)` stubs (cached per pk) only for the rows whose inherited set actually needs to change, not for every row scanned. - Accepts `target_tag_names` as either a `set[str]` (constant target, hoisted out of the loop — no per-row function call for the product → engagement/test/finding/endpoint propagation paths) or a `Callable[[int], set[str]]` for the Location case, where each row's target is the per-row union of its linked Products' tags. - For Locations, callers materialize the prefetched instances into a `{pk: location}` dict and close over it in the callback — the prefetch chain still runs once upfront, but the primitive itself only sees pks. - Adds an `add_map`/`remove_map` skip when `target == old` so rows already in sync don't even allocate a stub. Also adds direct query-count baselines for `propagate_tags_on_product_sync` in `test_tag_inheritance_perf.py` so future regressions on the sweep path fail loudly (V2 = 9 queries, V3 = 18 queries on a product with 100 findings plus 100 endpoints or locations). ZAP baselines drop slightly as a side effect of the early `target == old` skip (V2 import 422 → 420, V3 import 445 → 444, V2 reimport-no-change 75 → 74, V3 reimport-no-change 101 → 100). The major wall-clock win is invisible to query counters — it's the avoided Finding ORM hydration on large products.
1 parent aa0c1a5 commit 2994efd

2 files changed

Lines changed: 187 additions & 56 deletions

File tree

dojo/tags/inheritance.py

Lines changed: 137 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
import threading
55
from collections import defaultdict
66
from contextlib import contextmanager, suppress
7+
from typing import TYPE_CHECKING
8+
9+
if TYPE_CHECKING:
10+
from collections.abc import Callable, Iterable
11+
12+
from django.db.models import Model
713

814
from django.conf import settings
915
from django.db.models import Q
@@ -166,37 +172,47 @@ def propagate_tags_on_product_sync(product):
166172
inherited_tag_names = {tag.name for tag in product.tags.all()}
167173

168174
logger.debug("Propagating tags from %s to all engagements", product)
169-
_sync_inheritance_for_qs(
170-
Engagement.objects.filter(product=product),
171-
target_tag_names_per_child=lambda _child: inherited_tag_names,
175+
_sync_inheritance_for_ids(
176+
Engagement,
177+
Engagement.objects.filter(product=product).values_list("pk", flat=True),
178+
target_tag_names=inherited_tag_names,
172179
)
173180
logger.debug("Propagating tags from %s to all tests", product)
174-
_sync_inheritance_for_qs(
175-
Test.objects.filter(engagement__product=product),
176-
target_tag_names_per_child=lambda _child: inherited_tag_names,
181+
_sync_inheritance_for_ids(
182+
Test,
183+
Test.objects.filter(engagement__product=product).values_list("pk", flat=True),
184+
target_tag_names=inherited_tag_names,
177185
)
178186
logger.debug("Propagating tags from %s to all findings", product)
179-
_sync_inheritance_for_qs(
180-
Finding.objects.filter(test__engagement__product=product),
181-
target_tag_names_per_child=lambda _child: inherited_tag_names,
187+
_sync_inheritance_for_ids(
188+
Finding,
189+
Finding.objects.filter(test__engagement__product=product).values_list("pk", flat=True),
190+
target_tag_names=inherited_tag_names,
182191
)
183192
if settings.V3_FEATURE_LOCATIONS:
184193
logger.debug("Propagating tags from %s to all locations", product)
185-
location_qs = Location.objects.filter(
186-
Q(products__product=product)
187-
| Q(findings__finding__test__engagement__product=product),
188-
).distinct().prefetch_related(*_LOCATION_PREFETCH_FOR_INHERITANCE)
189194
# Locations can be linked to multiple products, so the inherited target
190-
# is the union of every related product's tags. Compute per-location.
191-
_sync_inheritance_for_qs(
192-
location_qs,
193-
target_tag_names_per_child=_inherited_tag_names_for_location,
195+
# is the union of every related product's tags. Materialize the full
196+
# Locations (with the related-product prefetch chain) into a pk-keyed
197+
# dict so the per-pk callback can look up each Location's instance.
198+
locations_by_pk = {
199+
loc.pk: loc
200+
for loc in Location.objects.filter(
201+
Q(products__product=product)
202+
| Q(findings__finding__test__engagement__product=product),
203+
).distinct().prefetch_related(*_LOCATION_PREFETCH_FOR_INHERITANCE)
204+
}
205+
_sync_inheritance_for_ids(
206+
Location,
207+
locations_by_pk.keys(),
208+
target_tag_names=lambda pk: _inherited_tag_names_for_location(locations_by_pk[pk]),
194209
)
195210
else:
196211
logger.debug("Propagating tags from %s to all endpoints", product)
197-
_sync_inheritance_for_qs(
198-
Endpoint.objects.filter(product=product),
199-
target_tag_names_per_child=lambda _child: inherited_tag_names,
212+
_sync_inheritance_for_ids(
213+
Endpoint,
214+
Endpoint.objects.filter(product=product).values_list("pk", flat=True),
215+
target_tag_names=inherited_tag_names,
200216
)
201217

202218

@@ -230,9 +246,10 @@ def apply_inherited_tags_for_endpoints(endpoints):
230246
if not (get_system_setting("enable_product_tag_inheritance") or product.enable_product_tag_inheritance):
231247
return
232248
inherited_tag_names = {tag.name for tag in product.tags.all()}
233-
_sync_inheritance_for_qs(
234-
Endpoint.objects.filter(id__in=[e.pk for e in endpoints]),
235-
target_tag_names_per_child=lambda _child: inherited_tag_names,
249+
_sync_inheritance_for_ids(
250+
Endpoint,
251+
[e.pk for e in endpoints],
252+
target_tag_names=inherited_tag_names,
236253
)
237254

238255

@@ -260,19 +277,28 @@ def apply_inherited_tags_for_findings(findings):
260277
inherited_tag_names = {tag.name for tag in product.tags.all()}
261278
finding_ids = [f.pk for f in findings]
262279

263-
_sync_inheritance_for_qs(
264-
Finding.objects.filter(id__in=finding_ids),
265-
target_tag_names_per_child=lambda _child: inherited_tag_names,
280+
_sync_inheritance_for_ids(
281+
Finding,
282+
finding_ids,
283+
target_tag_names=inherited_tag_names,
266284
)
267285
if settings.V3_FEATURE_LOCATIONS:
268-
_sync_inheritance_for_qs(
269-
Location.objects.filter(findings__finding_id__in=finding_ids).distinct().prefetch_related(*_LOCATION_PREFETCH_FOR_INHERITANCE),
270-
target_tag_names_per_child=_inherited_tag_names_for_location,
286+
locations_by_pk = {
287+
loc.pk: loc
288+
for loc in Location.objects.filter(
289+
findings__finding_id__in=finding_ids,
290+
).distinct().prefetch_related(*_LOCATION_PREFETCH_FOR_INHERITANCE)
291+
}
292+
_sync_inheritance_for_ids(
293+
Location,
294+
locations_by_pk.keys(),
295+
target_tag_names=lambda pk: _inherited_tag_names_for_location(locations_by_pk[pk]),
271296
)
272297
else:
273-
_sync_inheritance_for_qs(
274-
Endpoint.objects.filter(status_endpoint__finding_id__in=finding_ids).distinct(),
275-
target_tag_names_per_child=lambda _child: inherited_tag_names,
298+
_sync_inheritance_for_ids(
299+
Endpoint,
300+
Endpoint.objects.filter(status_endpoint__finding_id__in=finding_ids).distinct().values_list("pk", flat=True),
301+
target_tag_names=inherited_tag_names,
276302
)
277303

278304

@@ -288,7 +314,7 @@ def _inherited_tag_names_for_location(location):
288314
Products whose own `enable_product_tag_inheritance` flag is on (or where
289315
the system-wide setting is on).
290316
291-
Used as the `target_tag_names_per_child` callback for `_sync_inheritance_for_qs`
317+
Used as the `target_tag_names` callback for `_sync_inheritance_for_ids`
292318
on Location querysets; it must be called per Location because each Location
293319
has its own set of related Products. Uses `iter_related_products()` so
294320
that an upstream `prefetch_related(...)` reduces per-call cost to 0
@@ -362,17 +388,21 @@ def apply_inherited_tags_for_locations(locations, *, product):
362388
p.id: {t.name for t in p.tags.all()} for p in product_qs
363389
}
364390

365-
def _target_for_location(loc):
391+
def _target_for_location_pk(pk):
366392
names: set[str] = set()
367-
for pid in product_ids_by_location[loc.id]:
393+
for pid in product_ids_by_location[pk]:
368394
# product_ids_by_location may contain products that shouldn't contribute
369395
# (ref lookups weren't flag-filtered); check membership in tags_by_product.
370396
tags = tags_by_product.get(pid)
371397
if tags:
372398
names |= tags
373399
return names
374400

375-
_sync_inheritance_for_qs(locations, target_tag_names_per_child=_target_for_location)
401+
_sync_inheritance_for_ids(
402+
Location,
403+
[loc.id for loc in locations],
404+
target_tag_names=_target_for_location_pk,
405+
)
376406

377407

378408
_LOCATION_PREFETCH_FOR_INHERITANCE = (
@@ -381,20 +411,61 @@ def _target_for_location(loc):
381411
)
382412

383413

384-
def _sync_inheritance_for_qs(queryset, *, target_tag_names_per_child):
414+
def _sync_inheritance_for_ids(
415+
model_class: type[Model],
416+
child_ids: Iterable[int],
417+
*,
418+
target_tag_names: set[str] | Callable[[int], set[str]],
419+
) -> None:
385420
"""
386-
Sync inherited_tags + tags for every child in `queryset` to its target tag set.
387-
388-
target_tag_names_per_child: callable(child) -> set[str].
421+
Sync ``inherited_tags`` and ``tags`` for every child pk to its target tag set.
422+
423+
Parameters
424+
----------
425+
model_class
426+
The child model class (``Finding``, ``Engagement``, ``Endpoint``,
427+
``Location``, …). Used to resolve the ``inherited_tags`` field's
428+
through-table and to build minimal pk-only stubs for the bulk helpers.
429+
child_ids
430+
Iterable of primary keys for ``model_class``. Pass a ``values_list("pk",
431+
flat=True)`` queryset directly to avoid materializing model instances
432+
— fetching full rows for 14000 findings was the bottleneck (~22s per
433+
product-tag toggle) that motivated the pk-based design.
434+
target_tag_names
435+
The desired inherited-tag-name set for each child, in one of two forms:
436+
437+
- ``set[str]`` — **constant target**. All children share the same
438+
inherited set (Product → Engagement/Test/Finding/Endpoint
439+
propagation, where every child has the same one parent product).
440+
The value is hoisted out of the per-pk loop so there is no per-row
441+
function-call overhead.
442+
- ``Callable[[int], set[str]]`` — **per-pk target**. Looks up the
443+
target set for each pk. Used for ``Location``, which can be linked
444+
to multiple Products via ``LocationProductReference`` /
445+
``LocationFindingReference``, so the inherited set is the per-row
446+
union of every linked Product's tags. Callers typically build a
447+
``{pk: location}`` dict with the relevant ``prefetch_related`` chain
448+
and close over it inside the callback.
449+
450+
Implementation notes
451+
--------------------
452+
453+
454+
Avoids materializing children as full model instances. The previous
455+
``list(queryset)`` path fetched all 70+ columns per Finding row, which
456+
dominated wall-clock time on large products. ``bulk_add_tag_mapping`` /
457+
``bulk_remove_tags_from_instances`` only ever read ``instance.pk`` and
458+
``instance.__class__``, so a bare ``model_class(pk=pid)`` stub is enough.
459+
460+
Issues bulk SQL: one through-table read for the current ``inherited_tags``
461+
rows, then one INSERT per added tag-name and one DELETE per removed
462+
tag-name (each batched if needed by the helpers).
389463
390-
Issues bulk SQL: one through-table read for current inherited_tags, then
391-
bulk add/remove on `tags` and `inherited_tags` fields.
392464
"""
393-
children = list(queryset)
394-
if not children:
465+
child_ids = list(child_ids)
466+
if not child_ids:
395467
return
396468

397-
model_class = type(children[0])
398469
inherited_field = model_class._meta.get_field("inherited_tags")
399470
inherited_through = inherited_field.remote_field.through
400471
inherited_tag_model = inherited_field.related_model
@@ -406,7 +477,6 @@ def _sync_inheritance_for_qs(queryset, *, target_tag_names_per_child):
406477
source_field_name = field.name
407478
break
408479

409-
child_ids = [c.pk for c in children]
410480
# One query: pull every (child_id, tag_name) pair from the inherited_tags through table.
411481
existing_pairs = inherited_through.objects.filter(
412482
**{f"{source_field_name}__in": child_ids},
@@ -416,16 +486,31 @@ def _sync_inheritance_for_qs(queryset, *, target_tag_names_per_child):
416486
for child_id, tag_name in existing_pairs:
417487
old_inherited_by_child[child_id].add(tag_name)
418488

419-
# Compute per-child diff and bucket by tag name.
489+
# Per-pk stub instances reused across tag buckets (bulk helpers only read
490+
# .pk and .__class__).
491+
stubs: dict[int, object] = {}
492+
493+
def _stub(pk):
494+
s = stubs.get(pk)
495+
if s is None:
496+
s = model_class(pk=pk)
497+
stubs[pk] = s
498+
return s
499+
500+
constant_target: set[str] | None = None if callable(target_tag_names) else target_tag_names
501+
502+
# Compute per-pk diff and bucket by tag name.
420503
add_map: dict[str, list] = defaultdict(list)
421504
remove_map: dict[str, list] = defaultdict(list)
422-
for child in children:
423-
target = target_tag_names_per_child(child)
424-
old = old_inherited_by_child.get(child.pk, set())
505+
for pk in child_ids:
506+
target = constant_target if constant_target is not None else target_tag_names(pk)
507+
old = old_inherited_by_child.get(pk, set())
508+
if target == old:
509+
continue
425510
for name in target - old:
426-
add_map[name].append(child)
511+
add_map[name].append(_stub(pk))
427512
for name in old - target:
428-
remove_map[name].append(child)
513+
remove_map[name].append(_stub(pk))
429514

430515
# Apply adds. Both `tags` and `inherited_tags` get the same set of new
431516
# inherited names — `_sync_inherited_tags` did the same.

unittests/test_tag_inheritance_perf.py

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,24 @@ def _do_finding_add_user_tag(self, name: str, expected: int) -> None:
214214
self.assertIn("user-only", finding_tag_names)
215215
self.assertIn("inherited", finding_tag_names) # still sticky
216216

217+
def _do_propagate_sync_only(self, name: str, expected: int, *, with_endpoints: bool, with_locations: bool) -> None:
218+
"""
219+
Measure `propagate_tags_on_product_sync(product)` in isolation — no tag change.
220+
221+
Captures the raw sweep cost for a product with a realistic mix of children:
222+
N findings + (V2) N endpoints or (V3) N locations. Should be roughly idempotent
223+
(no add/remove to apply) so the number reflects diff-detection overhead.
224+
"""
225+
product = _make_product_with_findings(name, n_findings=100, tags=["t1", "t2"])
226+
if with_endpoints:
227+
_make_endpoints(product, n=100)
228+
if with_locations:
229+
_make_locations(product, n=100)
230+
with self.assertNumQueries(expected):
231+
propagate_tags_on_product_sync(product)
232+
finding = Finding.objects.filter(test__engagement__product=product).first()
233+
self.assertEqual({"t1", "t2"}, {t.name for t in finding.tags.all()})
234+
217235
def _do_finding_remove_inherited(self, name: str, expected: int) -> None:
218236
product = _make_product_with_findings(name, n_findings=1, tags=["inherited"])
219237
finding = Finding.objects.filter(test__engagement__product=product).first()
@@ -282,6 +300,29 @@ def test_baseline_finding_remove_inherited_tag_sticky_re_adds_v2(self):
282300
def test_baseline_finding_remove_inherited_tag_sticky_re_adds_v3(self):
283301
self._do_finding_remove_inherited("perf-sticky-rm-v3", self.EXPECTED_FINDING_REMOVE_INHERITED_V3)
284302

303+
# ------------------------------------------------------------------
304+
# propagate_tags_on_product_sync direct invocation (no tag change).
305+
# Measures the raw sweep cost over a product's children.
306+
# ------------------------------------------------------------------
307+
308+
@override_settings(V3_FEATURE_LOCATIONS=False)
309+
def test_baseline_propagate_tags_on_product_sync_v2(self):
310+
self._do_propagate_sync_only(
311+
"perf-sync-v2",
312+
self.EXPECTED_PROPAGATE_SYNC_V2,
313+
with_endpoints=True,
314+
with_locations=False,
315+
)
316+
317+
@override_settings(V3_FEATURE_LOCATIONS=True)
318+
def test_baseline_propagate_tags_on_product_sync_v3(self):
319+
self._do_propagate_sync_only(
320+
"perf-sync-v3",
321+
self.EXPECTED_PROPAGATE_SYNC_V3,
322+
with_endpoints=False,
323+
with_locations=True,
324+
)
325+
285326
# ------------------------------------------------------------------
286327
# V2: propagation to Endpoints (skipped under V3_FEATURE_LOCATIONS)
287328
# ------------------------------------------------------------------
@@ -382,6 +423,11 @@ def test_baseline_product_tag_remove_propagates_to_100_locations_v3(self):
382423
EXPECTED_PRODUCT_TAG_ADD_100_LOCATIONS = 125
383424
EXPECTED_PRODUCT_TAG_REMOVE_100_LOCATIONS = 75
384425

426+
# propagate_tags_on_product_sync direct invocation (no tag change).
427+
# Product with 100 findings + 100 endpoints (V2) or + 100 locations (V3).
428+
EXPECTED_PROPAGATE_SYNC_V2 = 9
429+
EXPECTED_PROPAGATE_SYNC_V3 = 18
430+
385431

386432
@override_settings(
387433
CELERY_TASK_ALWAYS_EAGER=True,
@@ -498,7 +544,7 @@ def test_baseline_zap_scan_reimport_no_change_v3(self):
498544
# import path because the previous process-global signal-disconnect was
499545
# narrower in scope (Location.tags.through only). Net-positive trade for
500546
# eliminating the threading bug; full Phase B reductions land in Stage 2.
501-
EXPECTED_ZAP_IMPORT_V2 = 422
502-
EXPECTED_ZAP_IMPORT_V3 = 445
503-
EXPECTED_ZAP_REIMPORT_NO_CHANGE_V2 = 75
504-
EXPECTED_ZAP_REIMPORT_NO_CHANGE_V3 = 101
547+
EXPECTED_ZAP_IMPORT_V2 = 420
548+
EXPECTED_ZAP_IMPORT_V3 = 444
549+
EXPECTED_ZAP_REIMPORT_NO_CHANGE_V2 = 74
550+
EXPECTED_ZAP_REIMPORT_NO_CHANGE_V3 = 100

0 commit comments

Comments
 (0)