Skip to content

Commit 262c83d

Browse files
git-hyagidralley
authored and committed
Optimize protected_versions() to avoid expensive JOINs
The protected_versions() method was building a single queryset using |= (OR) operations, which caused Django to generate a LEFT OUTER JOIN on core_publication across all repository versions (10,000+ rows), resulting in queries taking minutes on large databases.

It has been rewritten to collect protected version PKs from separate simple queries against the Distribution and Publication tables, and then return a simple filter(pk__in=...).

Also defer the content_ids ArrayField in cleanup_old_versions, since it can contain hundreds of thousands of UUIDs per version and is not needed by version.delete().

fixes: #7594
Assisted By: claude-opus-4.6
1 parent ac64ff3 commit 262c83d

2 files changed

Lines changed: 28 additions & 10 deletions

File tree

CHANGES/7594.bugfix

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1 + Optimized cleanup_old_versions() by rewriting protected_versions() to avoid expensive JOINs
2 + on large databases and deferring the content_ids field during version deletion.

pulpcore/app/models/repository.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -323,20 +323,31 @@ def protected_versions(self):
323323
"""
324324
from .publication import Distribution, Publication
325325

326+
protected_pks = set()
327+
326328
# find all repo versions set on a distribution
327-
qs = self.versions.filter(pk__in=Distribution.objects.values_list("repository_version_id"))
329+
protected_pks.update(
330+
Distribution.objects.filter(
331+
repository_version__repository=self,
332+
).values_list("repository_version_id", flat=True)
333+
)
328334

329335
# find all repo versions with publications set on a distribution
330-
qs |= self.versions.filter(
331-
publication__pk__in=Distribution.objects.values_list("publication_id")
336+
dist_pub_ids = Distribution.objects.values_list("publication_id", flat=True)
337+
protected_pks.update(
338+
Publication.objects.filter(
339+
pk__in=dist_pub_ids,
340+
repository_version__repository=self,
341+
).values_list("repository_version_id", flat=True)
332342
)
333343

334344
# Protect repo versions of distributed checkpoint publications.
335345
if Distribution.objects.filter(repository=self.pk, checkpoint=True).exists():
336-
qs |= self.versions.filter(
337-
publication__pk__in=Publication.objects.filter(checkpoint=True).values_list(
338-
"pulp_id"
339-
)
346+
protected_pks.update(
347+
Publication.objects.filter(
348+
checkpoint=True,
349+
repository_version__repository=self,
350+
).values_list("repository_version_id", flat=True)
340351
)
341352

342353
if distro := Distribution.objects.filter(repository=self.pk, checkpoint=False).first():
@@ -352,9 +363,12 @@ def protected_versions(self):
352363
version = self.latest_version()
353364

354365
if version:
355-
qs |= self.versions.filter(pk=version.pk)
366+
protected_pks.add(version.pk)
367+
368+
# Discard None values from distributions with no repository_version set
369+
protected_pks.discard(None)
356370

357-
return qs.distinct()
371+
return self.versions.filter(pk__in=protected_pks)
358372

359373
def pull_through_add_content(self, content_artifact):
360374
"""
@@ -416,7 +430,9 @@ def cleanup_old_versions(self):
416430
if self.retain_repo_versions:
417431
# Consider only completed versions that aren't protected for cleanup
418432
versions = self.versions.complete().exclude(pk__in=self.protected_versions())
419-
for version in versions.order_by("-number")[self.retain_repo_versions :]:
433+
for version in versions.defer("content_ids").order_by("-number")[
434+
self.retain_repo_versions :
435+
]:
420436
_logger.info(
421437
"Deleting repository version {} due to version retention limit.".format(version)
422438
)

0 commit comments

Comments
 (0)