Skip to content

Commit 6c3f591

Browse files
authored
Merge pull request #1196 from pulp/patchback/backports/3.27/7aaf53fc2f2a46f21be5363e677f503d6ea86fc9/pr-1189
[PR #1189/7aaf53fc backport][3.27] Fix repair_metadata OOM on large repositories
2 parents 6bdd9d9 + 8fab49e commit 6c3f591

File tree

3 files changed

+61
-18
lines changed

3 files changed

+61
-18
lines changed

CHANGES/1188.bugfix

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1 + Fixed "Worker has gone missing" errors during repair_metadata on large repositories (1000+ packages) by reducing peak memory consumption.

pulp_python/app/tasks/repair.py

Lines changed: 25 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import os
23
from collections import defaultdict
34
from gettext import gettext as _
45
from itertools import groupby
@@ -8,18 +9,20 @@
89
from django.db.models.query import QuerySet
910
from pulp_python.app.models import PythonPackageContent, PythonRepository
1011
from pulp_python.app.utils import (
11-
artifact_to_metadata_artifact,
1212
artifact_to_python_content_data,
13+
copy_artifact_to_temp_file,
14+
extract_wheel_metadata,
1315
fetch_json_release_metadata,
16+
metadata_content_to_artifact,
1417
parse_metadata,
1518
)
16-
from pulpcore.plugin.models import Artifact, ContentArtifact, ProgressReport
19+
from pulpcore.plugin.models import ContentArtifact, ProgressReport
1720
from pulpcore.plugin.util import get_domain
1821

1922
log = logging.getLogger(__name__)
2023

2124

22-
BULK_SIZE = 1000
25+
BULK_SIZE = 250
2326

2427

2528
def repair(repository_pk: UUID) -> None:
@@ -118,11 +121,21 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
118121
.first()
119122
.artifact
120123
)
121-
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
124+
# Copy artifact to temp file once, extract both content data and metadata
125+
temp_path = copy_artifact_to_temp_file(main_artifact, package.filename)
126+
try:
127+
new_data = artifact_to_python_content_data(
128+
package.filename, main_artifact, domain, temp_path=temp_path
129+
)
130+
metadata_content = (
131+
extract_wheel_metadata(temp_path) if package.filename.endswith(".whl") else None
132+
)
133+
finally:
134+
os.unlink(temp_path)
122135
total_metadata_repaired += update_metadata_artifact_if_needed(
123136
package,
124137
new_data.get("metadata_sha256"),
125-
main_artifact,
138+
metadata_content,
126139
metadata_batch,
127140
pkgs_metadata_not_repaired,
128141
)
@@ -236,7 +249,7 @@ def update_package_if_needed(
236249
def update_metadata_artifact_if_needed(
237250
package: PythonPackageContent,
238251
new_metadata_sha256: str | None,
239-
main_artifact: Artifact,
252+
metadata_content: bytes | None,
240253
metadata_batch: list[tuple],
241254
pkgs_metadata_not_repaired: set[str],
242255
) -> int:
@@ -248,7 +261,7 @@ def update_metadata_artifact_if_needed(
248261
Args:
249262
package: Package to check for metadata changes.
250263
new_metadata_sha256: The correct metadata_sha256 extracted from the main artifact, or None.
251-
main_artifact: The main package artifact used to generate metadata.
264+
metadata_content: Raw metadata bytes extracted from the wheel, or None.
252265
metadata_batch: List of tuples for batch processing (updated in-place).
253266
pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place).
254267
@@ -265,13 +278,13 @@ def update_metadata_artifact_if_needed(
265278

266279
# Create missing
267280
if not cas:
268-
metadata_batch.append((package, main_artifact))
281+
metadata_batch.append((package, metadata_content))
269282
# Fix existing
270283
elif new_metadata_sha256 != original_metadata_sha256:
271284
ca = cas.first()
272285
metadata_artifact = ca.artifact
273286
if metadata_artifact is None or (metadata_artifact.sha256 != new_metadata_sha256):
274-
metadata_batch.append((package, main_artifact))
287+
metadata_batch.append((package, metadata_content))
275288

276289
if len(metadata_batch) == BULK_SIZE:
277290
not_repaired = _process_metadata_batch(metadata_batch)
@@ -288,16 +301,16 @@ def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
288301
and their corresponding ContentArtifacts.
289302
290303
Args:
291-
metadata_batch: List of (package, main_artifact) tuples.
304+
metadata_batch: List of (package, metadata_content) tuples.
292305
293306
Returns:
294307
Set of package PKs for which metadata artifacts could not be created.
295308
"""
296309
not_repaired = set()
297310
content_artifacts = []
298311

299-
for package, main_artifact in metadata_batch:
300-
metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
312+
for package, metadata_content in metadata_batch:
313+
metadata_artifact = metadata_content_to_artifact(metadata_content)
301314
if metadata_artifact:
302315
ca = ContentArtifact(
303316
artifact=metadata_artifact,

pulp_python/app/utils.py

Lines changed: 35 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -240,18 +240,37 @@ def compute_metadata_sha256(filename: str) -> str | None:
240240
return hashlib.sha256(metadata_content).hexdigest() if metadata_content else None
241241

242242

243-
def artifact_to_python_content_data(filename, artifact, domain=None):
243+
def copy_artifact_to_temp_file(artifact, filename, tmp_dir="."):
244+
"""
245+
Copy an artifact's file to a temporary file on disk.
246+
247+
Returns the path to the temp file. The caller is responsible for cleanup.
248+
"""
249+
temp_file = tempfile.NamedTemporaryFile("wb", dir=tmp_dir, suffix=filename, delete=False)
250+
artifact.file.seek(0)
251+
shutil.copyfileobj(artifact.file, temp_file)
252+
temp_file.flush()
253+
temp_file.close()
254+
return temp_file.name
255+
256+
257+
def artifact_to_python_content_data(filename, artifact, domain=None, temp_path=None):
244258
"""
245259
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
260+
261+
If temp_path is provided, uses it instead of copying the artifact to a new temp file.
246262
"""
247263
# Copy file to a temp directory under the user provided filename, we do this
248264
# because pkginfo validates that the filename has a valid extension before
249265
# reading it
250-
with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as temp_file:
251-
artifact.file.seek(0)
252-
shutil.copyfileobj(artifact.file, temp_file)
253-
temp_file.flush()
254-
metadata = get_project_metadata_from_file(temp_file.name)
266+
if temp_path:
267+
metadata = get_project_metadata_from_file(temp_path)
268+
else:
269+
with tempfile.NamedTemporaryFile("wb", dir=".", suffix=filename) as temp_file:
270+
artifact.file.seek(0)
271+
shutil.copyfileobj(artifact.file, temp_file)
272+
temp_file.flush()
273+
metadata = get_project_metadata_from_file(temp_file.name)
255274
data = parse_project_metadata(vars(metadata))
256275
data["sha256"] = artifact.sha256
257276
data["size"] = artifact.size
@@ -280,6 +299,16 @@ def artifact_to_metadata_artifact(
280299
if not metadata_content:
281300
return None
282301

302+
return metadata_content_to_artifact(metadata_content, tmp_dir)
303+
304+
305+
def metadata_content_to_artifact(metadata_content: bytes, tmp_dir: str = ".") -> Artifact | None:
306+
"""
307+
Creates an Artifact from raw metadata content bytes.
308+
"""
309+
if not metadata_content:
310+
return None
311+
283312
with tempfile.NamedTemporaryFile(
284313
"wb", dir=tmp_dir, suffix=".metadata", delete=False
285314
) as temp_md:

0 commit comments

Comments (0)