Skip to content

Commit 03a9189

Browse files
committed
refactor: pass metadata bytes through batch, not temp path
Address review feedback round 2: - Extract metadata_content bytes via extract_wheel_metadata while temp file exists, delete temp file immediately, pass bytes through the metadata batch instead of a stale file path - Add metadata_content_to_artifact in utils.py — creates Artifact from raw bytes, shared by artifact_to_metadata_artifact and the repair batch - Revert artifact_to_metadata_artifact to original signature (remove unused temp_path param) - Remove redundant test_metadata_repair_batch_boundary JIRA: PULP-1573
1 parent 01e7240 commit 03a9189

3 files changed

Lines changed: 38 additions & 83 deletions

File tree

pulp_python/app/tasks/repair.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@
99
from django.db.models.query import QuerySet
1010
from pulp_python.app.models import PythonPackageContent, PythonRepository
1111
from pulp_python.app.utils import (
12-
artifact_to_metadata_artifact,
1312
artifact_to_python_content_data,
1413
copy_artifact_to_temp_file,
14+
extract_wheel_metadata,
1515
fetch_json_release_metadata,
16+
metadata_content_to_artifact,
1617
parse_metadata,
1718
)
1819
from pulpcore.plugin.models import Artifact, ContentArtifact, ProgressReport
@@ -120,24 +121,27 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
120121
.first()
121122
.artifact
122123
)
123-
# Copy artifact to temp file once, reuse for both content data and metadata
124+
# Copy artifact to temp file once, extract both content data and metadata
124125
temp_path = copy_artifact_to_temp_file(main_artifact, package.filename)
125126
try:
126127
new_data = artifact_to_python_content_data(
127128
package.filename, main_artifact, domain, temp_path=temp_path
128129
)
129-
total_metadata_repaired += update_metadata_artifact_if_needed(
130-
package,
131-
new_data.get("metadata_sha256"),
132-
main_artifact,
133-
metadata_batch,
134-
pkgs_metadata_not_repaired,
135-
)
136-
total_repaired += update_package_if_needed(
137-
package, new_data, batch, set_of_update_fields
130+
metadata_content = (
131+
extract_wheel_metadata(temp_path) if package.filename.endswith(".whl") else None
138132
)
139133
finally:
140134
os.unlink(temp_path)
135+
total_metadata_repaired += update_metadata_artifact_if_needed(
136+
package,
137+
new_data.get("metadata_sha256"),
138+
metadata_content,
139+
metadata_batch,
140+
pkgs_metadata_not_repaired,
141+
)
142+
total_repaired += update_package_if_needed(
143+
package, new_data, batch, set_of_update_fields
144+
)
141145

142146
# For on-demand content, we expect that:
143147
# 1. PythonPackageContent always has correct name and version
@@ -245,7 +249,7 @@ def update_package_if_needed(
245249
def update_metadata_artifact_if_needed(
246250
package: PythonPackageContent,
247251
new_metadata_sha256: str | None,
248-
main_artifact: Artifact,
252+
metadata_content: bytes | None,
249253
metadata_batch: list[tuple],
250254
pkgs_metadata_not_repaired: set[str],
251255
) -> int:
@@ -257,7 +261,7 @@ def update_metadata_artifact_if_needed(
257261
Args:
258262
package: Package to check for metadata changes.
259263
new_metadata_sha256: The correct metadata_sha256 extracted from the main artifact, or None.
260-
main_artifact: The main package artifact used to generate metadata.
264+
metadata_content: Raw metadata bytes extracted from the wheel, or None.
261265
metadata_batch: List of tuples for batch processing (updated in-place).
262266
pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place).
263267
@@ -274,13 +278,13 @@ def update_metadata_artifact_if_needed(
274278

275279
# Create missing
276280
if not cas:
277-
metadata_batch.append((package, main_artifact))
281+
metadata_batch.append((package, metadata_content))
278282
# Fix existing
279283
elif new_metadata_sha256 != original_metadata_sha256:
280284
ca = cas.first()
281285
metadata_artifact = ca.artifact
282286
if metadata_artifact is None or (metadata_artifact.sha256 != new_metadata_sha256):
283-
metadata_batch.append((package, main_artifact))
287+
metadata_batch.append((package, metadata_content))
284288

285289
if len(metadata_batch) == BULK_SIZE:
286290
not_repaired = _process_metadata_batch(metadata_batch)
@@ -297,16 +301,16 @@ def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
297301
and their corresponding ContentArtifacts.
298302
299303
Args:
300-
metadata_batch: List of (package, main_artifact) tuples.
304+
metadata_batch: List of (package, metadata_content) tuples.
301305
302306
Returns:
303307
Set of package PKs for which metadata artifacts could not be created.
304308
"""
305309
not_repaired = set()
306310
content_artifacts = []
307311

308-
for package, main_artifact in metadata_batch:
309-
metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
312+
for package, metadata_content in metadata_batch:
313+
metadata_artifact = metadata_content_to_artifact(metadata_content)
310314
if metadata_artifact:
311315
ca = ContentArtifact(
312316
artifact=metadata_artifact,

pulp_python/app/utils.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -281,31 +281,34 @@ def artifact_to_python_content_data(filename, artifact, domain=None, temp_path=N
281281

282282

283283
def artifact_to_metadata_artifact(
284-
filename: str, artifact: Artifact, tmp_dir: str = ".", temp_path: str | None = None
284+
filename: str, artifact: Artifact, tmp_dir: str = "."
285285
) -> Artifact | None:
286286
"""
287287
Creates artifact for metadata from the provided wheel artifact.
288-
289-
If temp_path is provided, uses it instead of copying the artifact to a new temp file.
290288
"""
291289
if not filename.endswith(".whl"):
292290
return None
293291

294-
if temp_path:
295-
temp_wheel_path = temp_path
296-
else:
297-
with tempfile.NamedTemporaryFile(
298-
"wb", dir=tmp_dir, suffix=filename, delete=False
299-
) as temp_file:
300-
temp_wheel_path = temp_file.name
301-
artifact.file.seek(0)
302-
shutil.copyfileobj(artifact.file, temp_file)
303-
temp_file.flush()
292+
with tempfile.NamedTemporaryFile("wb", dir=tmp_dir, suffix=filename, delete=False) as temp_file:
293+
temp_wheel_path = temp_file.name
294+
artifact.file.seek(0)
295+
shutil.copyfileobj(artifact.file, temp_file)
296+
temp_file.flush()
304297

305298
metadata_content = extract_wheel_metadata(temp_wheel_path)
306299
if not metadata_content:
307300
return None
308301

302+
return metadata_content_to_artifact(metadata_content, tmp_dir)
303+
304+
305+
def metadata_content_to_artifact(metadata_content: bytes, tmp_dir: str = ".") -> Artifact | None:
306+
"""
307+
Creates an Artifact from raw metadata content bytes.
308+
"""
309+
if not metadata_content:
310+
return None
311+
309312
with tempfile.NamedTemporaryFile(
310313
"wb", dir=tmp_dir, suffix=".metadata", delete=False
311314
) as temp_md:

pulp_python/tests/functional/api/test_repair.py

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -331,55 +331,3 @@ def test_metadata_artifact_repair_endpoint(
331331
# Check deduplication
332332
assert len(main_artifact_hrefs) == 4
333333
assert len(metadata_artifact_hrefs) == 3
334-
335-
336-
def test_metadata_repair_batch_boundary(
337-
create_content_direct,
338-
delete_orphans_pre,
339-
download_python_file,
340-
monitor_task,
341-
move_to_repository,
342-
python_bindings,
343-
python_repo_factory,
344-
):
345-
"""
346-
Test that repair_metadata correctly handles packages across batch boundaries.
347-
348-
Verifies that the batch flush (at BULK_SIZE) does not lose or corrupt metadata
349-
for packages processed before and after the flush point.
350-
"""
351-
python_repo = python_repo_factory()
352-
353-
# Create multiple wheel packages with wrong metadata
354-
wheel_files = [
355-
("scipy-1.1.0-cp27-none-win32.whl", "ME"),
356-
("scipy-1.1.0-cp27-none-win_amd64.whl", "ME"),
357-
("scipy-1.1.0-cp27-cp27m-manylinux1_x86_64.whl", "ME"),
358-
]
359-
content_hrefs = []
360-
for filename, author in wheel_files:
361-
url = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename)
362-
file_path = download_python_file(filename, url)
363-
data = {
364-
"name": "scipy",
365-
"version": "1.1.0",
366-
"filename": filename,
367-
"author": author,
368-
"packagetype": "bdist",
369-
}
370-
content = create_content_direct(file_path, data)
371-
assert content.author == "ME"
372-
content_hrefs.append(content.pulp_href)
373-
374-
move_to_repository(python_repo.pulp_href, content_hrefs)
375-
376-
# Repair
377-
response = python_bindings.RepositoriesPythonApi.repair_metadata(python_repo.pulp_href)
378-
monitor_task(response.task)
379-
380-
# All packages should have repaired metadata (author != "ME")
381-
for href in content_hrefs:
382-
content = python_bindings.ContentPackagesApi.read(href)
383-
assert content.author != "ME", f"Package {content.filename} was not repaired"
384-
assert content.packagetype == "bdist_wheel"
385-
assert content.metadata_sha256 is not None

0 commit comments

Comments
 (0)