Skip to content

Commit 708f5a6

Browse files
deckoclaude
andcommitted
fix: eliminate double S3 read and close file handles in repair_metadata
During repair_metadata, each wheel was read from S3 twice: once for content metadata extraction and again for metadata artifact creation. Now the temp file from the first read is reused for the second, halving S3 reads and temp file I/O for wheels. Also explicitly closes artifact.file after each iteration to release S3 buffer memory immediately instead of waiting for GC. JIRA: PULP-1573 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6ee9358 commit 708f5a6

1 file changed

Lines changed: 31 additions & 6 deletions

File tree

pulp_python/app/tasks/repair.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import os
23
from collections import defaultdict
34
from gettext import gettext as _
45
from itertools import groupby
@@ -119,18 +120,35 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
119120
.first()
120121
.artifact
121122
)
122-
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
123+
# For wheels, keep the temp file so we can reuse it for metadata artifact
124+
is_wheel = package.filename.endswith(".whl")
125+
if is_wheel:
126+
new_data, temp_path = artifact_to_python_content_data(
127+
package.filename, main_artifact, domain, keep_temp_file=True
128+
)
129+
else:
130+
new_data = artifact_to_python_content_data(
131+
package.filename, main_artifact, domain
132+
)
133+
temp_path = None
134+
123135
total_metadata_repaired += update_metadata_artifact_if_needed(
124136
package,
125137
new_data.get("metadata_sha256"),
126138
main_artifact,
127139
metadata_batch,
128140
pkgs_metadata_not_repaired,
141+
temp_path=temp_path,
129142
)
130143
total_repaired += update_package_if_needed(
131144
package, new_data, batch, set_of_update_fields
132145
)
133146

147+
# Clean up temp file and close artifact file handle
148+
if temp_path and os.path.exists(temp_path):
149+
os.unlink(temp_path)
150+
main_artifact.file.close()
151+
134152
# For on-demand content, we expect that:
135153
# 1. PythonPackageContent always has correct name and version
136154
# 2. RemoteArtifact always has correct sha256
@@ -240,6 +258,7 @@ def update_metadata_artifact_if_needed(
240258
main_artifact: Artifact,
241259
metadata_batch: list[tuple],
242260
pkgs_metadata_not_repaired: set[str],
261+
temp_path: str | None = None,
243262
) -> int:
244263
"""
245264
Repairs metadata artifacts for wheel packages by creating missing metadata artifacts
@@ -252,6 +271,7 @@ def update_metadata_artifact_if_needed(
252271
main_artifact: The main package artifact used to generate metadata.
253272
metadata_batch: List of tuples for batch processing (updated in-place).
254273
pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place).
274+
temp_path: Path to already-extracted temp wheel file, avoids re-reading from S3.
255275
256276
Returns:
257277
Number of repaired metadata artifacts (only when batch is flushed at BULK_SIZE).
@@ -266,13 +286,13 @@ def update_metadata_artifact_if_needed(
266286

267287
# Create missing
268288
if not cas:
269-
metadata_batch.append((package, main_artifact))
289+
metadata_batch.append((package, main_artifact, temp_path))
270290
# Fix existing
271291
elif new_metadata_sha256 != original_metadata_sha256:
272292
ca = cas.first()
273293
metadata_artifact = ca.artifact
274294
if metadata_artifact is None or (metadata_artifact.sha256 != new_metadata_sha256):
275-
metadata_batch.append((package, main_artifact))
295+
metadata_batch.append((package, main_artifact, temp_path))
276296

277297
if len(metadata_batch) == BULK_SIZE:
278298
not_repaired = _process_metadata_batch(metadata_batch)
@@ -289,16 +309,21 @@ def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
289309
and their corresponding ContentArtifacts.
290310
291311
Args:
292-
metadata_batch: List of (package, main_artifact) tuples.
312+
metadata_batch: List of (package, main_artifact, temp_path) tuples.
293313
294314
Returns:
295315
Set of package PKs for which metadata artifacts could not be created.
296316
"""
297317
not_repaired = set()
298318
content_artifacts = []
299319

300-
for package, main_artifact in metadata_batch:
301-
metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
320+
for package, main_artifact, temp_path in metadata_batch:
321+
if temp_path and os.path.exists(temp_path):
322+
metadata_artifact = artifact_to_metadata_artifact_from_path(
323+
package.filename, temp_path
324+
)
325+
else:
326+
metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
302327
if metadata_artifact:
303328
ca = ContentArtifact(
304329
artifact=metadata_artifact,

0 commit comments

Comments
 (0)