Skip to content

Commit 8206cf5

Browse files
committed
feat(pypi): store PyPI results as facts v2
Whilst this is working locally the laundry list is: - [ ] Add tests for the cache. - [ ] Decide if filtering the versions in the cache is OK. It feels like we should not need to do it. - [ ] Adapt the tests and test the version filtering - [ ] Split out the parsing improvements for `parse_simpleapi_html` into a separate PR. I discovered a bug there where the `data-yanked=""` is interpreted as `yanked` package and I want to avoid misunderstandings where the lines would appear in the lock file and then people would be scratching their heads. Superseeds bazel-contrib#3559 Fixes bazel-contrib#2731
1 parent 8c2cae6 commit 8206cf5

6 files changed

Lines changed: 244 additions & 30 deletions

File tree

python/private/pypi/extension.bzl

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
225225
# dict[str repo, HubBuilder]
226226
# See `hub_builder.bzl%hub_builder()` for `HubBuilder`
227227
pip_hub_map = {}
228-
simpleapi_cache = pypi_cache()
228+
simpleapi_cache = pypi_cache(module_ctx = module_ctx)
229229

230230
for mod in module_ctx.modules:
231231
for pip_attr in mod.tags.parse:
@@ -293,6 +293,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
293293
config = config,
294294
exposed_packages = exposed_packages,
295295
extra_aliases = extra_aliases,
296+
facts = simpleapi_cache.get_facts(),
296297
hub_group_map = hub_group_map,
297298
hub_whl_map = hub_whl_map,
298299
whl_libraries = whl_libraries,
@@ -372,7 +373,11 @@ def _pip_impl(module_ctx):
372373
module_ctx: module contents
373374
"""
374375

375-
mods = parse_modules(module_ctx, enable_pipstar = rp_config.enable_pipstar, enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later)
376+
mods = parse_modules(
377+
module_ctx,
378+
enable_pipstar = rp_config.enable_pipstar,
379+
enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later,
380+
)
376381

377382
# Build all of the wheel modifications if the tag class is called.
378383
_whl_mods_impl(mods.whl_mods)
@@ -396,6 +401,7 @@ def _pip_impl(module_ctx):
396401

397402
return module_ctx.extension_metadata(
398403
reproducible = True,
404+
facts = mods.facts or {},
399405
)
400406

401407
_default_attrs = {

python/private/pypi/hub_builder.bzl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,11 +395,11 @@ def _set_get_index_urls(self, pip_attr):
395395
index_url = pip_attr.experimental_index_url,
396396
extra_index_urls = pip_attr.experimental_extra_index_urls or [],
397397
index_url_overrides = pip_attr.experimental_index_url_overrides or {},
398-
sources = [
399-
d
400-
for d in distributions
398+
sources = {
399+
d: versions
400+
for d, versions in distributions.items()
401401
if _use_downloader(self, python_version, d)
402-
],
402+
},
403403
envsubst = pip_attr.envsubst,
404404
# Auth related info
405405
netrc = pip_attr.netrc,

python/private/pypi/parse_requirements.bzl

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def parse_requirements(
5353
os, arch combinations.
5454
extra_pip_args (string list): Extra pip arguments to perform extra validations and to
5555
be joined with args found in files.
56-
get_index_urls: Callable[[ctx, list[str]], dict], a callable to get all
56+
get_index_urls: Callable[[ctx, dict[str, list[str]]], dict], a callable to get all
5757
of the distribution URLs from a PyPI index. Accepts ctx and
5858
distribution names to query.
5959
evaluate_markers: A function to use to evaluate the requirements.
@@ -170,15 +170,17 @@ def parse_requirements(
170170

171171
index_urls = {}
172172
if get_index_urls:
173+
distributions = {}
174+
for reqs in requirements_by_platform.values():
175+
for req in reqs.values():
176+
if req.srcs.url:
177+
continue
178+
179+
distributions.setdefault(req.distribution, []).append(req.srcs.version)
180+
173181
index_urls = get_index_urls(
174182
ctx,
175-
# Use list({}) as a way to have a set
176-
list({
177-
req.distribution: None
178-
for reqs in requirements_by_platform.values()
179-
for req in reqs.values()
180-
if not req.srcs.url
181-
}),
183+
distributions,
182184
)
183185

184186
ret = []

python/private/pypi/parse_simpleapi_html.bzl

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,37 @@ def parse_simpleapi_html(*, content):
5353

5454
# Each line follows the following pattern
5555
# <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
56+
#
57+
# Sometimes the lines may be split, so we should seek until `<br />`
5658
sha256s_by_version = {}
5759
for line in lines[1:]:
58-
dist_url, _, tail = line.partition("#sha256=")
59-
60-
sha256, _, tail = tail.partition("\"")
60+
attrs, _, _ = line.rpartition("</a><br")
61+
attrs, _, _ = attrs.rpartition(">") # Strip the name as well
62+
dist_url, sep, attrs = attrs.partition("#sha256=")
63+
if sep:
64+
# TODO @aignas 2026-03-08: tests
65+
sha256, _, attrs = attrs.partition("\"")
66+
else:
67+
# TODO @aignas 2026-03-08: tests
68+
sha256 = ""
6169

6270
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
63-
yanked = "data-yanked" in line
6471

65-
head, _, _ = tail.rpartition("</a>")
72+
# Yank reason
73+
# TODO @aignas 2026-03-08: add unit tests
74+
_, _, maybe_yanked = attrs.partition(" data-yanked=")
75+
yanked, _, maybe_yanked = maybe_yanked.strip('"').partition('"')
76+
77+
# if yanked endswith '\', then we know this is an escaped yank reason containing '\"' in it
78+
for _ in range(1000):
79+
if not yanked.endswith("\\"):
80+
break
81+
82+
c, _, maybe_yanked = maybe_yanked.strip('"').partition('"')
83+
yanked = "{}\"{}".format(yanked, c)
84+
yanked = yanked.strip(" \"")
85+
86+
head, _, _ = line.rpartition("</a>")
6687
maybe_metadata, _, filename = head.rpartition(">")
6788
version = version_from_filename(filename)
6889
sha256s_by_version.setdefault(version, []).append(sha256)
@@ -73,8 +94,8 @@ def parse_simpleapi_html(*, content):
7394
metadata_marker = metadata_marker + "=\"sha256="
7495
if metadata_marker in maybe_metadata:
7596
# Implement https://peps.python.org/pep-0714/
76-
_, _, tail = maybe_metadata.partition(metadata_marker)
77-
metadata_sha256, _, _ = tail.partition("\"")
97+
_, _, attrs = maybe_metadata.partition(metadata_marker)
98+
metadata_sha256, _, _ = attrs.partition("\"")
7899
metadata_url = dist_url + ".metadata"
79100
break
80101

python/private/pypi/pypi_cache.bzl

Lines changed: 186 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,34 @@ In the future the same will be used to:
88
- Store PyPI index query results as facts in the MODULE.bazel.lock file
99
"""
1010

11-
def pypi_cache(store = None):
11+
load(":version_from_filename.bzl", "version_from_filename")
12+
13+
_FACT_VERSION = "v1"
14+
15+
def pypi_cache(module_ctx = None, store = None):
1216
"""The cache for PyPI index queries.
1317
1418
Currently the key is of the following structure:
15-
(url, real_url)
19+
(url, real_url, versions)
20+
21+
Args:
22+
module_ctx: The module context
23+
store: The in-memory store, should implement dict interface for get and setdefault
24+
25+
Returns:
26+
A cache struct
1627
"""
28+
mcache = memory_cache(store)
29+
facts = {}
30+
fcache = facts_cache(getattr(module_ctx, "facts", None), facts)
1731

1832
# buildifier: disable=uninitialized
1933
self = struct(
20-
_store = store or {},
34+
_mcache = mcache,
35+
_facts = fcache,
2136
setdefault = lambda key, parsed_result: _pypi_cache_setdefault(self, key, parsed_result),
2237
get = lambda key: _pypi_cache_get(self, key),
38+
get_facts = lambda: _get_facts(facts),
2339
)
2440

2541
# buildifier: enable=uninitialized
@@ -40,7 +56,13 @@ def _pypi_cache_setdefault(self, key, parsed_result):
4056
Returns:
4157
The `parse_result`.
4258
"""
43-
return self._store.setdefault(key, parsed_result)
59+
index_url, real_url, versions = key
60+
self._mcache.setdefault(real_url, None, parsed_result)
61+
if not versions or not self._facts:
62+
return parsed_result
63+
64+
filtered = _filter_packages(parsed_result, versions)
65+
return self._facts.setdefault(index_url, filtered)
4466

4567
def _pypi_cache_get(self, key):
4668
"""Return the parsed result from the cache.
@@ -52,4 +74,163 @@ def _pypi_cache_get(self, key):
5274
Returns:
5375
The {type}`struct` or `None` based on if the result is in the cache or not.
5476
"""
55-
return self._store.get(key)
77+
index_url, real_url, versions = key
78+
cached = self._mcache.get(real_url, versions)
79+
if not self._facts:
80+
return cached
81+
82+
if not cached and versions:
83+
# Could not get from in-memory, read from lockfile facts
84+
cached = self._facts.get(index_url, versions)
85+
86+
return cached
87+
88+
def _get_facts(facts):
89+
return facts
90+
91+
def memory_cache(cache = None):
92+
"""SimpleAPI cache for making fewer calls.
93+
94+
Args:
95+
cache: the storage to store things in memory.
96+
97+
Returns:
98+
struct with 2 methods, `get` and `setdefault`.
99+
"""
100+
if cache == None:
101+
cache = {}
102+
103+
return struct(
104+
get = lambda real_url, versions: _filter_packages(cache.get(real_url), versions),
105+
setdefault = lambda real_url, versions, value: _filter_packages(cache.get(real_url), versions),
106+
)
107+
108+
def _filter_packages(dists, requested_versions):
109+
if dists == None:
110+
return None
111+
112+
if not requested_versions:
113+
return dists
114+
115+
sha256s_by_version = {}
116+
whls = {}
117+
sdists = {}
118+
119+
for sha256, d in dists.sdists.items():
120+
if d.version not in requested_versions:
121+
continue
122+
123+
sdists[sha256] = d
124+
sha256s_by_version.setdefault(d.version, []).append(sha256)
125+
126+
for sha256, d in dists.whls.items():
127+
if d.version not in requested_versions:
128+
continue
129+
130+
whls[sha256] = d
131+
sha256s_by_version.setdefault(d.version, []).append(sha256)
132+
133+
if not whls and not sdists:
134+
# TODO @aignas 2026-03-08: add logging
135+
#print("WARN: no dists matched for versions {}".format(requested_versions))
136+
return None
137+
138+
return struct(
139+
whls = whls,
140+
sdists = sdists,
141+
sha256s_by_version = sha256s_by_version,
142+
)
143+
144+
def facts_cache(known_facts, facts, facts_version = _FACT_VERSION):
145+
if known_facts == None:
146+
return None
147+
148+
return struct(
149+
get = lambda index_url, versions: _get_from_facts(
150+
facts,
151+
known_facts,
152+
index_url,
153+
versions,
154+
facts_version,
155+
),
156+
setdefault = lambda url, value: _store_facts(facts, facts_version, url, value),
157+
known_facts = known_facts,
158+
facts = facts,
159+
)
160+
161+
def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
162+
if known_facts.get("fact_version") != facts_version:
163+
# cannot trust known facts, different version that we know how to parse
164+
return None
165+
166+
known_sources = {}
167+
168+
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
169+
distribution = distribution.rstrip("/")
170+
root_url = root_url.rstrip("/")
171+
172+
for url, sha256 in known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {}).items():
173+
filename = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {}).get(sha256)
174+
if not filename:
175+
_, _, filename = url.rpartition("/")
176+
177+
version = version_from_filename(filename)
178+
if version not in requested_versions:
179+
# TODO @aignas 2026-01-21: do the check by requested shas at some point
180+
# We don't have sufficient info in the lock file, need to call the API
181+
#
182+
continue
183+
184+
if filename.endswith(".whl"):
185+
dists = known_sources.setdefault("whls", {})
186+
else:
187+
dists = known_sources.setdefault("sdists", {})
188+
189+
known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256)
190+
191+
dists.setdefault(sha256, struct(
192+
sha256 = sha256,
193+
filename = filename,
194+
version = version,
195+
url = url,
196+
yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {}).get(sha256, ""),
197+
))
198+
199+
if not known_sources:
200+
# We found nothing in facts
201+
return None
202+
203+
output = struct(
204+
whls = known_sources.get("whls", {}),
205+
sdists = known_sources.get("sdists", {}),
206+
sha256s_by_version = known_sources.get("sha256s_by_version", {}),
207+
)
208+
209+
# Persist these facts for the next run because we have used them.
210+
return _store_facts(facts, facts_version, index_url, output)
211+
212+
def _store_facts(facts, fact_version, index_url, value):
213+
"""Store values as facts in the lock file.
214+
215+
The main idea is to ensure that the lock file is small and it is only storing what
216+
we would need to fetch from the internet. Any derivative information we can
217+
from this that can be achieved using pure Starlark functions should be done in
218+
Starlark.
219+
"""
220+
if not value:
221+
return value
222+
223+
facts["fact_version"] = fact_version
224+
225+
root_url, _, distribution = index_url.rstrip("/").rpartition("/")
226+
distribution = distribution.rstrip("/")
227+
root_url = root_url.rstrip("/")
228+
229+
for sha256, d in (value.sdists | value.whls).items():
230+
facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
231+
if not d.url.endswith(d.filename):
232+
facts.setdefault("dist_filenames", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, d.filename)
233+
if d.yanked:
234+
facts.setdefault("dist_yanked", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(sha256, d.yanked)
235+
236+
return value

0 commit comments

Comments
 (0)