Skip to content

Commit 8c2cae6

Browse files
authored
refactor(pypi): move absolute_url to whl_library (#3652)
With this PR we move the processing of the `index_url` to the `whl_library` as a preparatory step for easier `facts` implementation. The motivation is many-fold: 1. Do not have too much duplication in the facts file by potentially naturally eliminating the `index_url` prefix from the `whls` if it appears like so on the index contents. 2. Avoid doing `envsubst` too early and have logic that has to deal with it. 3. Make the cache just return fact values from the lock file in the future instead of needing to change to an absolute URL and do envsubst on it. 4. We should have a better performance because we should be doing way fewer calls to make the URL absolute during parsing of the index. 5. With the `index_url` passed to the `whl_library`, we can help out the `purl` construction as what has been discussed in #3531 about wheels from non-public indexes. Summary: - Attempt to put the `index_url` in the fewest structs possible. - Extract the `urllib` utilities file for manipulation of the URLs. - Simplify tests testing the `absolute_url` logic. Work towards #2731
1 parent 7ca9e3f commit 8c2cae6

File tree

14 files changed

+310
-252
lines changed

14 files changed

+310
-252
lines changed

python/private/pypi/BUILD.bazel

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,13 +418,19 @@ bzl_library(
418418
srcs = ["simpleapi_download.bzl"],
419419
deps = [
420420
":parse_simpleapi_html_bzl",
421+
":urllib_bzl",
421422
"//python/private:auth_bzl",
422423
"//python/private:normalize_name_bzl",
423424
"//python/private:text_util_bzl",
424425
"@bazel_features//:features",
425426
],
426427
)
427428

429+
bzl_library(
430+
name = "urllib_bzl",
431+
srcs = ["urllib.bzl"],
432+
)
433+
428434
bzl_library(
429435
name = "version_from_filename_bzl",
430436
srcs = ["version_from_filename.bzl"],
@@ -474,6 +480,7 @@ bzl_library(
474480
":patch_whl_bzl",
475481
":pep508_requirement_bzl",
476482
":pypi_repo_utils_bzl",
483+
":urllib_bzl",
477484
":whl_extract_bzl",
478485
":whl_metadata_bzl",
479486
":whl_target_platforms_bzl",

python/private/pypi/hub_builder.bzl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,7 @@ def _create_whl_repos(
599599
for src in whl.srcs:
600600
repo = _whl_repo(
601601
src = src,
602+
index_url = whl.index_url,
602603
whl_library_args = whl_library_args,
603604
download_only = pip_attr.download_only,
604605
netrc = self._config.netrc or pip_attr.netrc,
@@ -678,6 +679,7 @@ def _whl_repo(
678679
*,
679680
src,
680681
whl_library_args,
682+
index_url,
681683
is_multiple_versions,
682684
download_only,
683685
netrc,
@@ -731,6 +733,8 @@ def _whl_repo(
731733
args["netrc"] = netrc
732734
if auth_patterns:
733735
args["auth_patterns"] = auth_patterns
736+
if index_url:
737+
args["index_url"] = index_url
734738

735739
args["urls"] = [src.url]
736740
args["sha256"] = src.sha256

python/private/pypi/parse_requirements.bzl

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,11 @@ def parse_requirements(
188188
for p in r.target_platforms:
189189
requirement_target_platforms[p] = None
190190

191+
pkg_sources = index_urls.get(name)
191192
package_srcs = _package_srcs(
192193
name = name,
193194
reqs = reqs,
194-
index_urls = index_urls,
195+
pkg_sources = pkg_sources,
195196
platforms = platforms,
196197
extract_url_srcs = extract_url_srcs,
197198
logger = logger,
@@ -216,6 +217,7 @@ def parse_requirements(
216217
name = normalize_name(name),
217218
is_exposed = len(requirement_target_platforms) == len(requirements),
218219
is_multiple_versions = len(reqs.values()) > 1,
220+
index_url = pkg_sources.index_url if pkg_sources else "",
219221
srcs = package_srcs,
220222
)
221223
ret.append(item)
@@ -234,7 +236,7 @@ def _package_srcs(
234236
*,
235237
name,
236238
reqs,
237-
index_urls,
239+
pkg_sources,
238240
platforms,
239241
logger,
240242
extract_url_srcs):
@@ -253,7 +255,7 @@ def _package_srcs(
253255
dist, can_fallback = _add_dists(
254256
requirement = r,
255257
target_platform = platforms.get(target_platform),
256-
index_urls = index_urls.get(name),
258+
index_urls = pkg_sources,
257259
logger = logger,
258260
)
259261
logger.debug(lambda: "The whl dist is: {}".format(dist.filename if dist else dist))

python/private/pypi/parse_simpleapi_html.bzl

Lines changed: 2 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,10 @@ Parse SimpleAPI HTML in Starlark.
1818

1919
load(":version_from_filename.bzl", "version_from_filename")
2020

21-
def parse_simpleapi_html(*, url, content):
21+
def parse_simpleapi_html(*, content):
2222
"""Get the package URLs for given shas by parsing the Simple API HTML.
2323
2424
Args:
25-
url(str): The URL that the HTML content can be downloaded from.
2625
content(str): The Simple API HTML content.
2726
2827
Returns:
@@ -57,7 +56,6 @@ def parse_simpleapi_html(*, url, content):
5756
sha256s_by_version = {}
5857
for line in lines[1:]:
5958
dist_url, _, tail = line.partition("#sha256=")
60-
dist_url = _absolute_url(url, dist_url)
6159

6260
sha256, _, tail = tail.partition("\"")
6361

@@ -87,7 +85,7 @@ def parse_simpleapi_html(*, url, content):
8785
url = dist_url,
8886
sha256 = sha256,
8987
metadata_sha256 = metadata_sha256,
90-
metadata_url = _absolute_url(url, metadata_url) if metadata_url else "",
88+
metadata_url = metadata_url,
9189
yanked = yanked,
9290
)
9391
else:
@@ -106,47 +104,3 @@ def parse_simpleapi_html(*, url, content):
106104
whls = whls,
107105
sha256s_by_version = sha256s_by_version,
108106
)
109-
110-
def _get_root_directory(url):
111-
scheme_end = url.find("://")
112-
if scheme_end == -1:
113-
fail("Invalid URL format")
114-
115-
scheme = url[:scheme_end]
116-
host_end = url.find("/", scheme_end + 3)
117-
if host_end == -1:
118-
host_end = len(url)
119-
host = url[scheme_end + 3:host_end]
120-
121-
return "{}://{}".format(scheme, host)
122-
123-
def _is_downloadable(url):
124-
"""Checks if the URL would be accepted by the Bazel downloader.
125-
126-
This is based on Bazel's HttpUtils::isUrlSupportedByDownloader
127-
"""
128-
return url.startswith("http://") or url.startswith("https://") or url.startswith("file://")
129-
130-
def _absolute_url(index_url, candidate):
131-
if candidate == "":
132-
return candidate
133-
134-
if _is_downloadable(candidate):
135-
return candidate
136-
137-
if candidate.startswith("/"):
138-
# absolute path
139-
root_directory = _get_root_directory(index_url)
140-
return "{}{}".format(root_directory, candidate)
141-
142-
if candidate.startswith(".."):
143-
# relative path with up references
144-
candidate_parts = candidate.split("..")
145-
last = candidate_parts[-1]
146-
for _ in range(len(candidate_parts) - 1):
147-
index_url, _, _ = index_url.rstrip("/").rpartition("/")
148-
149-
return "{}/{}".format(index_url, last.strip("/"))
150-
151-
# relative path without up-references
152-
return "{}/{}".format(index_url.rstrip("/"), candidate)

python/private/pypi/pypi_cache.bzl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,11 @@ In the future the same will be used to:
99
"""
1010

1111
def pypi_cache(store = None):
12-
"""The cache for PyPI index queries."""
12+
"""The cache for PyPI index queries.
13+
14+
Currently the key is of the following structure:
15+
(url, real_url)
16+
"""
1317

1418
# buildifier: disable=uninitialized
1519
self = struct(
@@ -29,6 +33,10 @@ def _pypi_cache_setdefault(self, key, parsed_result):
2933
key: {type}`str` The cache key, can be any string.
3034
parsed_result: {type}`struct` The result of `parse_simpleapi_html` function.
3135
36+
index_url and distribution is used to write to the MODULE.bazel.lock file as facts
37+
real_index_url and distribution is used to write to in-memory cache to ensure that there are
38+
no duplicate calls to the PyPI indexes
39+
3240
Returns:
3341
The `parse_result`.
3442
"""

python/private/pypi/simpleapi_download.bzl

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ load("//python/private:envsubst.bzl", "envsubst")
2222
load("//python/private:normalize_name.bzl", "normalize_name")
2323
load("//python/private:text_util.bzl", "render")
2424
load(":parse_simpleapi_html.bzl", "parse_simpleapi_html")
25+
load(":urllib.bzl", "urllib")
2526

2627
def simpleapi_download(
2728
ctx,
@@ -92,13 +93,14 @@ def simpleapi_download(
9293
sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
9394
for pkg in sources:
9495
pkg_normalized = normalize_name(pkg)
96+
url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format(
97+
index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
98+
distribution = pkg,
99+
))
95100
result = read_simpleapi(
96101
ctx = ctx,
97-
url = "{}/{}/".format(
98-
index_url_overrides.get(pkg_normalized, index_url).rstrip("/"),
99-
pkg,
100-
),
101102
attr = attr,
103+
url = url,
102104
cache = cache,
103105
get_auth = get_auth,
104106
**download_kwargs
@@ -108,9 +110,10 @@ def simpleapi_download(
108110
async_downloads[pkg] = struct(
109111
pkg_normalized = pkg_normalized,
110112
wait = result.wait,
113+
url = url,
111114
)
112115
elif result.success:
113-
contents[pkg_normalized] = result.output
116+
contents[pkg_normalized] = _with_index_url(url, result.output)
114117
found_on_index[pkg] = index_url
115118

116119
if not async_downloads:
@@ -122,7 +125,7 @@ def simpleapi_download(
122125
result = download.wait()
123126

124127
if result.success:
125-
contents[download.pkg_normalized] = result.output
128+
contents[download.pkg_normalized] = _with_index_url(download.url, result.output)
126129
found_on_index[pkg] = index_url
127130

128131
failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index]
@@ -168,14 +171,14 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
168171
169172
Args:
170173
ctx: The module_ctx or repository_ctx.
171-
url: str, the url parameter that can be passed to ctx.download.
174+
url: {type}`str`, the url parameter that can be passed to ctx.download.
172175
attr: The attribute that contains necessary info for downloading. The
173176
following attributes must be present:
174-
* envsubst: The envsubst values for performing substitutions in the URL.
175-
* netrc: The netrc parameter for ctx.download, see http_file for docs.
177+
* envsubst: {type}`dict[str, str]` for performing substitutions in the URL.
178+
* netrc: The netrc parameter for ctx.download, see {obj}`http_file` for docs.
176179
* auth_patterns: The auth_patterns parameter for ctx.download, see
177-
http_file for docs.
178-
cache: A dict for storing the results.
180+
{obj}`http_file` for docs.
181+
cache: {type}`struct` the `pypi_cache` instance.
179182
get_auth: A function to get auth information. Used in tests.
180183
**download_kwargs: Any extra params to ctx.download.
181184
Note that output and auth will be passed for you.
@@ -189,9 +192,9 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
189192
# them to ctx.download if we want to correctly handle the relative URLs.
190193
# TODO: Add a test that env subbed index urls do not leak into the lock file.
191194

192-
real_url = strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))
195+
real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv))
193196

194-
cache_key = real_url
197+
cache_key = (url, real_url)
195198
cached_result = cache.get(cache_key)
196199
if cached_result:
197200
return struct(success = True, output = cached_result)
@@ -225,41 +228,43 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs):
225228
if download_kwargs.get("block") == False:
226229
# Simulate the same API as ctx.download has
227230
return struct(
228-
wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, cache_key),
231+
wait = lambda: _read_index_result(
232+
ctx,
233+
result = download.wait(),
234+
output = output,
235+
cache = cache,
236+
cache_key = cache_key,
237+
),
229238
)
230239

231-
return _read_index_result(ctx, download, output, real_url, cache, cache_key)
232-
233-
def strip_empty_path_segments(url):
234-
"""Removes empty path segments from a URL. Does nothing for urls with no scheme.
235-
236-
Public only for testing.
237-
238-
Args:
239-
url: The url to remove empty path segments from
240-
241-
Returns:
242-
The url with empty path segments removed and any trailing slash preserved.
243-
If the url had no scheme it is returned unchanged.
244-
"""
245-
scheme, _, rest = url.partition("://")
246-
if rest == "":
247-
return url
248-
stripped = "/".join([p for p in rest.split("/") if p])
249-
if url.endswith("/"):
250-
return "{}://{}/".format(scheme, stripped)
251-
else:
252-
return "{}://{}".format(scheme, stripped)
240+
return _read_index_result(
241+
ctx,
242+
result = download,
243+
output = output,
244+
cache = cache,
245+
cache_key = cache_key,
246+
)
253247

254-
def _read_index_result(ctx, result, output, url, cache, cache_key):
248+
def _read_index_result(ctx, *, result, output, cache, cache_key):
255249
if not result.success:
256250
return struct(success = False)
257251

258252
content = ctx.read(output)
259253

260-
output = parse_simpleapi_html(url = url, content = content)
254+
output = parse_simpleapi_html(content = content)
261255
if output:
262256
cache.setdefault(cache_key, output)
263-
return struct(success = True, output = output, cache_key = cache_key)
257+
return struct(success = True, output = output)
264258
else:
265259
return struct(success = False)
260+
261+
def _with_index_url(index_url, values):
262+
if not values:
263+
return values
264+
265+
return struct(
266+
sdists = values.sdists,
267+
whls = values.whls,
268+
sha256s_by_version = values.sha256s_by_version,
269+
index_url = index_url,
270+
)

0 commit comments

Comments
 (0)