|
16 | 16 | Parse SimpleAPI HTML in Starlark. |
17 | 17 | """ |
18 | 18 |
|
| 19 | +load("//python/private:normalize_name.bzl", "normalize_name") |
19 | 20 | load(":version_from_filename.bzl", "version_from_filename") |
20 | 21 |
|
21 | | -def parse_simpleapi_html(*, content): |
| 22 | +def parse_simpleapi_html(*, content, parse_index = False): |
22 | 23 | """Get the package URLs for given shas by parsing the Simple API HTML. |
23 | 24 |
|
24 | 25 | Args: |
25 | | - content(str): The Simple API HTML content. |
| 26 | + content: {type}`str` The Simple API HTML content. |
| 27 | + parse_index: {type}`bool` Whether to parse the content as the root page of a PyPI-style index, |
| 28 | + e.g. `https://pypi.org/simple/`. That page only lists the URLs of the individual packages. |
26 | 29 |
|
27 | 30 | Returns: |
28 | | - A list of structs with: |
| 31 | + If `parse_index` is true, a dict mapping each normalized package name to the URL it can be queried from. |
| 32 | + Otherwise, a list of structs with: |
29 | 33 | * filename: {type}`str` The filename of the artifact. |
30 | 34 | * version: {type}`str` The version of the artifact. |
31 | 35 | * url: {type}`str` The URL to download the artifact. |
@@ -59,32 +63,44 @@ def parse_simpleapi_html(*, content): |
59 | 63 | # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api |
60 | 64 | fail("Unsupported API version: {}".format(api_version)) |
61 | 65 |
|
| 66 | + packages = {} |
| 67 | + |
62 | 68 | # 2. Iterate using find() to avoid huge list allocations from .split("<a ") |
63 | 69 | cursor = 0 |
64 | 70 | for _ in range(1000000): # Safety break for Starlark |
65 | 71 | start_tag = content.find("<a ", cursor) |
66 | 72 | if start_tag == -1: |
67 | 73 | break |
68 | 74 |
|
69 | | - # Find the end of the opening tag and the closing </a> |
70 | | - tag_end = content.find(">", start_tag) |
71 | | - end_tag = content.find("</a>", tag_end) |
72 | | - if tag_end == -1 or end_tag == -1: |
| 75 | + # Find the closing </a> tag first, then find the end of the opening |
| 76 | + # <a ...> tag using rfind. This correctly handles attributes that |
| 77 | + # contain > characters, e.g. data-requires-python=">=3.6". |
| 78 | + end_tag = content.find("</a>", start_tag) |
| 79 | + if end_tag == -1: |
73 | 80 | break |
| 81 | + tag_end = content.rfind(">", start_tag, end_tag) |
| 82 | + if tag_end == -1 or tag_end <= start_tag: |
| 83 | + cursor = end_tag + 4 |
| 84 | + continue |
74 | 85 |
|
75 | 86 | # Extract only the necessary slices |
76 | | - attr_part = content[start_tag + 3:tag_end] |
77 | 87 | filename = content[tag_end + 1:end_tag].strip() |
| 88 | + attr_part = content[start_tag + 3:tag_end] |
78 | 89 |
|
79 | 90 | # Update cursor for next iteration |
80 | 91 | cursor = end_tag + 4 |
81 | 92 |
|
82 | | - # 3. Efficient Attribute Parsing |
83 | 93 | attrs = _parse_attrs(attr_part) |
84 | 94 | href = attrs.get("href", "") |
85 | 95 | if not href: |
86 | 96 | continue |
87 | 97 |
|
| 98 | + if parse_index: |
| 99 | + pkg_name = filename |
| 100 | + packages[normalize_name(pkg_name)] = href |
| 101 | + continue |
| 102 | + |
| 103 | + # 3. Split the href into the distribution URL and its sha256 digest |
88 | 104 | dist_url, _, sha256 = href.partition("#sha256=") |
89 | 105 |
|
90 | 106 | # Handle Yanked status |
@@ -121,6 +137,9 @@ def parse_simpleapi_html(*, content): |
121 | 137 | else: |
122 | 138 | sdists[sha256] = dist |
123 | 139 |
|
| 140 | + if parse_index: |
| 141 | + return packages |
| 142 | + |
124 | 143 | return struct( |
125 | 144 | sdists = sdists, |
126 | 145 | whls = whls, |
|
0 commit comments