1616Parse SimpleAPI HTML in Starlark.
1717"""
1818
19- def parse_simpleapi_html (* , url , content ):
19+ def parse_simpleapi_html (* , content , distribution ):
2020 """Get the package URLs for given shas by parsing the Simple API HTML.
2121
2222 Args:
23- url (str): The URL that the HTML content can be downloaded from .
23+ distribution (str): Distribution name for which we are parsing the HTML .
2424 content(str): The Simple API HTML content.
2525
2626 Returns:
@@ -55,16 +55,14 @@ def parse_simpleapi_html(*, url, content):
5555 sha256s_by_version = {}
5656 for line in lines [1 :]:
5757 dist_url , _ , tail = line .partition ("#sha256=" )
58- dist_url = _absolute_url (url , dist_url )
59-
6058 sha256 , _ , tail = tail .partition ("\" " )
6159
6260 # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
6361 yanked = "data-yanked" in line
6462
6563 head , _ , _ = tail .rpartition ("</a>" )
6664 maybe_metadata , _ , filename = head .rpartition (">" )
67- version = _version (filename )
65+ version = pkg_version (filename , distribution )
6866 sha256s_by_version .setdefault (version , []).append (sha256 )
6967
7068 metadata_sha256 = ""
@@ -79,13 +77,14 @@ def parse_simpleapi_html(*, url, content):
7977 break
8078
8179 if filename .endswith (".whl" ):
80+ metadata_url = metadata_url or ""
8281 whls [sha256 ] = struct (
8382 filename = filename ,
8483 version = version ,
8584 url = dist_url ,
8685 sha256 = sha256 ,
8786 metadata_sha256 = metadata_sha256 ,
88- metadata_url = _absolute_url ( url , metadata_url ) if metadata_url else "" ,
87+ metadata_url = metadata_url ,
8988 yanked = yanked ,
9089 )
9190 else :
@@ -110,18 +109,36 @@ _SDIST_EXTS = [
110109 ".zip" ,
111110]
112111
113- def _version (filename ):
112+ def pkg_version (filename , distribution = None ):
113+ """pkg_version extracts the version from the filename.
114+
115+ TODO: move this to a different location
116+
117+ Args:
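118+ filename: The wheel or sdist file name to extract the version from.
119+ distribution: The distribution name; mandatory when parsing sdist file names, where it is stripped from the start of the filename.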
120+
121+ Returns:
122+ version string
123+ """
114124 # See https://packaging.python.org/en/latest/specifications/binary-distribution-format/#binary-distribution-format
115125
116- _ , _ , tail = filename .partition ("-" )
117- version , _ , _ = tail .partition ("-" )
118- if version != tail :
119- # The format is {name}-{version}-{whl_specifiers}.whl
120- return version
126+ if filename .endswith (".whl" ):
127+ _ , _ , tail = filename .partition ("-" )
128+ version , _ , _ = tail .partition ("-" )
129+ if version != tail :
130+ # The format is {name}-{version}-{whl_specifiers}.whl
131+ return version
132+
133+ if not distribution :
134+ fail ("for parsing sdists passing 'distribution' is mandatory" )
121135
122136 # NOTE @aignas 2025-03-29: most of the files are wheels, so this is not the common path
123137
124138 # {name}-{version}.{ext}
139+ # TODO @aignas 2026-01-20: add tests for names that contain dashes. The only way found so far
140+ # to get the version from an sdist filename is to pass the distribution name to this function.
141+ version = filename [len (distribution ) + 1 :]
125142 for ext in _SDIST_EXTS :
126143 version , _ , _ = version .partition (ext ) # build or name
127144
@@ -147,26 +164,35 @@ def _is_downloadable(url):
147164 """
148165 return url .startswith ("http://" ) or url .startswith ("https://" ) or url .startswith ("file://" )
149166
150- def _absolute_url (index_url , candidate ):
151- if candidate == "" :
152- return candidate
167+ def absolute_url (* , index_url , url ):
168+ """Return an absolute URL in case the url is not absolute.
169+
170+ Args:
171+ index_url: {type}`str` The index_url.
172+ url: {type}`str` The url of the artifact.
173+
174+ Returns:
175+ `url` if it is absolute, or absolute URL based on the `index_url`.
176+ """
177+ if url == "" :
178+ return url
153179
154- if _is_downloadable (candidate ):
155- return candidate
180+ if _is_downloadable (url ):
181+ return url
156182
157- if candidate .startswith ("/" ):
183+ if url .startswith ("/" ):
158184 # absolute path
159185 root_directory = _get_root_directory (index_url )
160- return "{}{}" .format (root_directory , candidate )
186+ return "{}{}" .format (root_directory , url )
161187
162- if candidate .startswith (".." ):
188+ if url .startswith (".." ):
163189 # relative path with up references
164- candidate_parts = candidate .split (".." )
190+ candidate_parts = url .split (".." )
165191 last = candidate_parts [- 1 ]
166192 for _ in range (len (candidate_parts ) - 1 ):
167193 index_url , _ , _ = index_url .rstrip ("/" ).rpartition ("/" )
168194
169195 return "{}/{}" .format (index_url , last .strip ("/" ))
170196
171197 # relative path without up-references
172- return "{}/{}" .format (index_url .rstrip ("/" ), candidate )
198+ return "{}/{}" .format (index_url .rstrip ("/" ), url )
0 commit comments