1616Parse SimpleAPI HTML in Starlark.
1717"""
1818
19- def parse_simpleapi_html (* , url , content ):
19+ def parse_simpleapi_html (* , url , content , distribution = None , return_absolute = True ):
2020 """Get the package URLs for given shas by parsing the Simple API HTML.
2121
2222 Args:
2323 url(str): The URL that the HTML content can be downloaded from.
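24+ distribution(str): The distribution name, used to extract the version from sdist filenames. If not set, it is derived from the last path segment of `url`.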
2425 content(str): The Simple API HTML content.
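26+ return_absolute: {type}`bool` Whether to resolve the distribution and metadata URLs into absolute URLs based on `url`. If False, the URLs are not resolved against `url`.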
2527
2628 Returns:
2729 A list of structs with:
@@ -33,6 +35,9 @@ def parse_simpleapi_html(*, url, content):
3335 present, then the 'metadata_url' is also present. Defaults to "".
3436 * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
3537 """
38+ if not distribution :
39+ _ , _ , distribution = url .strip ("/" ).rpartition ("/" )
40+
3641 sdists = {}
3742 whls = {}
3843 lines = content .split ("<a href=\" " )
@@ -55,7 +60,8 @@ def parse_simpleapi_html(*, url, content):
5560 sha256s_by_version = {}
5661 for line in lines [1 :]:
5762 dist_url , _ , tail = line .partition ("#sha256=" )
58- dist_url = _absolute_url (url , dist_url )
63+ if return_absolute :
64+ dist_url = absolute_url (index_url = url , url = dist_url )
5965
6066 sha256 , _ , tail = tail .partition ("\" " )
6167
@@ -64,7 +70,7 @@ def parse_simpleapi_html(*, url, content):
6470
6571 head , _ , _ = tail .rpartition ("</a>" )
6672 maybe_metadata , _ , filename = head .rpartition (">" )
67- version = _version (filename )
73+ version = pkg_version (filename , distribution )
6874 sha256s_by_version .setdefault (version , []).append (sha256 )
6975
7076 metadata_sha256 = ""
@@ -79,13 +85,17 @@ def parse_simpleapi_html(*, url, content):
7985 break
8086
8187 if filename .endswith (".whl" ):
88+ metadata_url = metadata_url or ""
89+ if return_absolute and metadata_url :
90+ metadata_url = absolute_url (index_url = url , url = metadata_url )
91+
8292 whls [sha256 ] = struct (
8393 filename = filename ,
8494 version = version ,
8595 url = dist_url ,
8696 sha256 = sha256 ,
8797 metadata_sha256 = metadata_sha256 ,
88- metadata_url = _absolute_url ( url , metadata_url ) if metadata_url else "" ,
98+ metadata_url = metadata_url ,
8999 yanked = yanked ,
90100 )
91101 else :
@@ -110,18 +120,36 @@ _SDIST_EXTS = [
110120 ".zip" ,
111121]
112122
113- def _version (filename ):
123+ def pkg_version (filename , distribution = None ):
124+ """pkg_version extracts the version from the filename.
125+
126+ TODO: move this to a different location
127+
128+ Args:
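129+ filename: The distribution file name, either a wheel (`{name}-{version}-{whl_specifiers}.whl`) or an sdist (`{name}-{version}.{ext}`).
130+ distribution: The distribution name. Required when the version cannot be read from a wheel filename (i.e. for sdists), where it is used to strip the leading `{name}-` prefix from the filename.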
131+
132+ Returns:
133+ version string
134+ """
114135 # See https://packaging.python.org/en/latest/specifications/binary-distribution-format/#binary-distribution-format
115136
116- _ , _ , tail = filename .partition ("-" )
117- version , _ , _ = tail .partition ("-" )
118- if version != tail :
119- # The format is {name}-{version}-{whl_specifiers}.whl
120- return version
137+ if filename .endswith (".whl" ):
138+ _ , _ , tail = filename .partition ("-" )
139+ version , _ , _ = tail .partition ("-" )
140+ if version != tail :
141+ # The format is {name}-{version}-{whl_specifiers}.whl
142+ return version
143+
144+ if not distribution :
145+ fail ("for parsing sdists passing 'distribution' is mandatory" )
121146
122147 # NOTE @aignas 2025-03-29: most of the files are wheels, so this is not the common path
123148
124149 # {name}-{version}.{ext}
150+ # TODO @aignas 2026-01-20: test for handling dashes in names, can't think of any other way to
151+ # get the version from the filename but to pass in the distribution name to this function.
152+ version = filename [len (distribution ) + 1 :]
125153 for ext in _SDIST_EXTS :
126154 version , _ , _ = version .partition (ext ) # build or name
127155
@@ -147,26 +175,35 @@ def _is_downloadable(url):
147175 """
148176 return url .startswith ("http://" ) or url .startswith ("https://" ) or url .startswith ("file://" )
149177
def absolute_url(*, index_url, url):
    """Return an absolute URL in case the url is not absolute.

    Args:
        index_url: {type}`str` The URL of the index page that `url` was found on.
        url: {type}`str` The url of the artifact, as written in the index.

    Returns:
        `url` unchanged if it is empty or already downloadable, otherwise an
        absolute URL resolved against `index_url`.
    """
    if url == "":
        return url

    if _is_downloadable(url):
        return url

    if url.startswith("/"):
        # Absolute path: anchor it at the root directory of the index.
        return "{}{}".format(_get_root_directory(index_url), url)

    # Count only the *leading* `..` path segments. Splitting the whole string
    # on ".." would also match dots inside file or directory names (e.g.
    # `../pkg/foo..bar.tar.gz`), popping too many levels off the index URL and
    # truncating the remaining path.
    segments = url.split("/")
    up_levels = 0
    for segment in segments:
        if segment != "..":
            break
        up_levels += 1

    if up_levels > 0:
        # Relative path with up-references: drop one trailing path component
        # of the index URL per leading `..` segment.
        for _ in range(up_levels):
            index_url = index_url.rstrip("/").rpartition("/")[0]

        remainder = "/".join(segments[up_levels:])
        return "{}/{}".format(index_url, remainder.strip("/"))

    # Relative path without up-references.
    return "{}/{}".format(index_url.rstrip("/"), url)
0 commit comments