Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions python/private/pypi/parse_simpleapi_html.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@
Parse SimpleAPI HTML in Starlark.
"""

load("//python/private:normalize_name.bzl", "normalize_name")
load(":version_from_filename.bzl", "version_from_filename")

def parse_simpleapi_html(*, content):
def parse_simpleapi_html(*, content, parse_index = False):
"""Get the package URLs for given shas by parsing the Simple API HTML.

Args:
content(str): The Simple API HTML content.
content: {type}`str` The Simple API HTML content.
        parse_index: {type}`bool` Whether to parse the content as the index page of a PyPI-style
            index, e.g. `https://pypi.org/simple/`. The index page only has the URLs for the
            individual packages.

Returns:
A list of structs with:
        If parsing the index page, a map from normalized package name to the URL it can be
        queried from. Otherwise, a list of structs with:
* filename: {type}`str` The filename of the artifact.
* version: {type}`str` The version of the artifact.
* url: {type}`str` The URL to download the artifact.
Expand Down Expand Up @@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content):
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))

packages = {}

# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
cursor = 0
for _ in range(1000000): # Safety break for Starlark
Expand All @@ -73,18 +79,23 @@ def parse_simpleapi_html(*, content):
break

# Extract only the necessary slices
attr_part = content[start_tag + 3:tag_end]
filename = content[tag_end + 1:end_tag].strip()
attr_part = content[start_tag + 3:tag_end]

# Update cursor for next iteration
cursor = end_tag + 4

# 3. Efficient Attribute Parsing
attrs = _parse_attrs(attr_part)
href = attrs.get("href", "")
if not href:
continue

if parse_index:
pkg_name = filename
packages[normalize_name(pkg_name)] = href
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check if we have to normalize all of the package names here. It might be a little bit of throw away work that we are doing here.

continue

# 3. Efficient Attribute Parsing
dist_url, _, sha256 = href.partition("#sha256=")

# Handle Yanked status
Expand Down Expand Up @@ -121,6 +132,9 @@ def parse_simpleapi_html(*, content):
else:
sdists[sha256] = dist

if packages:
return packages

return struct(
sdists = sdists,
whls = whls,
Expand Down
52 changes: 52 additions & 0 deletions python/private/pypi/pypi_cache.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ def _pypi_cache_get(self, key):
if not cached and versions:
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)
else:
# TODO @aignas 2026-03-20: add a test here
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: split into a separate PR, because it is a bug in the current implementation.

self._facts.setdefault(index_url, cached)

return cached

Expand Down Expand Up @@ -122,6 +125,13 @@ def _filter_packages(dists, requested_versions):
if dists == None or not requested_versions:
return dists

if type(dists) == "dict":
return {
pkg: url
for pkg, url in dists.items()
if pkg in requested_versions
}

sha256s_by_version = {}
whls = {}
sdists = {}
Expand Down Expand Up @@ -193,6 +203,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
# cannot trust known facts, different version that we know how to parse
return None

if type(requested_versions) == "dict":
return _filter_packages(
dists = known_facts.get("index_urls", {}).get(index_url, {}),
requested_versions = requested_versions,
)

known_sources = {}

root_url, _, distribution = index_url.rstrip("/").rpartition("/")
Expand Down Expand Up @@ -266,10 +282,46 @@ def _store_facts(facts, fact_version, index_url, value):

facts["fact_version"] = fact_version

if type(value) == "dict":
# facts: {
# "index_urls": {
# "<index_url>": {
# "<pkg_normalized>": "<dist_url>",
# },
# },
# },
for pkg, url in value.items():
facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
return value

root_url, _, distribution = index_url.rstrip("/").rpartition("/")
distribution = distribution.rstrip("/")
root_url = root_url.rstrip("/")

# The schema is
# facts: {
# "dist_hashes": {
# "<index_url>": {
# "<last segment>": {
# "<dist url>": "<sha256>",
# },
# },
# },
# "dist_filenames": {
# "<index_url>": {
# "<last segment>": {
# "<dist url>": "<filename>", # if it is different from the URL
# },
# },
# },
# "dist_yanked": {
# "<index_url>": {
# "<last segment>": {
# "<sha256>": "<reason>", # if the package is yanked
# },
# },
# },
# },
for sha256, d in (value.sdists | value.whls).items():
facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
if not d.url.endswith(d.filename):
Expand Down
Loading
Loading