From d4cea676e998b291961f0f0b953a7de7b7a35940 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Tue, 10 Mar 2026 20:05:54 +0900 Subject: [PATCH 1/3] wip wip --- python/private/pypi/parse_simpleapi_html.bzl | 24 ++- python/private/pypi/simpleapi_download.bzl | 175 +++++++++++-------- 2 files changed, 123 insertions(+), 76 deletions(-) diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index 563130791e..78669d5ff8 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -16,16 +16,20 @@ Parse SimpleAPI HTML in Starlark. """ +load("//python/private:normalize_name.bzl", "normalize_name") load(":version_from_filename.bzl", "version_from_filename") -def parse_simpleapi_html(*, content): +def parse_simpleapi_html(*, content, parse_index = False): """Get the package URLs for given shas by parsing the Simple API HTML. Args: - content(str): The Simple API HTML content. + content: {type}`str` The Simple API HTML content. + parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index, + e.g. the `https://pypi.org/simple/`. This only has the URLs for the individual package. Returns: - A list of structs with: + If it is the index page, return the map of package to URL it can be queried from. + Otherwise, a list of structs with: * filename: {type}`str` The filename of the artifact. * version: {type}`str` The version of the artifact. * url: {type}`str` The URL to download the artifact. @@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content): # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api fail("Unsupported API version: {}".format(api_version)) + packages = {} + # 2. 
Iterate using find() to avoid huge list allocations from .split("": { + # "": "", + # } + # } + # } + download = read_simpleapi( + ctx = ctx, + attr = attr, + url = urllib.strip_empty_path_segments("{index_url}/".format( + index_url = index_url, + )), + parse_index = True, + versions = None, + block = block, + allow_fail = False, + **kwargs + ) + if hasattr(download, "wait"): + downloads[index_url] = download + else: + results[index_url] = download + + for index_url, download in downloads.items(): + results[index_url] = download.wait() + + found_on_index = {} + for index_url, result in results.items(): + sources = [pkg for pkg in attr.sources if pkg not in found_on_index] + + available_packages = result.output + sources = [pkg for pkg in sources if normalize_name(pkg) in available_packages] + found_on_index.update({ + pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)]) + for pkg in sources + }) + + failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index] if failed_sources: pkg_index_urls = { pkg: index_url_overrides.get( @@ -148,7 +191,7 @@ def simpleapi_download( _fail( """ -Failed to download metadata of the following packages from urls: +Failed to find packages on PyPI of the following packages from urls: {pkg_index_urls} If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call. 
@@ -159,22 +202,9 @@ If you would like to skip downloading metadata for these packages please add 'si ) return None - if warn_overrides: - index_url_overrides = { - pkg: found_on_index[pkg] - for pkg in attr.sources - if found_on_index[pkg] != attr.index_url - } - - if index_url_overrides: - # buildifier: disable=print - print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format( - render.dict(index_url_overrides), - )) - - return contents + return {normalize_name(pkg): url for pkg, url in found_on_index.items()} -def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs): +def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs): """Read SimpleAPI. Args: @@ -189,6 +219,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download cache: {type}`struct` the `pypi_cache` instance. versions: {type}`list[str] The versions that have been requested. get_auth: A function to get auth information. Used in tests. + parse_index: {type}`bool` whether to parse the downloaded content as the index page of the PyPI index (a map of package to URL) instead of a single package page; forwarded to `parse_simpleapi_html`. **download_kwargs: Any extra params to ctx.download. Note that output and auth will be passed for you. 
@@ -242,6 +273,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download output = output, cache = cache, cache_key = cache_key, + parse_index = parse_index, ), ) @@ -251,15 +283,16 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download output = output, cache = cache, cache_key = cache_key, + parse_index = parse_index, ) -def _read_index_result(ctx, *, result, output, cache, cache_key): +def _read_index_result(ctx, *, result, output, cache, cache_key, parse_index): if not result.success: return struct(success = False) content = ctx.read(output) - output = parse_simpleapi_html(content = content) + output = parse_simpleapi_html(content = content, parse_index = parse_index) if output: cache.setdefault(cache_key, output) return struct(success = True, output = output) From 0107a54801377f16ea1caf6e206076d201d13aec Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:09:51 +0900 Subject: [PATCH 2/3] add facts --- python/private/pypi/pypi_cache.bzl | 51 +++++++++++++++ python/private/pypi/simpleapi_download.bzl | 73 ++++++++-------------- 2 files changed, 76 insertions(+), 48 deletions(-) diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl index 28c6cbeafb..747bf6a7a1 100644 --- a/python/private/pypi/pypi_cache.bzl +++ b/python/private/pypi/pypi_cache.bzl @@ -122,6 +122,15 @@ def _filter_packages(dists, requested_versions): if dists == None or not requested_versions: return dists + if type(dists) == "dict": + pkgs = requested_versions + filtered = { + pkg: url + for pkg, url in dists.items() + if pkg in pkgs + } + return filtered + sha256s_by_version = {} whls = {} sdists = {} @@ -193,6 +202,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver # cannot trust known facts, different version that we know how to parse return None + if type(requested_versions) == "dict": + return 
_filter_packages( + dists = known_facts.get("index_urls", {}).get(index_url, {}), + requested_versions = requested_versions, + ) + known_sources = {} root_url, _, distribution = index_url.rstrip("/").rpartition("/") @@ -266,10 +281,46 @@ def _store_facts(facts, fact_version, index_url, value): facts["fact_version"] = fact_version + if type(value) == "dict": + # facts: { + # "index_urls": { + # "": { + # "": "", + # }, + # }, + # }, + for pkg, url in value.items(): + facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url) + return value + root_url, _, distribution = index_url.rstrip("/").rpartition("/") distribution = distribution.rstrip("/") root_url = root_url.rstrip("/") + # The schema is + # facts: { + # "dist_hashes": { + # "": { + # "": { + # "": "", + # }, + # }, + # }, + # "dist_filenames": { + # "": { + # "": { + # "": "", # if it is different from the URL + # }, + # }, + # }, + # "dist_yanked": { + # "": { + # "": { + # "": "", # if the package is yanked + # }, + # }, + # }, + # }, for sha256, d in (value.sdists | value.whls).items(): facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256) if not d.url.endswith(d.filename): diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index f15d835a48..a1dd447e1e 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -75,6 +75,11 @@ def simpleapi_download( for p, i in (attr.index_url_overrides or {}).items() } + sources = { + normalize_name(pkg): versions + for pkg, versions in attr.sources.items() + } + # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes # to replicate how `pip` would handle this case. 
contents = {} @@ -83,8 +88,9 @@ def simpleapi_download( dist_urls = _get_dist_urls( ctx, - index_urls, - index_url_overrides, + index_urls = index_urls, + index_url_overrides = index_url_overrides, + sources = sources, read_simpleapi = read_simpleapi, cache = cache, get_auth = get_auth, @@ -95,11 +101,6 @@ def simpleapi_download( ctx.report_progress("Fetch package lists from PyPI index") - sources = { - normalize_name(pkg): versions - for pkg, versions in attr.sources.items() - } - downloads = {} contents = {} for pkg, url in dist_urls.items(): @@ -125,29 +126,10 @@ def simpleapi_download( return contents -def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr, block, _fail = fail, **kwargs): - if index_url_overrides: - first_index = index_urls[0] - return { - pkg: urllib.strip_empty_path_segments("{index_url}/{distribution}/".format( - index_url = index_url_overrides.get(normalize_name(pkg), first_index).rstrip("/"), - distribution = pkg, - )) - for pkg in attr.sources - } - +def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs): downloads = {} results = {} for index_url in index_urls: - # TODO @aignas 2026-03-20: pull from the cache/facts - # we can store the following schema: - # facts: { - # "index_urls": { - # "": { - # "": "", - # } - # } - # } download = read_simpleapi( ctx = ctx, attr = attr, @@ -155,7 +137,7 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr index_url = index_url, )), parse_index = True, - versions = None, + versions = {pkg: None for pkg in sources}, block = block, allow_fail = False, **kwargs @@ -170,25 +152,25 @@ def _get_dist_urls(ctx, index_urls, index_url_overrides, read_simpleapi, *, attr found_on_index = {} for index_url, result in results.items(): - sources = [pkg for pkg in attr.sources if pkg not in found_on_index] - - available_packages = result.output - sources = [pkg for pkg in sources if 
normalize_name(pkg) in available_packages] + # Filter out the things that we have already found found_on_index.update({ - pkg: urllib.absolute_url(index_url, available_packages[normalize_name(pkg)]) + pkg: urllib.absolute_url(index_url, result.output[pkg]) for pkg in sources }) + sources = [ + pkg + for pkg in sources + if pkg not in found_on_index + ] - failed_sources = [pkg for pkg in attr.sources if pkg not in found_on_index] - if failed_sources: + if sources: pkg_index_urls = { - pkg: index_url_overrides.get( - normalize_name(pkg), - index_urls, - ) - for pkg in failed_sources + pkg: index_url_overrides.get(pkg, index_urls) + for pkg in sources } + # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can + # print a warning, or we can fallback to PyPI. For now let's fail _fail( """ Failed to find packages on PyPI of the following packages from urls: @@ -196,13 +178,13 @@ Failed to find packages on PyPI of the following packages from urls: If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call. """.format( - pkg_index_urls = render.dict(pkg_index_urls), - failed_sources = render.list(failed_sources), + pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))), + failed_sources = render.list(sources), ), ) return None - return {normalize_name(pkg): url for pkg, url in found_on_index.items()} + return found_on_index def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs): """Read SimpleAPI. @@ -227,11 +209,6 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_inde A similar object to what `download` would return except that in result.out will be the parsed simple api contents. 
""" - # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for - # the whl location and we cannot handle multiple URLs at once by passing - # them to ctx.download if we want to correctly handle the relative URLs. - # TODO: Add a test that env subbed index urls do not leak into the lock file. - real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv)) cache_key = (url, real_url, versions) From 0492f31707349406229ffc12109dd6eab82bcea5 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Fri, 20 Mar 2026 13:48:46 +0900 Subject: [PATCH 3/3] finish POC --- python/private/pypi/pypi_cache.bzl | 11 ++++++----- python/private/pypi/simpleapi_download.bzl | 4 +--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/private/pypi/pypi_cache.bzl b/python/private/pypi/pypi_cache.bzl index 747bf6a7a1..bc92de0bde 100644 --- a/python/private/pypi/pypi_cache.bzl +++ b/python/private/pypi/pypi_cache.bzl @@ -89,6 +89,9 @@ def _pypi_cache_get(self, key): if not cached and versions: # Could not get from in-memory, read from lockfile facts cached = self._facts.get(index_url, versions) + else: + # TODO @aignas 2026-03-20: add a test here + self._facts.setdefault(index_url, cached) return cached @@ -123,13 +126,11 @@ def _filter_packages(dists, requested_versions): return dists if type(dists) == "dict": - pkgs = requested_versions - filtered = { + return { pkg: url for pkg, url in dists.items() - if pkg in pkgs + if pkg in requested_versions } - return filtered sha256s_by_version = {} whls = {} @@ -290,7 +291,7 @@ def _store_facts(facts, fact_version, index_url, value): # }, # }, for pkg, url in value.items(): - facts.setdefault("index_urls", {}).setdefault(index_url, {}).setdefault(pkg, url) + facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url return value root_url, _, distribution = index_url.rstrip("/").rpartition("/") diff --git 
a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index a1dd447e1e..b8caacef82 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -82,13 +82,11 @@ def simpleapi_download( # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes # to replicate how `pip` would handle this case. - contents = {} - index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi dist_urls = _get_dist_urls( ctx, - index_urls = index_urls, + index_urls = [attr.index_url] + attr.extra_index_urls, index_url_overrides = index_url_overrides, sources = sources, read_simpleapi = read_simpleapi,