diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index 563130791e..78669d5ff8 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -16,16 +16,20 @@ Parse SimpleAPI HTML in Starlark. """ +load("//python/private:normalize_name.bzl", "normalize_name") load(":version_from_filename.bzl", "version_from_filename") -def parse_simpleapi_html(*, content): +def parse_simpleapi_html(*, content, parse_index = False): """Get the package URLs for given shas by parsing the Simple API HTML. Args: - content(str): The Simple API HTML content. + content: {type}`str` The Simple API HTML content. + parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index, + e.g. the `https://pypi.org/simple/`. This only has the URLs for the individual package. Returns: - A list of structs with: + If it is the index page, return the map of package to URL it can be queried from. + Otherwise, a list of structs with: * filename: {type}`str` The filename of the artifact. * version: {type}`str` The version of the artifact. * url: {type}`str` The URL to download the artifact. @@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content): # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api fail("Unsupported API version: {}".format(api_version)) + packages = {} + # 2. Iterate using find() to avoid huge list allocations from .split("": { + # "": "", + # }, + # }, + # }, + for pkg, url in value.items(): + facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url + return value + root_url, _, distribution = index_url.rstrip("/").rpartition("/") distribution = distribution.rstrip("/") root_url = root_url.rstrip("/") + # The schema is + # facts: { + # "dist_hashes": { + # "": { + # "": { + # "": "", + # }, + # }, + # }, + # "dist_filenames": { + # "": { + # "": { + # "": "", # if it is different from the URL + # }, + # }, + # }, + # "dist_yanked": { + # "": { + # "": { + # "": "", # if the package is yanked + # }, + # }, + # }, + # }, for sha256, d in (value.sdists | value.whls).items(): facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256) if not d.url.endswith(d.filename): diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 20d79ba9b4..b8caacef82 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -16,7 +16,6 @@ A file that houses private functions used in the `bzlmod` extension with the same name. """ -load("@bazel_features//:features.bzl", "bazel_features") load("//python/private:auth.bzl", _get_auth = "get_auth") load("//python/private:envsubst.bzl", "envsubst") load("//python/private:normalize_name.bzl", "normalize_name") @@ -35,6 +34,11 @@ def simpleapi_download( _fail = fail): """Download Simple API HTML. + First it queries all of the indexes for available packages and then it downloads the contents of + the per-package URLs and sha256 values. This is to enable us to use bazel_downloader with + `requirements.txt` files. As a side effect we also are able to "cross-compile" by fetching the + right wheel for the right target platform through the information that we retrieve here. + Args: ctx: The module_ctx or repository_ctx. attr: Contains the parameters for the download. They are grouped into a @@ -71,110 +75,116 @@ def simpleapi_download( for p, i in (attr.index_url_overrides or {}).items() } + sources = { + normalize_name(pkg): versions + for pkg, versions in attr.sources.items() + } + # NOTE @aignas 2024-03-31: we are not merging results from multiple indexes # to replicate how `pip` would handle this case. - contents = {} - index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi - download_kwargs = {} - if bazel_features.external_deps.download_has_block_param: - download_kwargs["block"] = not parallel_download - - if len(index_urls) == 1 or index_url_overrides: - download_kwargs["allow_fail"] = False - else: - download_kwargs["allow_fail"] = True - - input_sources = attr.sources + dist_urls = _get_dist_urls( + ctx, + index_urls = [attr.index_url] + attr.extra_index_urls, + index_url_overrides = index_url_overrides, + sources = sources, + read_simpleapi = read_simpleapi, + cache = cache, + get_auth = get_auth, + attr = attr, + block = not parallel_download, + _fail = _fail, + ) - found_on_index = {} - warn_overrides = False ctx.report_progress("Fetch package lists from PyPI index") - for i, index_url in enumerate(index_urls): - if i != 0: - # Warn the user about a potential fix for the overrides - warn_overrides = True - - async_downloads = {} - sources = {pkg: versions for pkg, versions in input_sources.items() if pkg not in found_on_index} - for pkg, versions in sources.items(): - pkg_normalized = normalize_name(pkg) - url = urllib.strip_empty_path_segments("{index_url}/{distribution}/".format( - index_url = index_url_overrides.get(pkg_normalized, index_url).rstrip("/"), - distribution = pkg, - )) - result = read_simpleapi( - ctx = ctx, - attr = attr, - versions = versions, - url = url, - cache = cache, - get_auth = get_auth, - **download_kwargs - ) - if hasattr(result, "wait"): - # We will process it in a separate loop: - async_downloads[pkg] = struct( - pkg_normalized = pkg_normalized, - wait = result.wait, - url = url, - ) - elif result.success: - contents[pkg_normalized] = _with_index_url(url, result.output) - found_on_index[pkg] = index_url - - if not async_downloads: - continue + downloads = {} + contents = {} + for pkg, url in dist_urls.items(): + result = read_simpleapi( + ctx = ctx, + attr = attr, + url = url, + cache = cache, + versions = sources[pkg], + get_auth = get_auth, + block = not parallel_download, + ) + if hasattr(result, "wait"): + # We will process it in a separate loop: + downloads[pkg] = result + else: + contents[pkg] = _with_index_url(url, result.output) + + for pkg, d in downloads.items(): # If we use `block` == False, then we need to have a second loop that is # collecting all of the results as they were being downloaded in parallel. - for pkg, download in async_downloads.items(): - result = download.wait() + contents[pkg] = _with_index_url(dist_urls[pkg], d.wait().output) + + return contents + +def _get_dist_urls(ctx, *, index_urls, index_url_overrides, sources, read_simpleapi, attr, block, _fail = fail, **kwargs): + downloads = {} + results = {} + for index_url in index_urls: + download = read_simpleapi( + ctx = ctx, + attr = attr, + url = urllib.strip_empty_path_segments("{index_url}/".format( + index_url = index_url, + )), + parse_index = True, + versions = {pkg: None for pkg in sources}, + block = block, + allow_fail = False, + **kwargs + ) + if hasattr(download, "wait"): + downloads[index_url] = download + else: + results[index_url] = download - if result.success: - contents[download.pkg_normalized] = _with_index_url(download.url, result.output) - found_on_index[pkg] = index_url + for index_url, download in downloads.items(): + results[index_url] = download.wait() - failed_sources = [pkg for pkg in input_sources if pkg not in found_on_index] - if failed_sources: + found_on_index = {} + for index_url, result in results.items(): + # Filter out the things that we have already found + found_on_index.update({ + pkg: urllib.absolute_url(index_url, result.output[pkg]) + for pkg in sources + }) + sources = [ + pkg + for pkg in sources + if pkg not in found_on_index + ] + + if sources: pkg_index_urls = { - pkg: index_url_overrides.get( - normalize_name(pkg), - index_urls, - ) - for pkg in failed_sources + pkg: index_url_overrides.get(pkg, index_urls) + for pkg in sources } + # TODO @aignas 2026-03-20: we haven't found these pkgs on the index, so we can + # print a warning, or we can fallback to PyPI. For now let's fail _fail( """ -Failed to download metadata of the following packages from urls: +Failed to find packages on PyPI of the following packages from urls: {pkg_index_urls} If you would like to skip downloading metadata for these packages please add 'simpleapi_skip={failed_sources}' to your 'pip.parse' call. """.format( - pkg_index_urls = render.dict(pkg_index_urls), - failed_sources = render.list(failed_sources), + pkg_index_urls = render.dict(dict(sorted(pkg_index_urls.items()))), + failed_sources = render.list(sources), ), ) return None - if warn_overrides: - index_url_overrides = { - pkg: found_on_index[pkg] - for pkg in attr.sources - if found_on_index[pkg] != attr.index_url - } - - if index_url_overrides: - # buildifier: disable=print - print("You can use the following `index_url_overrides` to avoid the 404 warnings:\n{}".format( - render.dict(index_url_overrides), - )) + return found_on_index - return contents - -def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download_kwargs): +def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, parse_index = False, **download_kwargs): """Read SimpleAPI. Args: @@ -189,6 +199,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download cache: {type}`struct` the `pypi_cache` instance. versions: {type}`list[str] The versions that have been requested. get_auth: A function to get auth information. Used in tests. + parse_index: TODO **download_kwargs: Any extra params to ctx.download. Note that output and auth will be passed for you. @@ -196,11 +207,6 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download A similar object to what `download` would return except that in result.out will be the parsed simple api contents. """ - # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for - # the whl location and we cannot handle multiple URLs at once by passing - # them to ctx.download if we want to correctly handle the relative URLs. - # TODO: Add a test that env subbed index urls do not leak into the lock file. - real_url = urllib.strip_empty_path_segments(envsubst(url, attr.envsubst, ctx.getenv)) cache_key = (url, real_url, versions) @@ -242,6 +248,7 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download output = output, cache = cache, cache_key = cache_key, + parse_index = parse_index, ), ) @@ -251,15 +258,16 @@ def _read_simpleapi(ctx, url, attr, cache, versions, get_auth = None, **download output = output, cache = cache, cache_key = cache_key, + parse_index = parse_index, ) -def _read_index_result(ctx, *, result, output, cache, cache_key): +def _read_index_result(ctx, *, result, output, cache, cache_key, parse_index): if not result.success: return struct(success = False) content = ctx.read(output) - output = parse_simpleapi_html(content = content) + output = parse_simpleapi_html(content = content, parse_index = parse_index) if output: cache.setdefault(cache_key, output) return struct(success = True, output = output)