Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions python/private/pypi/parse_simpleapi_html.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,20 @@
Parse SimpleAPI HTML in Starlark.
"""

load("//python/private:normalize_name.bzl", "normalize_name")
load(":version_from_filename.bzl", "version_from_filename")

def parse_simpleapi_html(*, content):
def parse_simpleapi_html(*, content, parse_index = False):
"""Get the package URLs for given shas by parsing the Simple API HTML.

Args:
content(str): The Simple API HTML content.
content: {type}`str` The Simple API HTML content.
        parse_index: {type}`bool` Whether to parse the content as the index page of a PyPI-style
            index, e.g. `https://pypi.org/simple/`. The index page only has the URLs for the
            individual packages.

Returns:
A list of structs with:
        If parsing the index page, a map from normalized package name to the URL it can be
        queried from. Otherwise, a list of structs with:
* filename: {type}`str` The filename of the artifact.
* version: {type}`str` The version of the artifact.
* url: {type}`str` The URL to download the artifact.
Expand Down Expand Up @@ -59,6 +63,8 @@ def parse_simpleapi_html(*, content):
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))

packages = {}

# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
cursor = 0
for _ in range(1000000): # Safety break for Starlark
Expand All @@ -73,18 +79,23 @@ def parse_simpleapi_html(*, content):
break

# Extract only the necessary slices
attr_part = content[start_tag + 3:tag_end]
filename = content[tag_end + 1:end_tag].strip()
attr_part = content[start_tag + 3:tag_end]

# Update cursor for next iteration
cursor = end_tag + 4

# 3. Efficient Attribute Parsing
attrs = _parse_attrs(attr_part)
href = attrs.get("href", "")
if not href:
continue

if parse_index:
pkg_name = filename
packages[normalize_name(pkg_name)] = href
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check if we have to normalize all of the package names here. It might be a little bit of throw away work that we are doing here.

continue

# 3. Efficient Attribute Parsing
dist_url, _, sha256 = href.partition("#sha256=")

# Handle Yanked status
Expand Down Expand Up @@ -121,6 +132,9 @@ def parse_simpleapi_html(*, content):
else:
sdists[sha256] = dist

if packages:
return packages

return struct(
sdists = sdists,
whls = whls,
Expand Down
52 changes: 52 additions & 0 deletions python/private/pypi/pypi_cache.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ def _pypi_cache_get(self, key):
if not cached and versions:
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)
else:
# TODO @aignas 2026-03-20: add a test here
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: split into a separate PR, because it is a bug in the current implementation.

self._facts.setdefault(index_url, cached)

return cached

Expand Down Expand Up @@ -122,6 +125,13 @@ def _filter_packages(dists, requested_versions):
if dists == None or not requested_versions:
return dists

if type(dists) == "dict":
return {
pkg: url
for pkg, url in dists.items()
if pkg in requested_versions
}

sha256s_by_version = {}
whls = {}
sdists = {}
Expand Down Expand Up @@ -193,6 +203,12 @@ def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_ver
# cannot trust known facts, different version that we know how to parse
return None

if type(requested_versions) == "dict":
return _filter_packages(
dists = known_facts.get("index_urls", {}).get(index_url, {}),
requested_versions = requested_versions,
)

known_sources = {}

root_url, _, distribution = index_url.rstrip("/").rpartition("/")
Expand Down Expand Up @@ -266,10 +282,46 @@ def _store_facts(facts, fact_version, index_url, value):

facts["fact_version"] = fact_version

if type(value) == "dict":
# facts: {
# "index_urls": {
# "<index_url>": {
# "<pkg_normalized>": "<dist_url>",
# },
# },
# },
for pkg, url in value.items():
facts.setdefault("index_urls", {}).setdefault(index_url, {})[pkg] = url
return value

root_url, _, distribution = index_url.rstrip("/").rpartition("/")
distribution = distribution.rstrip("/")
root_url = root_url.rstrip("/")

# The schema is
# facts: {
# "dist_hashes": {
# "<index_url>": {
# "<last segment>": {
# "<dist url>": "<sha256>",
# },
# },
# },
# "dist_filenames": {
# "<index_url>": {
# "<last segment>": {
# "<dist url>": "<filename>", # if it is different from the URL
# },
# },
# },
# "dist_yanked": {
# "<index_url>": {
# "<last segment>": {
# "<sha256>": "<reason>", # if the package is yanked
# },
# },
# },
# },
for sha256, d in (value.sdists | value.whls).items():
facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
if not d.url.endswith(d.filename):
Expand Down
Loading
Loading