From 1e9d58125845b6621948c5984c81655574276ea4 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 2 Jun 2026 16:46:56 -0700 Subject: [PATCH 1/2] revision: introduce --toplevel-branches In large repositories which are served at central hubs (such as at hosting forges), there tend to be huge numbers of refs, and even huge numbers of branches. These branches are often of forms like refs/heads/${username}/${topic} refs/heads/${bot}/${unique_name} Sometimes, the sheer number of branches is itself a scaling challenge. There are cases where we'd like a cheap approximation of "the union of the main integration branches" (e.g. "main develop next maint"), and it's okay to also take other branches along for the ride but it's nice to limit how many extra refs we include. Introduce a new pseudo-revision `--toplevel-branches` for this purpose. It is similar to the existing `--branches` pseudo-revision and its `--branches=<pattern>` sibling, but there's been no ergonomic way to exclude the large maze of branches in sub-hierarchies (`*` as a pattern matches across `/`). The next commit will use this to speed up push time reachability checks. Add tests to `t6018-rev-list-glob.sh`, which already constructs a fixture with a mix of top-level branches (`main`, `someref`, `subspace-x`) and sub-directory branches (`other/three`, `subspace/one`, `subspace/two`), making it a natural home for the new coverage. 
Signed-off-by: Elijah Newren --- Documentation/git-rev-parse.adoc | 4 ++++ Documentation/rev-list-options.adoc | 6 +++++ builtin/rev-parse.c | 10 +++++++++ refs.c | 35 +++++++++++++++++++++++++++++ refs.h | 2 ++ revision.c | 7 ++++++ t/t6018-rev-list-glob.sh | 26 +++++++++++++++++++++ 7 files changed, 90 insertions(+) diff --git a/Documentation/git-rev-parse.adoc b/Documentation/git-rev-parse.adoc index 5398691f3f15f7..f84659c809759a 100644 --- a/Documentation/git-rev-parse.adoc +++ b/Documentation/git-rev-parse.adoc @@ -199,6 +199,10 @@ If a `pattern` is given, only refs matching the given shell glob are shown. If the pattern does not contain a globbing character (`?`, `*`, or `[`), it is turned into a prefix match by appending `/*`. +--toplevel-branches:: + Show every ref directly under `refs/heads/` (that is, a branch + whose short name does not contain a `/`). + --glob=<pattern>:: Show all refs matching the shell glob pattern `pattern`. If the pattern does not start with `refs/`, this is automatically diff --git a/Documentation/rev-list-options.adoc b/Documentation/rev-list-options.adoc index 94a7b1c065dba8..a1e0c7838535f9 100644 --- a/Documentation/rev-list-options.adoc +++ b/Documentation/rev-list-options.adoc @@ -170,6 +170,12 @@ endif::git-log[] branches to ones matching given shell glob. If _<pattern>_ lacks '?', '{asterisk}', or '[', '/{asterisk}' at the end is implied. +`--toplevel-branches`:: + Pretend as if every ref directly under `refs/heads/` is listed on + the command line as _<commit>_. This is a subset of --branches + which excludes branches in deeper hierarchies, i.e. excluding + branches whose short name contains a `/`. + `--tags[=<pattern>]`:: Pretend as if all the refs in `refs/tags` are listed on the command line as _<commit>_. 
If _<pattern>_ is given, limit diff --git a/builtin/rev-parse.c b/builtin/rev-parse.c index bb882678fe2a9e..a2ce2c5078495a 100644 --- a/builtin/rev-parse.c +++ b/builtin/rev-parse.c @@ -70,6 +70,7 @@ static int is_rev_argument(const char *arg) "--dense", "--branches=", "--branches", + "--toplevel-branches", "--header", "--ignore-missing", "--max-age=", @@ -960,6 +961,15 @@ int cmd_rev_parse(int argc, free(term_bad); continue; } + if (!strcmp(arg, "--toplevel-branches")) { + if (ref_excludes.hidden_refs_configured) + return error(_("options '%s' and '%s' cannot be used together"), + "--exclude-hidden", "--toplevel-branches"); + refs_for_each_toplevel_branch_ref(get_main_ref_store(the_repository), + show_reference, NULL); + clear_ref_exclusions(&ref_excludes); + continue; + } if (opt_with_value(arg, "--branches", &arg)) { if (ref_excludes.hidden_refs_configured) return error(_("options '%s' and '%s' cannot be used together"), diff --git a/refs.c b/refs.c index 0f3355d2ee0be1..f7e18cdf6d216d 100644 --- a/refs.c +++ b/refs.c @@ -552,6 +552,41 @@ int refs_for_each_branch_ref(struct ref_store *refs, refs_for_each_cb cb, void * return refs_for_each_ref_ext(refs, cb, cb_data, &opts); } +struct toplevel_branch_filter { + refs_for_each_cb *fn; + void *cb_data; +}; + +static int filter_toplevel_branch(const struct reference *ref, void *data) +{ + struct toplevel_branch_filter *filter = data; + + /* + * ref->name has had the "refs/heads/" prefix trimmed, so a + * top-level branch like refs/heads/main appears as "main", + * while a sub-directory branch like refs/heads/dscho/wip + * appears as "dscho/wip". 
+ */ + if (strchr(ref->name, '/')) + return 0; + return filter->fn(ref, filter->cb_data); +} + +int refs_for_each_toplevel_branch_ref(struct ref_store *refs, + refs_for_each_cb cb, void *cb_data) +{ + struct toplevel_branch_filter filter = { + .fn = cb, + .cb_data = cb_data, + }; + struct refs_for_each_ref_options opts = { + .prefix = "refs/heads/", + .trim_prefix = strlen("refs/heads/"), + }; + return refs_for_each_ref_ext(refs, filter_toplevel_branch, &filter, + &opts); +} + int refs_for_each_remote_ref(struct ref_store *refs, refs_for_each_cb cb, void *cb_data) { struct refs_for_each_ref_options opts = { diff --git a/refs.h b/refs.h index 71d5c186d044bb..449751b3a1e8dd 100644 --- a/refs.h +++ b/refs.h @@ -506,6 +506,8 @@ int refs_for_each_tag_ref(struct ref_store *refs, refs_for_each_cb fn, void *cb_data); int refs_for_each_branch_ref(struct ref_store *refs, refs_for_each_cb fn, void *cb_data); +int refs_for_each_toplevel_branch_ref(struct ref_store *refs, + refs_for_each_cb fn, void *cb_data); int refs_for_each_remote_ref(struct ref_store *refs, refs_for_each_cb fn, void *cb_data); int refs_for_each_replace_ref(struct ref_store *refs, diff --git a/revision.c b/revision.c index 5693618be4ec81..c527bae0ff920a 100644 --- a/revision.c +++ b/revision.c @@ -2328,6 +2328,7 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg /* pseudo revision arguments */ if (!strcmp(arg, "--all") || !strcmp(arg, "--branches") || + !strcmp(arg, "--toplevel-branches") || !strcmp(arg, "--tags") || !strcmp(arg, "--remotes") || !strcmp(arg, "--reflog") || !strcmp(arg, "--not") || !strcmp(arg, "--no-walk") || !strcmp(arg, "--do-walk") || @@ -2807,6 +2808,12 @@ static int handle_revision_pseudo_opt(struct rev_info *revs, "--exclude-hidden", "--branches"); handle_refs(refs, revs, *flags, refs_for_each_branch_ref); clear_ref_exclusions(&revs->ref_excludes); + } else if (!strcmp(arg, "--toplevel-branches")) { + if (revs->ref_excludes.hidden_refs_configured) + 
return error(_("options '%s' and '%s' cannot be used together"), + "--exclude-hidden", "--toplevel-branches"); + handle_refs(refs, revs, *flags, refs_for_each_toplevel_branch_ref); + clear_ref_exclusions(&revs->ref_excludes); } else if (!strcmp(arg, "--bisect")) { read_bisect_terms(&term_bad, &term_good); handle_refs(refs, revs, *flags, for_each_bad_bisect_ref); diff --git a/t/t6018-rev-list-glob.sh b/t/t6018-rev-list-glob.sh index bb55c7e3c3c30d..77d6965fd5f0a0 100755 --- a/t/t6018-rev-list-glob.sh +++ b/t/t6018-rev-list-glob.sh @@ -423,4 +423,30 @@ test_expect_failure 'shortlog --glob is not confused by option-like argument' ' ' +test_expect_success 'rev-parse --toplevel-branches matches only top-level branches' ' + + git rev-parse main someref subspace-x | sort >expect && + git rev-parse --toplevel-branches | sort >actual && + test_cmp expect actual + +' + +test_expect_success 'rev-list --toplevel-branches matches only top-level branches' ' + + git rev-list --no-walk main someref subspace-x | sort >expect && + git rev-list --no-walk --toplevel-branches | sort >actual && + test_cmp expect actual + +' + +test_expect_success 'rev-list --not --toplevel-branches excludes top-level reachable' ' + + # "main" itself is a top-level branch, so excluding all + # top-level branches from the walk starting at main leaves + # nothing to enumerate. + git rev-list main --not --toplevel-branches >actual && + test_must_be_empty actual + +' + test_done From 01e9fe2e9f930d910991b9f9a7eea93e9fd5defe Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 2 Jun 2026 16:56:32 -0700 Subject: [PATCH 2/2] receive-pack: use a smaller set of refs for the connectivity check When git-receive-pack(1) verifies that a pushed pack is fully connected, it invokes git rev-list --objects --stdin --not --exclude-hidden=receive --all with the new tips piped in on stdin. The purpose of this check is to ensure that ref updates sent with the receive-pack are well connected, i.e. 
that they don't depend upon objects in the repository that reference non-existent objects. The portion following "--not" is just a performance optimization that relies on the assumption that existing refs in the repository are well connected. (Thus, we assume that refs are well connected, but not necessarily all objects in the repository are.) Prior to bcec6780b2 (receive-pack: only use visible refs for connectivity check, 2022-11-17) the only flag after "--not" was "--all", but it was noted that this "optimization" was far from optimal and the "--exclude-hidden=receive" argument was added. In fact, having huge numbers of refs after "--not" is generally suboptimal. See also commit 68cb0b5253a0 (builtin/receive-pack: add option to skip connectivity check, 2025-05-20), which suggests that others besides us are still getting connectivity checks that are taking too long. If we still need a connectivity check, the optimal choice is probably just the O(1) set of "primary integration branches" that exist for the repository, e.g. "main next seen maint develop", but discovering that list of branch names automatically for any given repository is difficult. My timings suggest that using "HEAD" is preferable to using "--all" and is likely near optimal in most cases, but I know there are a few repositories out there that leave HEAD pointing to a non-existent branch or which never update HEAD and update other branches instead. Thus, I propose a middle ground: "--toplevel-branches HEAD". Some timings across 86 pushes in a monorepo: Group Mean Median Min Max 1: --not --exclude-hidden=receive --all 2.219 2.520 0.33 8.30 2: --not --branches 2.099 2.430 0.24 8.43 3: --not <toplevel-branches> 0.883 0.770 0.05 2.11 4: --not HEAD 0.853 0.700 0.00 3.55 where <toplevel-branches> was more specifically: $(git for-each-ref --format='%(objectname)' 'refs/heads/[^/]*') and thus was also paying for the overhead of invoking a subprocess. 
(For reference, this repo has about 6 times as many refs as branches, and about 13 times as many total branches as toplevel branches.) Note also that the biggest slowdown from group 1->3 on a given push was 0.34s -> 1.05s (approximately 3.1x) and the biggest speedup from group 1->3 on a given push was 4.52s -> 0.69s (approximately 6.5x). While we could use the above for-each-ref invocation, that'd be an extra subprocess, and it'd run the risk of violating command line length limits (big monorepos often still have enough toplevel branches that we have to worry about even those). Instead, use the new --toplevel-branches flag added in the previous commit, updating our rev-list reachability command to git rev-list --objects --stdin --not --toplevel-branches HEAD Signed-off-by: Elijah Newren --- builtin/receive-pack.c | 2 +- connected.c | 16 ++++++++++++---- connected.h | 12 +++++++++++- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index 71e1f3dcd4a204..0ae75b9c210973 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -2058,7 +2058,7 @@ static void execute_commands(struct command *commands, opt.err_fd = err_fd; opt.progress = err_fd && !quiet; opt.env = tmp_objdir_env(tmp_objdir); - opt.exclude_hidden_refs_section = "receive"; + opt.use_toplevel_branches_for_reachability = 1; if (check_connected(iterate_receive_command_list, &data, &opt)) set_connectivity_errors(commands, si); diff --git a/connected.c b/connected.c index 7e269768327238..3d1708075760a2 100644 --- a/connected.c +++ b/connected.c @@ -8,6 +8,7 @@ #include "sigchain.h" #include "connected.h" #include "transport.h" +#include "object-name.h" #include "packfile.h" #include "promisor-remote.h" @@ -93,10 +94,17 @@ int check_connected(oid_iterate_fn fn, void *cb_data, strvec_push(&rev_list.args, "--exclude-promisor-objects"); if (!opt->is_deepening_fetch) { strvec_push(&rev_list.args, "--not"); - if 
(opt->exclude_hidden_refs_section) - strvec_pushf(&rev_list.args, "--exclude-hidden=%s", - opt->exclude_hidden_refs_section); - strvec_push(&rev_list.args, "--all"); + if (opt->use_toplevel_branches_for_reachability) { + struct object_id head_oid; + strvec_push(&rev_list.args, "--toplevel-branches"); + if (!repo_get_oid(the_repository, "HEAD", &head_oid)) + strvec_push(&rev_list.args, "HEAD"); + } else { + if (opt->exclude_hidden_refs_section) + strvec_pushf(&rev_list.args, "--exclude-hidden=%s", + opt->exclude_hidden_refs_section); + strvec_push(&rev_list.args, "--all"); + } } strvec_push(&rev_list.args, "--quiet"); strvec_push(&rev_list.args, "--alternate-refs"); diff --git a/connected.h b/connected.h index 16b2c84f2e35fc..0ab84e5c1013f0 100644 --- a/connected.h +++ b/connected.h @@ -50,9 +50,19 @@ struct check_connected_options { /* * If not NULL, use `--exclude-hidden=$section` to exclude all refs * hidden via the `$section.hideRefs` config from the set of - * already-reachable refs. + * already-reachable refs; irrelevant if + * use_toplevel_branches_for_reachability is set. */ const char *exclude_hidden_refs_section; + + /* + * If set, use only toplevel branches (and HEAD) for the + * reachability check. This avoids the linear-in-refcount + * enumeration of every visible ref in repositories with many + * branches/tags, at the cost of walking a little further into + * already-reachable history. + */ + unsigned use_toplevel_branches_for_reachability : 1; }; #define CHECK_CONNECTED_INIT { 0 }