-
Notifications
You must be signed in to change notification settings - Fork 10
[DO NOT MERGE] feat: linearization tour over the code graph (draft) #127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
jonathanpopham
wants to merge
2
commits into
supermodeltools:main
from
jonathanpopham:feat/linearization-tour
Closed
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,103 @@ | ||
| package cmd | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "fmt" | ||
| "os" | ||
| "path/filepath" | ||
|
|
||
| "github.com/spf13/cobra" | ||
|
|
||
| "github.com/supermodeltools/cli/internal/api" | ||
| "github.com/supermodeltools/cli/internal/shards" | ||
| "github.com/supermodeltools/cli/internal/ui" | ||
| ) | ||
|
|
||
| func init() { | ||
| var strategyName string | ||
| var seed string | ||
| var narrate bool | ||
| var budgetTokens int | ||
| var dryRun bool | ||
|
|
||
| c := &cobra.Command{ | ||
| Use: "tour [path]", | ||
| Short: "Emit a linearized reading order over the code graph", | ||
| Long: `Generates .supermodel/TOUR.md — a single-file reading spine that walks the | ||
| repository in a strategy-chosen order, grouped by domain/subdomain, with each | ||
| entry linking to its per-file shard. This gives agents a deterministic path | ||
| through the codebase instead of N independent shards with no order. | ||
|
|
||
| Strategies: | ||
| topo reverse-topological over imports (leaves first, roots last) | ||
| bfs-seed breadth-first from --seed outward (focused tours) | ||
| dfs-seed depth-first from --seed outward | ||
| centrality files with the largest blast radius first | ||
|
|
||
| When --narrate is set, each existing .graph.* shard is rewritten with a prose | ||
| preamble describing the file's role as sentences (rather than only structured | ||
| arrows). Same data, different rendering targeted at LLM reading style. | ||
|
|
||
| When --budget-tokens is set and the tour exceeds the budget, TOUR.md becomes an | ||
| index linking to TOUR.01.md, TOUR.02.md, ... sized to fit one chapter per turn. | ||
|
|
||
| Reads .supermodel/shards.json produced by 'supermodel analyze'. No API call. | ||
| See docs/linearization.md for the design rationale.`, | ||
| Args: cobra.MaximumNArgs(1), | ||
| RunE: func(_ *cobra.Command, args []string) error { | ||
| dir := "." | ||
| if len(args) > 0 { | ||
| dir = args[0] | ||
| } | ||
| repoDir, err := filepath.Abs(dir) | ||
| if err != nil { | ||
| return fmt.Errorf("resolving path: %w", err) | ||
| } | ||
| cacheFile := filepath.Join(repoDir, ".supermodel", "shards.json") | ||
| data, err := os.ReadFile(cacheFile) | ||
| if err != nil { | ||
| return fmt.Errorf("reading cache %s: %w (run `supermodel analyze` first)", cacheFile, err) | ||
| } | ||
| var ir api.ShardIR | ||
| if err := json.Unmarshal(data, &ir); err != nil { | ||
| return fmt.Errorf("parsing cache: %w", err) | ||
| } | ||
| cache := shards.NewCache() | ||
| cache.Build(&ir) | ||
|
|
||
| strategy, err := shards.ResolveStrategy(strategyName, seed) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| out, err := shards.WriteTour(repoDir, cache, strategy, budgetTokens, dryRun) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| if !dryRun { | ||
| ui.Success("Wrote tour to %s (strategy: %s)", out, strategy.Name()) | ||
| } | ||
|
|
||
| if narrate { | ||
| files := cache.SourceFiles() | ||
| written, rerr := shards.RenderAll(repoDir, cache, files, true, dryRun) | ||
| if rerr != nil { | ||
| return fmt.Errorf("re-rendering shards with narrative: %w", rerr) | ||
| } | ||
| if !dryRun { | ||
| ui.Success("Re-wrote %d shards with narrative preamble", written) | ||
| } | ||
| } | ||
| return nil | ||
| }, | ||
| } | ||
|
|
||
| c.Flags().StringVar(&strategyName, "strategy", "topo", | ||
| "linearization strategy: topo | bfs-seed | dfs-seed | centrality") | ||
| c.Flags().StringVar(&seed, "seed", "", "seed file path (required for bfs-seed / dfs-seed)") | ||
| c.Flags().BoolVar(&narrate, "narrate", false, "also rewrite existing .graph.* shards with a prose narrative preamble") | ||
| c.Flags().IntVar(&budgetTokens, "budget-tokens", 0, "chunk tour into chapters of this token budget (0 = single file)") | ||
| c.Flags().BoolVar(&dryRun, "dry-run", false, "print what would be written without touching disk") | ||
|
|
||
| rootCmd.AddCommand(c) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,180 @@ | ||
| # Graph Linearization for Sharding | ||
|
|
||
| ## Thesis | ||
|
|
||
| LLMs are one-dimensional. They consume a token stream and attend to positions | ||
| within it. Graphs are multi-dimensional: nodes are connected by edges that | ||
| don't live on the token axis. A model handed a blob of JSON nodes and edges has | ||
| to do pointer-chasing on UUIDs inside a single attention pass — work that scales | ||
| badly with graph size and burns context. | ||
|
|
||
| **Graph linearization** is the deliberate serialization of a graph into a | ||
| reading order the model can consume left-to-right, with local neighborhoods | ||
| kept close in the token stream and adjacency rendered as prose rather than | ||
| identifiers. See Xypolopoulos et al., *Graph Linearization Methods for Reasoning | ||
| on Graphs with Large Language Models* (arXiv:2410.19494) for the underlying | ||
| principles: centrality and degeneracy-based orderings substantially beat random | ||
| serialization on LLM graph-reasoning tasks. | ||
|
|
||
| ## Where the CLI stands today | ||
|
|
||
| `supermodel analyze` already writes per-file sidecar shards (`.graph.ext` or | ||
| `.calls / .deps / .impact`). Those shards are **file-level linearization**: | ||
| each sidecar collapses a subgraph into a `[deps] / [calls] / [impact]` text | ||
| layout the model reads before touching the source file. | ||
|
|
||
| Two things are missing: | ||
|
|
||
| 1. **No reading order across files.** Agents see N independent shards and have | ||
| to guess which to read first. There is no spine. | ||
| 2. **No prose adjacency inside a shard.** Call relationships are rendered as | ||
| `name ← other path:line` arrows. Accurate and terse, but the model | ||
| reconstructs sentences on the fly every time. | ||
|
|
||
| Sharding produces the units. Linearization produces the **order and | ||
| narrative** over those units. | ||
|
|
||
| ## Design: the Tour | ||
|
|
||
| A *tour* is a single markdown file — `.supermodel/TOUR.md` — that serializes | ||
| the whole repository graph into a linear walk. It is the spine that makes the | ||
| existing shards navigable. | ||
|
|
||
| ``` | ||
| TOUR.md ← linear walk (this feature) | ||
| src/auth/session.go ← source file | ||
| src/auth/session.graph.go ← existing shard (per-file linearization) | ||
| ``` | ||
|
|
||
| Agents read `TOUR.md` once to get the layout, then open shards + source in the | ||
| order the tour presents them. | ||
|
|
||
| ### Structure of TOUR.md | ||
|
|
||
| ```markdown | ||
| # Repository Tour — supermodel-cli | ||
|
|
||
| **Strategy:** reverse-topological over the import graph | ||
| (leaves → roots). Read top-to-bottom to see dependencies before dependents. | ||
|
|
||
| ## Domain: Analyze | ||
| ### Subdomain: Pipeline | ||
| - **internal/analyze/handler.go** — orchestrates upload + render | ||
| reads: api, config, shards · read by: cmd/analyze.go | ||
| risk: MEDIUM · [shard](../internal/analyze/handler.graph.go) | ||
|
|
||
| ## Domain: Shards | ||
| ### Subdomain: Rendering | ||
| - **internal/shards/render.go** — emits .graph sidecars per source file | ||
| reads: api · read by: internal/shards/handler.go | ||
| risk: LOW · [shard](../internal/shards/render.graph.go) | ||
| ... | ||
| ``` | ||
|
|
||
| One short entry per file — name, one-line role, adjacency, risk, shard pointer — | ||
| grouped under its domain and subdomain headings. Linear order is the strategy's output. The agent reads prefix-to-suffix. | ||
|
|
||
| ### Linearization strategies | ||
|
|
||
| Strategies are interchangeable. The default is `topo` because it matches how | ||
| humans read codebases ("what are the leaves, then what depends on them"). | ||
|
|
||
| | Strategy | Ordering | Best for | | ||
| |--------------|---------------------------------------------------------|-----------------------------------------| | ||
| | `topo` | reverse-topological over imports (leaves first) | whole-codebase onboarding | | ||
| | `bfs-seed` | BFS from `--seed <file>` outward | focused tasks, blast radius walks | | ||
| | `dfs-seed` | DFS from `--seed <file>` — depth-first exploration | tracing a request through layers | | ||
| | `centrality` | PageRank-like over importers (most-depended-on first) | "what's the core of this codebase" | | ||
|
|
||
| Cycles are broken by file-path lexicographic order (deterministic, boring). | ||
|
|
||
| ### Prose narrative preamble (opt-in) | ||
|
|
||
| Tour generation also lets you inject a prose preamble into each existing shard | ||
| with `--narrate`: | ||
|
|
||
| ```go | ||
| // @generated supermodel-shard — do not edit | ||
| // | ||
| // Narrative: parseConfig (Domain Config / Loading) is called by main | ||
| // (cmd/root.go:42) and serverInit (cmd/server.go:18). It calls readFile | ||
| // and json.Unmarshal. Imports: os, encoding/json. Risk: LOW. | ||
| // | ||
| // [deps] | ||
| // imports os | ||
| // imports encoding/json | ||
| // ... | ||
| ``` | ||
|
|
||
| The preamble is a one-paragraph summary derived from the same cache used for | ||
| the structured sections — no new data, just a second rendering targeted at the | ||
| model's native reading style. Flag-gated so users can A/B. | ||
|
|
||
| ## CLI surface (implemented) | ||
|
|
||
| Standalone: | ||
|
|
||
| ``` | ||
| supermodel tour [--strategy topo|bfs-seed|dfs-seed|centrality] | ||
| [--seed <file>] | ||
| [--narrate] | ||
| [--budget-tokens <N>] | ||
| [--dry-run] | ||
| [path] | ||
| ``` | ||
|
|
||
| Integrated with `analyze` so a single command emits shards + spine: | ||
|
|
||
| ``` | ||
| supermodel analyze [--tour] | ||
| [--tour-strategy topo|bfs-seed|dfs-seed|centrality] | ||
| [--tour-seed <file>] | ||
| [--tour-budget <N>] | ||
| [--narrate] | ||
| [path] | ||
| ``` | ||
|
|
||
| - Reads `.supermodel/shards.json` (errors if absent — prompts `analyze` first). | ||
| - Writes `.supermodel/TOUR.md`. | ||
| - With `--narrate`, rewrites existing `.graph.*` shards in place to include a | ||
| prose narrative preamble. | ||
| - `--budget-tokens` chunks the tour into `TOUR.01.md`, `TOUR.02.md`, ... with | ||
| `TOUR.md` becoming an index. Each chapter has prev/next cross-links. | ||
|
|
||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| No API call. No new cache. Pure reshaping of what `analyze` already produced. | ||
|
|
||
| ## Why this shape | ||
|
|
||
| - **Same vertical slice.** Tour lives inside `internal/shards/` — it consumes | ||
| the shard cache and emits a companion artifact. No cross-slice dependency. | ||
| - **Additive.** Default behavior of `analyze` is unchanged. Tour is opt-in. | ||
| - **Deterministic.** Lexicographic tiebreaks, stable sort; tour file is safe to | ||
| commit or diff. | ||
| - **Strategy-pluggable.** The `Strategy` interface is small (`Order(cache) []string` | ||
| for the ordering, plus `Name()` for the label the CLI reports), so we can add | ||
| more orderings without touching the renderer. | ||
|
|
||
| ## Open questions | ||
|
|
||
| - Should tour output default-render inline snippets of each shard, or strictly | ||
| link to them? Inline is self-contained (one file to read) but duplicates | ||
| content; linked is DRY but requires the agent to follow pointers. | ||
| - Should there be a `--focus <glob>` filter so tours scope to a subtree? | ||
| - Does `arch-docs` want to consume TOUR.md as its entry point (replacing its | ||
| own traversal)? | ||
| - Running `supermodel tour` with a different `--budget-tokens` should probably | ||
| clean up stale `TOUR.NN.md` files from a prior chunked run. Cosmetic. | ||
| - Benchmark: we need numbers. Plan to wire through | ||
| `supermodeltools/supermodel-benchmarks/shard-ab-test/` to measure | ||
| agent performance with/without TOUR + narrate. | ||
|
|
||
| ## References | ||
|
|
||
| - Xypolopoulos et al., *Graph Linearization Methods for Reasoning on Graphs | ||
| with Large Language Models*, arXiv:2410.19494 | ||
| - `supermodeltools/codegraph-graphrag` — BFS narrative walks, the thesis doc | ||
| in the org | ||
| - `supermodeltools/graph2md` — per-node markdown emission (another | ||
| linearization strategy) | ||
| - `supermodeltools/mcp/src/tools/explore-function.ts` — `describeNode()` | ||
| prose format, cross-subsystem markers | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`--narrate` can destructively switch shard format.
Calling `RenderAll` here rewrites as `.graph.*` and triggers stale three-file cleanup, so repos using `.calls/.deps/.impact` can lose those files during `supermodel tour --narrate`.
Safer approach (rewrite only existing .graph shards):
```diff
 if narrate {
 	files := cache.SourceFiles()
-	written, rerr := shards.RenderAll(repoDir, cache, files, true, dryRun)
+	var graphFiles []string
+	for _, f := range files {
+		if _, statErr := os.Stat(filepath.Join(repoDir, shards.ShardFilename(f))); statErr == nil {
+			graphFiles = append(graphFiles, f)
+		}
+	}
+	written, rerr := shards.RenderAll(repoDir, cache, graphFiles, true, dryRun)
 	if rerr != nil {
 		return fmt.Errorf("re-rendering shards with narrative: %w", rerr)
 	}
```
🤖 Prompt for AI Agents