Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 32 additions & 2 deletions cmd/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ func init() {
var opts analyze.Options
var noShards bool
var threeFile bool
var narrate bool
var tour bool
var tourStrategy string
var tourSeed string
var tourBudget int

c := &cobra.Command{
Use: "analyze [path]",
Expand All @@ -25,7 +30,16 @@ Results are cached locally by content hash. Subsequent commands
(dead-code, blast-radius, graph) reuse the cache automatically.

By default, .graph.* shard files are written next to each source file.
Use --no-shards to skip writing graph files.`,
Use --no-shards to skip writing graph files.

Linearization flags:
--narrate prefix each shard with a prose narrative preamble
--tour also emit .supermodel/TOUR.md (the reading spine)
--tour-strategy topo | bfs-seed | dfs-seed | centrality (default: topo)
--tour-seed seed file for bfs-seed/dfs-seed
--tour-budget chunk tour into chapters of this token budget

See docs/linearization.md for design.`,
Args: cobra.MaximumNArgs(1),
RunE: func(cmd *cobra.Command, args []string) error {
cfg, err := config.Load()
Expand All @@ -38,6 +52,9 @@ Use --no-shards to skip writing graph files.`,
if noShards && threeFile {
return fmt.Errorf("--three-file cannot be used with --no-shards")
}
if noShards && (narrate || tour) {
return fmt.Errorf("--narrate and --tour require shards (cannot combine with --no-shards)")
}
dir := "."
if len(args) > 0 {
dir = args[0]
Expand All @@ -46,7 +63,15 @@ Use --no-shards to skip writing graph files.`,
// Shard mode: Generate handles the full pipeline (API call +
// cache + shards) in a single upload. Running analyze.Run
// first would duplicate the API call.
return shards.Generate(cmd.Context(), cfg, dir, shards.GenerateOptions{Force: opts.Force, ThreeFile: threeFile})
return shards.Generate(cmd.Context(), cfg, dir, shards.GenerateOptions{
Force: opts.Force,
ThreeFile: threeFile,
Narrate: narrate,
Tour: tour,
TourStrategy: tourStrategy,
TourSeed: tourSeed,
TourBudget: tourBudget,
})
}
return analyze.Run(cmd.Context(), cfg, dir, opts)
},
Expand All @@ -56,6 +81,11 @@ Use --no-shards to skip writing graph files.`,
c.Flags().StringVarP(&opts.Output, "output", "o", "", "output format: human|json")
c.Flags().BoolVar(&noShards, "no-shards", false, "skip writing .graph.* shard files")
c.Flags().BoolVar(&threeFile, "three-file", false, "generate .calls/.deps/.impact files instead of single .graph")
c.Flags().BoolVar(&narrate, "narrate", false, "prefix each shard with a prose narrative preamble")
c.Flags().BoolVar(&tour, "tour", false, "also emit .supermodel/TOUR.md — the linear reading spine")
c.Flags().StringVar(&tourStrategy, "tour-strategy", "topo", "tour ordering: topo | bfs-seed | dfs-seed | centrality")
c.Flags().StringVar(&tourSeed, "tour-seed", "", "seed file for bfs-seed / dfs-seed strategies")
c.Flags().IntVar(&tourBudget, "tour-budget", 0, "chunk tour into chapters of this token budget (0 = single file)")

rootCmd.AddCommand(c)
}
103 changes: 103 additions & 0 deletions cmd/tour.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package cmd

import (
"encoding/json"
"fmt"
"os"
"path/filepath"

"github.com/spf13/cobra"

"github.com/supermodeltools/cli/internal/api"
"github.com/supermodeltools/cli/internal/shards"
"github.com/supermodeltools/cli/internal/ui"
)

func init() {
var strategyName string
var seed string
var narrate bool
var budgetTokens int
var dryRun bool

c := &cobra.Command{
Use: "tour [path]",
Short: "Emit a linearized reading order over the code graph",
Long: `Generates .supermodel/TOUR.md — a single-file reading spine that walks the
repository in a strategy-chosen order, grouped by domain/subdomain, with each
entry linking to its per-file shard. This gives agents a deterministic path
through the codebase instead of N independent shards with no order.

Strategies:
topo reverse-topological over imports (leaves first, roots last)
bfs-seed breadth-first from --seed outward (focused tours)
dfs-seed depth-first from --seed outward
centrality files with the largest blast radius first

When --narrate is set, each existing .graph.* shard is rewritten with a prose
preamble describing the file's role as sentences (rather than only structured
arrows). Same data, different rendering targeted at LLM reading style.

When --budget-tokens is set and the tour exceeds the budget, TOUR.md becomes an
index linking to TOUR.01.md, TOUR.02.md, ... sized to fit one chapter per turn.

Reads .supermodel/shards.json produced by 'supermodel analyze'. No API call.
See docs/linearization.md for the design rationale.`,
Args: cobra.MaximumNArgs(1),
RunE: func(_ *cobra.Command, args []string) error {
dir := "."
if len(args) > 0 {
dir = args[0]
}
repoDir, err := filepath.Abs(dir)
if err != nil {
return fmt.Errorf("resolving path: %w", err)
}
cacheFile := filepath.Join(repoDir, ".supermodel", "shards.json")
data, err := os.ReadFile(cacheFile)
if err != nil {
return fmt.Errorf("reading cache %s: %w (run `supermodel analyze` first)", cacheFile, err)
}
var ir api.ShardIR
if err := json.Unmarshal(data, &ir); err != nil {
return fmt.Errorf("parsing cache: %w", err)
}
cache := shards.NewCache()
cache.Build(&ir)

strategy, err := shards.ResolveStrategy(strategyName, seed)
if err != nil {
return err
}

out, err := shards.WriteTour(repoDir, cache, strategy, budgetTokens, dryRun)
if err != nil {
return err
}
if !dryRun {
ui.Success("Wrote tour to %s (strategy: %s)", out, strategy.Name())
}

if narrate {
files := cache.SourceFiles()
written, rerr := shards.RenderAll(repoDir, cache, files, true, dryRun)
if rerr != nil {
Comment on lines +81 to +84
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

--narrate can destructively switch shard format.

Calling RenderAll here rewrites the shards as .graph.* files and triggers stale three-file cleanup, so repos using .calls/.deps/.impact can lose those files when running `supermodel tour --narrate`.

Safer approach (rewrite only existing .graph shards)
 			if narrate {
 				files := cache.SourceFiles()
-				written, rerr := shards.RenderAll(repoDir, cache, files, true, dryRun)
+				var graphFiles []string
+				for _, f := range files {
+					if _, statErr := os.Stat(filepath.Join(repoDir, shards.ShardFilename(f))); statErr == nil {
+						graphFiles = append(graphFiles, f)
+					}
+				}
+				written, rerr := shards.RenderAll(repoDir, cache, graphFiles, true, dryRun)
 				if rerr != nil {
 					return fmt.Errorf("re-rendering shards with narrative: %w", rerr)
 				}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@cmd/tour.go` around lines 81 - 84, When --narrate is set we must not rewrite
other shard formats; instead, before calling shards.RenderAll from the narrate
branch (where cache.SourceFiles() is used), filter the files list to only those
that currently use the .graph shard format (i.e. detect existing .graph.* shards
via the cache/metadata API), or replace the call with a new helper like
shards.RenderExistingGraphShards(repoDir, cache, graphFiles, true, dryRun) that
only rewrites .graph shards; update the narrate branch to use the filtered list
(or new helper) so RenderAll does not convert repos from .calls/.deps/.impact to
.graph and avoid stale-file cleanup deleting those other formats.

return fmt.Errorf("re-rendering shards with narrative: %w", rerr)
}
if !dryRun {
ui.Success("Re-wrote %d shards with narrative preamble", written)
}
}
return nil
},
}

c.Flags().StringVar(&strategyName, "strategy", "topo",
"linearization strategy: topo | bfs-seed | dfs-seed | centrality")
c.Flags().StringVar(&seed, "seed", "", "seed file path (required for bfs-seed / dfs-seed)")
c.Flags().BoolVar(&narrate, "narrate", false, "also rewrite existing .graph.* shards with a prose narrative preamble")
c.Flags().IntVar(&budgetTokens, "budget-tokens", 0, "chunk tour into chapters of this token budget (0 = single file)")
c.Flags().BoolVar(&dryRun, "dry-run", false, "print what would be written without touching disk")

rootCmd.AddCommand(c)
}
180 changes: 180 additions & 0 deletions docs/linearization.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# Graph Linearization for Sharding

## Thesis

LLMs are one-dimensional. They consume a token stream and attend to positions
within it. Graphs are multi-dimensional: nodes are connected by edges that
don't live on the token axis. A model handed a blob of JSON nodes and edges has
to do pointer-chasing on UUIDs inside a single attention pass — work that scales
badly with graph size and burns context.

**Graph linearization** is the deliberate serialization of a graph into a
reading order the model can consume left-to-right, with local neighborhoods
kept close in the token stream and adjacency rendered as prose rather than
identifiers. See Xypolopoulos et al., *Graph Linearization Methods for Reasoning
on Graphs with Large Language Models* (arXiv:2410.19494) for the underlying
principles: centrality and degeneracy-based orderings substantially beat random
serialization on LLM graph-reasoning tasks.

## Where the CLI stands today

`supermodel analyze` already writes per-file sidecar shards (`.graph.ext` or
`.calls / .deps / .impact`). Those shards are **file-level linearization**:
each sidecar collapses a subgraph into a `[deps] / [calls] / [impact]` text
layout the model reads before touching the source file.

Two things are missing:

1. **No reading order across files.** Agents see N independent shards and have
to guess which to read first. There is no spine.
2. **No prose adjacency inside a shard.** Call relationships are rendered as
`name ← other path:line` arrows. Accurate and terse, but the model
reconstructs sentences on the fly every time.

Sharding produces the units. Linearization produces the **order and
narrative** over those units.

## Design: the Tour

A *tour* is a single markdown file — `.supermodel/TOUR.md` — that serializes
the whole repository graph into a linear walk. It is the spine that makes the
existing shards navigable.

```
TOUR.md ← linear walk (this feature)
src/auth/session.go ← source file
src/auth/session.graph.go ← existing shard (per-file linearization)
```

Agents read `TOUR.md` once to get the layout, then open shards + source in the
order the tour presents them.

### Structure of TOUR.md

```markdown
# Repository Tour — supermodel-cli

**Strategy:** reverse-topological over the import graph
(leaves → roots). Read top-to-bottom to see dependencies before dependents.

## Domain: Analyze
### Subdomain: Pipeline
- **internal/analyze/handler.go** — orchestrates upload + render
reads: api, config, shards · read by: cmd/analyze.go
risk: MEDIUM · [shard](../internal/analyze/handler.graph.go)

## Domain: Shards
### Subdomain: Rendering
- **internal/shards/render.go** — emits .graph sidecars per source file
reads: api · read by: internal/shards/handler.go
risk: LOW · [shard](../internal/shards/render.graph.go)
...
```

One prose entry per file — name, domain, adjacency, risk, shard pointer. Linear
order is the strategy's output. The agent reads the tour top to bottom.

### Linearization strategies

Strategies are interchangeable. The default is `topo` because it matches how
humans read codebases ("what are the leaves, then what depends on them").

| Strategy | Ordering | Best for |
|--------------|---------------------------------------------------------|-----------------------------------------|
| `topo` | reverse-topological over imports (leaves first) | whole-codebase onboarding |
| `bfs-seed` | BFS from `--seed <file>` outward | focused tasks, blast radius walks |
| `dfs-seed` | DFS from `--seed <file>` — depth-first exploration | tracing a request through layers |
| `centrality` | PageRank-like over importers (most-depended-on first) | "what's the core of this codebase" |

Cycles are broken by file-path lexicographic order (deterministic, boring).

### Prose narrative preamble (opt-in)

Tour generation also lets you inject a prose preamble into each existing shard
with `--narrate`:

```go
// @generated supermodel-shard — do not edit
//
// Narrative: parseConfig (Domain Config / Loading) is called by main
// (cmd/root.go:42) and serverInit (cmd/server.go:18). It calls readFile
// and json.Unmarshal. Imports: os, encoding/json. Risk: LOW.
//
// [deps]
// imports os
// imports encoding/json
// ...
```

The preamble is a one-paragraph summary derived from the same cache used for
the structured sections — no new data, just a second rendering targeted at the
model's native reading style. Flag-gated so users can A/B.

## CLI surface (implemented)

Standalone:

```
supermodel tour [--strategy topo|bfs-seed|dfs-seed|centrality]
[--seed <file>]
[--narrate]
[--budget-tokens <N>]
[--dry-run]
[path]
```

Integrated with `analyze` so a single command emits shards + spine:

```
supermodel analyze [--tour]
[--tour-strategy topo|bfs-seed|dfs-seed|centrality]
[--tour-seed <file>]
[--tour-budget <N>]
[--narrate]
[path]
```

- `supermodel tour` reads `.supermodel/shards.json` and errors if it is absent,
telling you to run `supermodel analyze` first.
- Writes `.supermodel/TOUR.md`.
- With `--narrate`, rewrites existing `.graph.*` shards in place to include a
prose narrative preamble.
- `--budget-tokens` (`--tour-budget` under `analyze`) chunks the tour into
`TOUR.01.md`, `TOUR.02.md`, ... when it exceeds the budget, with `TOUR.md`
becoming an index. Each chapter has prev/next cross-links.

Comment thread
coderabbitai[bot] marked this conversation as resolved.
No API call. No new cache. Pure reshaping of what `analyze` already produced.

## Why this shape

- **Same vertical slice.** Tour lives inside `internal/shards/` — it consumes
the shard cache and emits a companion artifact. No cross-slice dependency.
- **Additive.** Default behavior of `analyze` is unchanged. Tour is opt-in.
- **Deterministic.** Lexicographic tiebreaks, stable sort; tour file is safe to
commit or diff.
- **Strategy-pluggable.** The `Strategy` interface is small (an ordering
method, `Order(cache) []string`, plus a `Name()` that the CLI prints after
writing the tour), so we can add more orderings without touching the renderer.

## Open questions

- Should tour output default-render inline snippets of each shard, or strictly
link to them? Inline is self-contained (one file to read) but duplicates
content; linked is DRY but requires the agent to follow pointers.
- Should there be a `--focus <glob>` filter so tours scope to a subtree?
- Does `arch-docs` want to consume TOUR.md as its entry point (replacing its
own traversal)?
- Running `supermodel tour` with a different `--budget-tokens` should probably
clean up stale `TOUR.NN.md` files from a prior chunked run. Cosmetic.
- Benchmark: we need numbers. Plan to wire through
`supermodeltools/supermodel-benchmarks/shard-ab-test/` to measure
agent performance with/without TOUR + narrate.

## References

- Xypolopoulos et al., *Graph Linearization Methods for Reasoning on Graphs
with Large Language Models*, arXiv:2410.19494
- `supermodeltools/codegraph-graphrag` — BFS narrative walks, the thesis doc
in the org
- `supermodeltools/graph2md` — per-node markdown emission (another
linearization strategy)
- `supermodeltools/mcp/src/tools/explore-function.ts` — `describeNode()`
prose format, cross-subsystem markers
6 changes: 3 additions & 3 deletions internal/shards/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ func (d *Daemon) loadOrGenerate(ctx context.Context) error {
d.mu.Unlock()

files := d.cache.SourceFiles()
written, renderErr := RenderAll(d.cfg.RepoDir, d.cache, files, false)
written, renderErr := RenderAll(d.cfg.RepoDir, d.cache, files, false, false)
if renderErr != nil {
return renderErr
}
Expand Down Expand Up @@ -226,7 +226,7 @@ func (d *Daemon) fullGenerate(ctx context.Context) error {
d.mu.Unlock()

files := d.cache.SourceFiles()
written, err := RenderAll(d.cfg.RepoDir, d.cache, files, false)
written, err := RenderAll(d.cfg.RepoDir, d.cache, files, false, false)
if err != nil {
return err
}
Expand Down Expand Up @@ -301,7 +301,7 @@ func (d *Daemon) incrementalUpdate(ctx context.Context, changedFiles []string) {

d.logf("Re-rendering %d affected shards", len(affected))

written, err := RenderAll(d.cfg.RepoDir, cacheSnapshot, affected, false)
written, err := RenderAll(d.cfg.RepoDir, cacheSnapshot, affected, false, false)
if err != nil {
d.logf("Render error: %v", err)
return
Expand Down
Loading
Loading