diff --git a/go.mod b/go.mod index 555605d..0b13e44 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,11 @@ module github.com/containerd/continuity -go 1.21 +go 1.23 require ( github.com/Microsoft/go-winio v0.6.2 github.com/containerd/log v0.1.0 + github.com/erofs/go-erofs v0.3.1-0.20260531080512-069dc32d83e6 github.com/opencontainers/go-digest v1.0.0 golang.org/x/sync v0.8.0 golang.org/x/sys v0.26.0 diff --git a/go.sum b/go.sum index 4cd8137..2410be8 100644 --- a/go.sum +++ b/go.sum @@ -5,6 +5,8 @@ github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/erofs/go-erofs v0.3.1-0.20260531080512-069dc32d83e6 h1:a9BU6HU86UHLPxkABcUIoLLClluURHpYLU6fM88VrjU= +github.com/erofs/go-erofs v0.3.1-0.20260531080512-069dc32d83e6/go.mod h1:XkSeN9MHszGd4+3gcEjadJLYHCQpWzJ7/8yznzMuzJs= github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= diff --git a/tarconv/apply.go b/tarconv/apply.go new file mode 100644 index 0000000..1704d5b --- /dev/null +++ b/tarconv/apply.go @@ -0,0 +1,514 @@ +// Package tarconv ingests OCI/Docker tar layer streams into an [erofs.Writer] +// via direct writer calls, without staging an intermediate fs.FS. +// +// The single entry point is [Apply]. It handles all tar entry types (regular +// files, directories, symlinks, hard links, device nodes, FIFOs) and three +// whiteout strategies selectable via options: +// +// - Default (no option): translate AUFS/OCI whiteouts to overlayfs xattrs. +// Suitable for per-layer EROFS images that will be stacked at runtime. 
+// - [WithMerge]: resolve whiteouts structurally by removing entries. +// Suitable for flat merged images where all layers are applied in sequence. +// - [WithPreserveWhiteouts]: keep .wh.* entries as plain files. +// Suitable for tooling that needs the raw tar content. +package tarconv + +import ( + archivetar "archive/tar" + "errors" + "fmt" + "io" + "io/fs" + "path" + "strings" + + erofs "github.com/erofs/go-erofs" +) + +// Unix inode type bits (S_IF*), matching the values expected by erofs.Writer.Mknod. +const ( + sifChrdev = uint16(0020000) // character device + sifBlkdev = uint16(0060000) // block device + sifFifo = uint16(0010000) // FIFO / named pipe +) + +const ( + whiteoutPrefix = ".wh." + opaqueWhiteout = ".wh..wh..opq" + overlayOpaqueXattr = "trusted.overlay.opaque" + overlayOriginXattr = "trusted.overlay.origin" + xattrPrefix = "SCHILY.xattr." +) + +// whiteoutMode selects how AUFS/OCI whiteout entries are processed. +type whiteoutMode int + +const ( + // whiteoutConvert is the default: translate whiteouts to overlayfs representation. + whiteoutConvert whiteoutMode = iota + // whiteoutMerge resolves whiteouts by removing entries from the writer tree. + whiteoutMerge + // whiteoutPreserve keeps whiteout entries as plain regular files. + whiteoutPreserve +) + +// config holds the parsed options for an Apply call. +type config struct { + whiteouts whiteoutMode +} + +// Option configures an [Apply] call. +type Option func(*config) + +// WithMerge makes Apply resolve AUFS/OCI whiteout entries structurally: +// - .wh. removes the sibling path from the writer's current tree. +// ErrNotExist is silently swallowed (the target may not yet exist in any +// layer seen so far). +// - .wh..wh..opq removes all existing children of the containing directory, +// leaving the directory itself so subsequent entries can repopulate it. +// +// The resulting image is a flat merged filesystem with no overlay xattrs. 
+// Use WithMerge when calling Apply once per layer to build a single merged image.
+func WithMerge() Option {
+	return func(c *config) { c.whiteouts = whiteoutMerge }
+}
+
+// WithPreserveWhiteouts makes Apply treat .wh.* and .wh..wh..opq entries as
+// ordinary regular files, performing no whiteout translation. The raw tar
+// content is preserved verbatim.
+func WithPreserveWhiteouts() Option {
+	return func(c *config) { c.whiteouts = whiteoutPreserve }
+}
+
+// pendingLink records a hard link whose target had not yet appeared when the
+// link entry was processed. Only header metadata is stored; no payload bytes.
+type pendingLink struct {
+	newname string
+	oldname string // target path, cleaned
+	hdr     archivetar.Header
+}
+
+// Apply ingests the tar stream r into w, translating each entry into a direct
+// [erofs.Writer] call.
+//
+// By default (no options), AUFS/OCI whiteout entries are translated to
+// overlayfs-compatible representation:
+//   - .wh.<name> becomes a character device 0/0 at the sibling path <name>,
+//     and the containing directory receives trusted.overlay.origin="".
+//   - .wh..wh..opq sets trusted.overlay.opaque=y on the containing directory.
+//
+// This matches the behaviour of mkfs.erofs --aufs and is appropriate for
+// single-layer EROFS images that will be stacked by an overlayfs consumer.
+//
+// Use [WithMerge] to resolve whiteouts structurally instead (flat merged image);
+// in that mode a whiteout removes its target together with all descendants.
+// Use [WithPreserveWhiteouts] to keep whiteout entries as plain files.
+//
+// A whiteout whose target name is empty, "." or ".." is rejected with an
+// error, because it would otherwise resolve to the containing directory (or
+// its parent) and replace or delete that whole directory.
+//
+// Hard links may appear in any order. Links whose targets have not yet appeared
+// are queued and resolved as subsequent entries are processed. An unresolved
+// hard link at EOF is returned as an error.
+//
+// All returned errors are prefixed with "tarconv:" and wrap the underlying
+// tar or writer error.
+func Apply(w *erofs.Writer, r io.Reader, opts ...Option) error {
+	var cfg config
+	for _, o := range opts {
+		o(&cfg)
+	}
+
+	tr := archivetar.NewReader(r)
+
+	// pending records hard links whose targets haven't appeared yet.
+	var pending []pendingLink
+
+	// pendingOrigin records directories that need trusted.overlay.origin=""
+	// set once their TypeDir entry appears (handles whiteout-before-dir order).
+	var pendingOrigin map[string]bool
+
+	for {
+		hdr, err := tr.Next()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return fmt.Errorf("tarconv: %w", err)
+		}
+
+		p := cleanTarPath(hdr.Name)
+		base := path.Base(p)
+		dir := path.Dir(p)
+
+		// --- Whiteout detection ---
+		// OCI whiteouts use TypeReg. Detect by name prefix and dispatch
+		// before the normal type switch so they are never added as real entries
+		// (unless WithPreserveWhiteouts is active).
+		if cfg.whiteouts != whiteoutPreserve && strings.HasPrefix(base, whiteoutPrefix) {
+			if base == opaqueWhiteout {
+				switch cfg.whiteouts {
+				case whiteoutMerge:
+					if err := removeChildren(w, dir); err != nil {
+						return fmt.Errorf("tarconv: opaque %s: %w", dir, err)
+					}
+				default: // whiteoutConvert
+					if err := setOpaqueXattr(w, dir, hdr); err != nil {
+						return fmt.Errorf("tarconv: opaque %s: %w", dir, err)
+					}
+				}
+			} else {
+				name := base[len(whiteoutPrefix):]
+				// ".wh.", ".wh.." and ".wh..." would join to dir itself or to
+				// its parent; refuse them instead of wiping a directory.
+				if name == "" || name == "." || name == ".." {
+					return fmt.Errorf("tarconv: whiteout %s: invalid target name %q", p, name)
+				}
+				target := path.Join(dir, name)
+				switch cfg.whiteouts {
+				case whiteoutMerge:
+					// The target may be a populated directory, so remove the
+					// whole subtree. removeAll treats a missing target as
+					// success (it may not exist in any layer seen so far).
+					if err := removeAll(w, target); err != nil {
+						return fmt.Errorf("tarconv: whiteout %s: %w", target, err)
+					}
+				default: // whiteoutConvert
+					if err := emitWhiteout(w, target, hdr); err != nil {
+						return fmt.Errorf("tarconv: whiteout %s: %w", target, err)
+					}
+					// Set trusted.overlay.origin="" on the parent directory to
+					// match mkfs.erofs --aufs behaviour for regular whiteouts.
+					if _, serr := w.Stat(dir); serr == nil {
+						if err := w.Setxattr(dir, overlayOriginXattr, ""); err != nil {
+							return fmt.Errorf("tarconv: whiteout origin %s: %w", dir, err)
+						}
+					} else {
+						// Dir not yet seen — queue for when it appears.
+						if pendingOrigin == nil {
+							pendingOrigin = make(map[string]bool)
+						}
+						pendingOrigin[dir] = true
+					}
+				}
+			}
+			// Drain any data bytes (whiteouts are zero-size in practice but be safe).
+			if _, err := io.Copy(io.Discard, tr); err != nil {
+				return fmt.Errorf("tarconv: drain %s: %w", p, err)
+			}
+			continue
+		}
+
+		// --- Normal entry dispatch ---
+		switch hdr.Typeflag {
+		case archivetar.TypeDir:
+			if err := addDir(w, p, hdr); err != nil {
+				return fmt.Errorf("tarconv: %s: %w", p, err)
+			}
+			// Reading a nil map is safe, so no nil check is needed here.
+			if pendingOrigin[p] {
+				if err := w.Setxattr(p, overlayOriginXattr, ""); err != nil {
+					return fmt.Errorf("tarconv: whiteout origin %s: %w", p, err)
+				}
+				delete(pendingOrigin, p)
+			}
+
+		case archivetar.TypeReg, archivetar.TypeRegA: //nolint:staticcheck
+			// Remove any existing entry to handle tar overwrite semantics.
+			removeExisting(w, p)
+			if err := addFile(w, p, hdr, tr); err != nil {
+				return fmt.Errorf("tarconv: %s: %w", p, err)
+			}
+			pending = replayPending(w, pending)
+
+		case archivetar.TypeSymlink:
+			removeExisting(w, p)
+			if err := addSymlink(w, p, hdr); err != nil {
+				return fmt.Errorf("tarconv: %s: %w", p, err)
+			}
+			pending = replayPending(w, pending)
+
+		case archivetar.TypeLink:
+			oldname := cleanTarPath(hdr.Linkname)
+			// Same overwrite semantics as the other entry types. A link that
+			// names itself must not delete its own target.
+			if oldname != p {
+				removeExisting(w, p)
+			}
+			err := w.Link(oldname, p)
+			if err == nil {
+				if err := applyMetadata(w, p, hdr); err != nil {
+					return fmt.Errorf("tarconv: %s metadata: %w", p, err)
+				}
+				pending = replayPending(w, pending)
+			} else if isNotExist(err) {
+				// Target not seen yet: keep a copy of the header and retry later.
+				pending = append(pending, pendingLink{newname: p, oldname: oldname, hdr: *hdr})
+			} else {
+				return fmt.Errorf("tarconv: hardlink %s→%s: %w", p, oldname, err)
+			}
+
+		case archivetar.TypeChar, archivetar.TypeBlock:
+			removeExisting(w, p)
+			if err := addDevice(w, p, hdr); err != nil {
+				return fmt.Errorf("tarconv: %s: %w", p, err)
+			}
+			pending = replayPending(w, pending)
+
+		case archivetar.TypeFifo:
+			removeExisting(w, p)
+			if err := addFifo(w, p, hdr); err != nil {
+				return fmt.Errorf("tarconv: %s: %w", p, err)
+			}
+			pending = replayPending(w, pending)
+
+		case archivetar.TypeXGlobalHeader:
+			// archive/tar merges PAX global headers into subsequent entries automatically.
+
+		default:
+			// Skip unrecognised entry types so future tar extensions don't break consumers.
+		}
+	}
+
+	// Drain the remainder of the underlying stream to EOF. Tar archives have
+	// end-of-archive padding (two 512-byte zero blocks) and callers may wrap r
+	// in a pipe or network stream that requires the reader side to be fully
+	// consumed before the writer side can detect a clean close.
+	_, _ = io.Copy(io.Discard, r)
+
+	if len(pending) > 0 {
+		return fmt.Errorf("tarconv: unresolved hard link %q → %q (target never appeared)",
+			pending[0].newname, pending[0].oldname)
+	}
+	return nil
+}
+
+// --- Entry creation helpers ---
+
+// addDir creates directory p with the mode from hdr and then applies the
+// remaining header metadata. A "duplicate path" error for a path that already
+// is a directory is treated as a metadata update rather than a failure.
+func addDir(w *erofs.Writer, p string, hdr *archivetar.Header) error {
+	if err := w.Mkdir(p, tarModeToGoMode(hdr.Mode)); err != nil {
+		// Tar archives commonly emit directory entries multiple times (once
+		// implicitly when a child is created, once explicitly with metadata).
+		// If the path already exists as a directory treat it as a metadata update.
+		if isDuplicatePath(err) {
+			if info, serr := w.Stat(p); serr == nil && info.IsDir() {
+				return applyMetadata(w, p, hdr)
+			}
+		}
+		return err
+	}
+	return applyMetadata(w, p, hdr)
+}
+
+// addFile creates regular file p, streams the current tar entry's payload from
+// tr into it, sets mode and ownership on the open handle, closes it, and then
+// applies the full header metadata by path.
+func addFile(w *erofs.Writer, p string, hdr *archivetar.Header, tr *archivetar.Reader) error {
+	f, err := w.Create(p)
+	if err != nil {
+		return err
+	}
+	// On any failure below the handle is closed best-effort; the first error
+	// is the one reported.
+	if _, err := io.Copy(f, tr); err != nil {
+		_ = f.Close()
+		return fmt.Errorf("copy data: %w", err)
+	}
+	if err := f.Chmod(tarModeToGoMode(hdr.Mode)); err != nil {
+		_ = f.Close()
+		return err
+	}
+	if err := f.Chown(hdr.Uid, hdr.Gid); err != nil {
+		_ = f.Close()
+		return err
+	}
+	if err := f.Close(); err != nil {
+		return err
+	}
+	return applyMetadata(w, p, hdr)
+}
+
+// addSymlink creates symlink p pointing at hdr.Linkname (passed through
+// unmodified) and applies the header metadata.
+func addSymlink(w *erofs.Writer, p string, hdr *archivetar.Header) error {
+	if err := w.Symlink(hdr.Linkname, p); err != nil {
+		return err
+	}
+	return applyMetadata(w, p, hdr)
+}
+
+// addDevice creates a character or block device node at p, chosen by
+// hdr.Typeflag, with the permission bits and major/minor numbers from hdr.
+func addDevice(w *erofs.Writer, p string, hdr *archivetar.Header) error {
+	typeBits := sifBlkdev
+	if hdr.Typeflag == archivetar.TypeChar {
+		typeBits = sifChrdev
+	}
+	// Only the permission bits go into the Mknod mode; applyMetadata sets the
+	// full mode (including setuid/setgid/sticky) afterwards.
+	mode := typeBits | uint16(tarModeToGoMode(hdr.Mode).Perm())
+	if err := w.Mknod(p, mode, mkdev(hdr.Devmajor, hdr.Devminor)); err != nil {
+		return err
+	}
+	return applyMetadata(w, p, hdr)
+}
+
+// addFifo creates a FIFO at p with the permission bits from hdr and applies
+// the header metadata.
+func addFifo(w *erofs.Writer, p string, hdr *archivetar.Header) error {
+	mode := sifFifo | uint16(tarModeToGoMode(hdr.Mode).Perm())
+	if err := w.Mknod(p, mode, 0); err != nil {
+		return err
+	}
+	return applyMetadata(w, p, hdr)
+}
+
+// emitWhiteout creates an overlayfs whiteout device (char 0:0, mode 0) at
+// target, used by the default whiteout convert mode.
+func emitWhiteout(w *erofs.Writer, target string, hdr *archivetar.Header) error {
+	// Whatever currently lives at target is replaced by the whiteout node.
+	removeExisting(w, target)
+	err := w.Mknod(target, sifChrdev, 0)
+	if err == nil {
+		// The whiteout carries the mtime of the tar entry that requested it.
+		err = w.Chtimes(target, hdr.ModTime, hdr.ModTime)
+	}
+	return err
+}
+
+// setOpaqueXattr marks dir as opaque by setting trusted.overlay.opaque=y; it
+// backs the default convert mode's handling of .wh..wh..opq entries. When dir
+// is missing, a 0755 placeholder is created first so the xattr has somewhere
+// to live; a later TypeDir entry for the same path updates its metadata.
+func setOpaqueXattr(w *erofs.Writer, dir string, hdr *archivetar.Header) error {
+	_, statErr := w.Stat(dir)
+	if errors.Is(statErr, fs.ErrNotExist) {
+		if err := w.Mkdir(dir, 0o755); err != nil {
+			return err
+		}
+		// Best effort: the placeholder's timestamp is not essential.
+		_ = w.Chtimes(dir, hdr.ModTime, hdr.ModTime)
+	}
+	return w.Setxattr(dir, overlayOpaqueXattr, "y")
+}
+
+// removeChildren empties dir in w: every entry below it is removed,
+// subdirectories recursively, while dir itself stays in place. WithMerge uses
+// it for opaque directories. A missing dir, or one whose handle cannot list
+// entries, is treated as already empty.
+func removeChildren(w *erofs.Writer, dir string) error {
+	handle, err := w.Open(dir)
+	if errors.Is(err, fs.ErrNotExist) {
+		return nil
+	}
+	if err != nil {
+		return err
+	}
+	defer handle.Close()
+
+	lister, ok := handle.(fs.ReadDirFile)
+	if !ok {
+		return nil
+	}
+	entries, err := lister.ReadDir(-1)
+	if err != nil {
+		return err
+	}
+	for _, ent := range entries {
+		full := path.Join(dir, ent.Name())
+		var rmErr error
+		if ent.IsDir() {
+			rmErr = removeAll(w, full)
+		} else if rmErr = w.Remove(full); errors.Is(rmErr, fs.ErrNotExist) {
+			// Already gone: nothing left to do for this entry.
+			rmErr = nil
+		}
+		if rmErr != nil {
+			return rmErr
+		}
+	}
+	return nil
+}
+
+// removeAll recursively removes p and all its descendants from w.
+func removeAll(w *erofs.Writer, p string) error {
+	info, err := w.Stat(p)
+	switch {
+	case errors.Is(err, fs.ErrNotExist):
+		// Nothing to remove.
+		return nil
+	case err != nil:
+		return err
+	}
+	// Directories are emptied first, then removed like any other entry.
+	if info.IsDir() {
+		if err := removeChildren(w, p); err != nil {
+			return err
+		}
+	}
+	if err := w.Remove(p); err != nil && !errors.Is(err, fs.ErrNotExist) {
+		return err
+	}
+	return nil
+}
+
+// removeExisting clears whatever is at p, discarding any error. Callers use it
+// right before re-creating p so that a later tar entry replaces an earlier one.
+func removeExisting(w *erofs.Writer, p string) {
+	_ = removeAll(w, p)
+}
+
+// applyMetadata copies hdr's metadata onto path p, in this order: owner
+// (uid/gid), full mode including setuid/setgid/sticky, modification time
+// (skipped when hdr.ModTime is the zero time), and every SCHILY.xattr.* PAX
+// record as an extended attribute. The first failing call's error is returned.
+func applyMetadata(w *erofs.Writer, p string, hdr *archivetar.Header) error {
+	if err := w.Chown(p, hdr.Uid, hdr.Gid); err != nil {
+		return err
+	}
+	if err := w.Chmod(p, tarModeToGoMode(hdr.Mode)); err != nil {
+		return err
+	}
+	if mtime := hdr.ModTime; !mtime.IsZero() {
+		if err := w.Chtimes(p, mtime, mtime); err != nil {
+			return err
+		}
+	}
+	for name, value := range extractXattrs(hdr) {
+		if err := w.Setxattr(p, name, value); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// extractXattrs collects the PAX records of hdr whose key starts with
+// "SCHILY.xattr." and returns them keyed by the name that follows the prefix.
+// The result is nil when hdr carries no such record.
+func extractXattrs(hdr *archivetar.Header) map[string]string {
+	var out map[string]string
+	for key, value := range hdr.PAXRecords {
+		name, isXattr := strings.CutPrefix(key, xattrPrefix)
+		if !isXattr {
+			continue
+		}
+		// Allocate lazily so headers without xattrs cost nothing.
+		if out == nil {
+			out = make(map[string]string)
+		}
+		out[name] = value
+	}
+	return out
+}
+
+// replayPending tries to resolve queued hard links. Repeats until no progress
+// is made to handle chains of pending links.
+func replayPending(w *erofs.Writer, pending []pendingLink) []pendingLink {
+	for {
+		var remaining []pendingLink
+		progress := false
+		for _, pl := range pending {
+			if err := w.Link(pl.oldname, pl.newname); err == nil {
+				// NOTE(review): a metadata failure is dropped here because this
+				// function has no error return — confirm that is intended.
+				_ = applyMetadata(w, pl.newname, &pl.hdr)
+				progress = true
+			} else {
+				// Any Link failure keeps the entry queued for a later pass.
+				remaining = append(remaining, pl)
+			}
+		}
+		pending = remaining
+		// A pass that linked nothing cannot unblock anything else.
+		if !progress {
+			break
+		}
+	}
+	return pending
+}
+
+// isNotExist reports whether err indicates a path does not exist: either it
+// wraps fs.ErrNotExist or its message contains "not found". A nil error
+// yields false.
+func isNotExist(err error) bool {
+	if err == nil {
+		return false
+	}
+	return errors.Is(err, fs.ErrNotExist) || strings.Contains(err.Error(), "not found")
+}
+
+// isDuplicatePath reports whether err is the "duplicate path" error from
+// erofs.Writer, recognised by its message text. A nil error yields false.
+func isDuplicatePath(err error) bool {
+	return err != nil && strings.Contains(err.Error(), "duplicate path")
+}
+
+// tarModeToGoMode converts a tar header Mode (unix mode bits) to fs.FileMode,
+// correctly translating the special bits (setuid/setgid/sticky). Only the
+// permission bits (0o777) and those three special bits are carried over; any
+// other bits in mode are ignored.
+func tarModeToGoMode(mode int64) fs.FileMode {
+	m := fs.FileMode(mode & 0o777)
+	if mode&0o4000 != 0 {
+		m |= fs.ModeSetuid
+	}
+	if mode&0o2000 != 0 {
+		m |= fs.ModeSetgid
+	}
+	if mode&0o1000 != 0 {
+		m |= fs.ModeSticky
+	}
+	return m
+}
+
+// cleanTarPath converts a tar header name to a cleaned absolute path.
+// "" and "." map to "/"; any other name gets a leading "/" if it lacks one and
+// is then normalised with path.Clean.
+func cleanTarPath(name string) string {
+	if name == "." || name == "" {
+		return "/"
+	}
+	if name[0] != '/' {
+		name = "/" + name
+	}
+	return path.Clean(name)
+}
+
+// mkdev constructs a Linux device number from major and minor components:
+// the low 8 bits of minor occupy bits 0-7, major starts at bit 8, and the
+// remaining minor bits start at bit 20. The result is truncated to 32 bits.
+// NOTE(review): major is not masked, so a major above 0xfff would overlap the
+// upper minor bits — confirm inputs are bounded.
+func mkdev(major, minor int64) uint32 {
+	return uint32((major << 8) | (minor & 0xff) | ((minor & ^int64(0xff)) << 12))
+}
diff --git a/tarconv/bench_test.go b/tarconv/bench_test.go
new file mode 100644
index 0000000..1b2008e
--- /dev/null
+++ b/tarconv/bench_test.go
@@ -0,0 +1,384 @@
+package tarconv_test
+
+// Benchmarks for tarconv.Apply in its default (convert) and WithMerge modes,
+// with optional comparison against mkfs.erofs when it is present in PATH.
+//
+// Four synthetic workloads model real container image shapes. The figures are
+// what syntheticLayer produces for each layerSpec (file data only, excluding
+// tar headers):
+//
+//	Small  – ~150 entries, ~100KB data. Typical Alpine base layer.
+//	Medium – ~950 entries, ~3MB data. Python/Node package install layer.
+//	Large  – ~4400 entries, ~31MB data. Large app or source-tree layer.
+//	Huge   – ~460 entries, ~100MB data. Large binary files; exercises
+//	         raw I/O throughput where per-process spawn overhead is ~0%.
+//
+// Each workload runs:
+//
+//	BenchmarkConvert/<workload>     – tarconv.Apply (default convert mode)
+//	BenchmarkMerge/<workload>       – tarconv.Apply with WithMerge (2-layer scenario)
+//	BenchmarkMkfsConvert/<workload> – mkfs.erofs --tar=f (skipped if not in PATH)
+//
+// Run with (recommended for stable numbers):
+//
+//	go test ./tarconv/... -bench=. -benchtime=10s -count=3 -benchmem
+
+import (
+	"archive/tar"
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"testing"
+	"time"
+
+	erofs "github.com/erofs/go-erofs"
+	"github.com/containerd/continuity/tarconv"
+)
+
+// workload describes a set of synthetic tar entries to benchmark.
+type workload struct {
+	name    string            // sub-benchmark name passed to b.Run
+	entries func() []tarEntry // builds the entries on demand
+}
+
+// tarEntry is a single entry to write into a tar.
+type tarEntry struct {
+	hdr  tar.Header
+	data []byte // payload; empty for entries without content
+}
+
+// benchEpoch is the fixed modification time stamped on every synthetic entry.
+var benchEpoch = time.Unix(1700000000, 0)
+
+// smallWorkload simulates an Alpine-like base layer: 20 directories of five
+// 1KiB files each (~150 entries, ~100KB data).
+func smallWorkload() []tarEntry {
+	return syntheticLayer(
+		layerSpec{dirs: 20, filesPerDir: 5, fileSize: 1024, symlinks: 10, hardLinkFraction: 0.05},
+	)
+}
+
+// mediumWorkload simulates a package install layer: 50 directories of fifteen
+// 4KiB files each (~950 entries, ~3MB data).
+func mediumWorkload() []tarEntry {
+	return syntheticLayer(
+		layerSpec{dirs: 50, filesPerDir: 15, fileSize: 4096, symlinks: 50, hardLinkFraction: 0.1},
+	)
+}
+
+// largeWorkload simulates a source-tree or large-app layer: 100 directories of
+// forty 8KiB files each (~4400 entries, ~31MB data).
+func largeWorkload() []tarEntry {
+	return syntheticLayer(
+		layerSpec{dirs: 100, filesPerDir: 40, fileSize: 8192, symlinks: 100, hardLinkFraction: 0.05},
+	)
+}
+
+// hugeWorkload simulates a layer dominated by large binary files: 20
+// directories of twenty 256KiB files each (~460 entries, ~100MB data). This
+// eliminates per-process spawn overhead from the mkfs comparison and isolates
+// raw I/O throughput.
+func hugeWorkload() []tarEntry {
+	return syntheticLayer(
+		layerSpec{dirs: 20, filesPerDir: 20, fileSize: 256 * 1024, symlinks: 20, hardLinkFraction: 0.02},
+	)
+}
+
+// layerSpec parameterises syntheticLayer.
+type layerSpec struct {
+	dirs             int     // number of pkgNNNN/ directories
+	filesPerDir      int     // regular files created in each of them
+	fileSize         int     // payload size of every regular file, in bytes
+	symlinks         int     // number of symlinks under symlinks/
+	hardLinkFraction float64 // hard links to create, as a fraction of the regular file count
+}
+
+// syntheticLayer generates the tar entries for one synthetic layer described
+// by s, in this order: the root directory, a fixed skeleton of system
+// directories, s.dirs package directories each followed by its regular files,
+// a links/ directory with hard links to the first regular files, and a
+// symlinks/ directory followed by s.symlinks absolute symlinks. The links/
+// directory is emitted only when at least one hard link is, and the symlinks/
+// directory only when at least one regular file exists.
+func syntheticLayer(s layerSpec) []tarEntry {
+	// dirEntry builds a 0755 directory entry owned by id:id.
+	dirEntry := func(name string, id int) tarEntry {
+		return tarEntry{hdr: tar.Header{
+			Typeflag: tar.TypeDir, Name: name, Mode: 0o755,
+			Uid: id, Gid: id, ModTime: benchEpoch,
+		}}
+	}
+
+	var entries []tarEntry
+
+	// Root.
+	entries = append(entries, dirEntry("./", 0))
+
+	// Standard directory skeleton.
+	skeletonDirs := []string{"usr/", "usr/bin/", "usr/lib/", "usr/share/", "etc/", "var/", "var/log/", "tmp/"}
+	for _, d := range skeletonDirs {
+		entries = append(entries, dirEntry(d, 0))
+	}
+
+	// Generate payload data (reused across entries to avoid huge allocations).
+	fileData := make([]byte, s.fileSize)
+	for i := range fileData {
+		fileData[i] = byte(i % 251)
+	}
+
+	var regularFiles []string
+
+	for d := 0; d < s.dirs; d++ {
+		dirName := fmt.Sprintf("pkg%04d/", d)
+		entries = append(entries, dirEntry(dirName, 1000))
+
+		for f := 0; f < s.filesPerDir; f++ {
+			name := fmt.Sprintf("%sfile%04d.dat", dirName, f)
+			regularFiles = append(regularFiles, name)
+
+			var pax map[string]string
+			if f%10 == 0 {
+				// Occasionally add an xattr (capabilities).
+				pax = map[string]string{"SCHILY.xattr.security.capability": "\x01\x00\x00\x02\x00 \x00\x00"}
+			}
+
+			entries = append(entries, tarEntry{
+				hdr: tar.Header{
+					Typeflag:   tar.TypeReg,
+					Name:       name,
+					Size:       int64(s.fileSize),
+					Mode:       0o644,
+					Uid:        1000,
+					Gid:        1000,
+					ModTime:    benchEpoch,
+					PAXRecords: pax,
+				},
+				data: fileData,
+			})
+		}
+	}
+
+	// Add hard links to the first hlCount regular files.
+	hlCount := int(float64(len(regularFiles)) * s.hardLinkFraction)
+	for i := 0; i < hlCount && i < len(regularFiles); i++ {
+		// Ensure the links/ directory exists (add it once, before the first link).
+		if i == 0 {
+			entries = append(entries, dirEntry("links/", 0))
+		}
+		entries = append(entries, tarEntry{hdr: tar.Header{
+			Typeflag: tar.TypeLink, Name: fmt.Sprintf("links/hardlink%04d", i), Linkname: regularFiles[i],
+			Uid: 0, Gid: 0, ModTime: benchEpoch,
+		}})
+	}
+
+	// Add symlinks, preceded by their directory. Targets cycle through the
+	// regular files and are written as absolute paths.
+	if len(regularFiles) > 0 {
+		entries = append(entries, dirEntry("symlinks/", 0))
+		for i := 0; i < s.symlinks; i++ {
+			target := regularFiles[i%len(regularFiles)]
+			entries = append(entries, tarEntry{hdr: tar.Header{
+				Typeflag: tar.TypeSymlink,
+				Name:     fmt.Sprintf("symlinks/link%04d", i),
+				Linkname: "/" + target,
+				Mode:     0o777,
+				ModTime:  benchEpoch,
+			}})
+		}
+	}
+
+	return entries
+}
+
+// buildTarBytes serialises entries to an in-memory tar.
+func buildTarBytes(t testing.TB, entries []tarEntry) []byte {
+	t.Helper()
+	var out bytes.Buffer
+	tw := tar.NewWriter(&out)
+	for _, e := range entries {
+		hdr := e.hdr // copy so we don't mutate
+		if err := tw.WriteHeader(&hdr); err != nil {
+			t.Fatalf("WriteHeader %s: %v", e.hdr.Name, err)
+		}
+		if len(e.data) > 0 {
+			if _, err := tw.Write(e.data); err != nil {
+				t.Fatalf("Write %s: %v", e.hdr.Name, err)
+			}
+		}
+	}
+	// Close writes the end-of-archive trailer; any failure aborts the test.
+	if err := tw.Close(); err != nil {
+		t.Fatalf("tar Close: %v", err)
+	}
+	return out.Bytes()
+}
+
+// discardWriter is an io.WriteSeeker that discards output but tracks position.
+type discardWriter struct{ pos int64 }
+
+// Write drops p and advances the tracked position by len(p).
+func (d *discardWriter) Write(p []byte) (int, error) { d.pos += int64(len(p)); return len(p), nil }
+
+// Seek updates the tracked position and returns it. Nothing is stored, so the
+// end of the stream is unknown: io.SeekEnd simply sets the position to offset.
+func (d *discardWriter) Seek(offset int64, whence int) (int64, error) {
+	switch whence {
+	case io.SeekStart:
+		d.pos = offset
+	case io.SeekCurrent:
+		d.pos += offset
+	case io.SeekEnd:
+		// Not used by writer for the final seek.
+		d.pos = offset
+	}
+	return d.pos, nil
+}
+
+// --- Benchmarks ---
+
+// workloads lists the synthetic layers every benchmark iterates over.
+var workloads = []workload{
+	{"Small", smallWorkload},
+	{"Medium", mediumWorkload},
+	{"Large", largeWorkload},
+	{"Huge", hugeWorkload},
+}
+
+// makeMergeLayer2 builds a second tar layer over the given base entries: a
+// layer2/ directory holding len(base)/5 new 512-byte files, plus a whiteout
+// for each base entry at an index i with i%20 == 1 that is a regular file.
+func makeMergeLayer2(b testing.TB, base []tarEntry) []byte {
+	b.Helper()
+	var layer2 []tarEntry
+	layer2 = append(layer2, tarEntry{hdr: tar.Header{
+		Typeflag: tar.TypeDir, Name: "layer2/", Mode: 0o755, ModTime: benchEpoch,
+	}})
+	fileData := make([]byte, 512)
+	for i := 0; i < len(base)/5; i++ {
+		layer2 = append(layer2, tarEntry{
+			hdr: tar.Header{
+				Typeflag: tar.TypeReg,
+				Name:     fmt.Sprintf("layer2/newfile%04d", i),
+				Size:     int64(len(fileData)), Mode: 0o644, ModTime: benchEpoch,
+			},
+			data: fileData,
+		})
+	}
+	for i, e := range base {
+		if i%20 == 1 && e.hdr.Typeflag == tar.TypeReg {
+			// Construct whiteout path: same directory, .wh. prefix on filename.
+			p := e.hdr.Name
+			// Scan back to the last '/'; slash ends at -1 when there is none,
+			// which leaves dir empty and name equal to p.
+			slash := len(p) - 1
+			for slash >= 0 && p[slash] != '/' {
+				slash--
+			}
+			dir, name := p[:slash+1], p[slash+1:]
+			layer2 = append(layer2, tarEntry{hdr: tar.Header{
+				Typeflag: tar.TypeReg,
+				Name:     dir + ".wh." + name,
+				ModTime:  benchEpoch,
+			}})
+		}
+	}
+	return buildTarBytes(b, layer2)
+}
+
+// BenchmarkConvert benchmarks tarconv.Apply across all workload sizes.
+// Reports throughput in MB/s of tar input processed (via b.SetBytes).
+func BenchmarkConvert(b *testing.B) {
+	for _, wl := range workloads {
+		wl := wl
+		b.Run(wl.name, func(b *testing.B) {
+			tarData := buildTarBytes(b, wl.entries())
+			b.SetBytes(int64(len(tarData)))
+			// Validate the image once before the timed loop.
+			if b.N > 0 {
+				dw := &buf{}
+				w := erofs.Create(dw)
+				if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil {
+					b.Fatalf("Convert (validation): %v", err)
+				}
+				if err := w.Close(); err != nil {
+					b.Fatalf("Close (validation): %v", err)
+				}
+				fsckErofsBytes(b, dw.b)
+			}
+			// Exclude tar construction and validation from the measurement.
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				// The timed loop writes to a discarding sink, so no image is kept.
+				dw := &discardWriter{}
+				w := erofs.Create(dw)
+				if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil {
+					b.Fatalf("Convert: %v", err)
+				}
+				if err := w.Close(); err != nil {
+					b.Fatalf("Close: %v", err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkMerge benchmarks tarconv.Apply(WithMerge) (two-layer merge).
+// Layer 1 is the base; layer 2 adds new files and whiteouts.
+func BenchmarkMerge(b *testing.B) {
+	for _, wl := range workloads {
+		wl := wl
+		b.Run(wl.name, func(b *testing.B) {
+			base := wl.entries()
+			layer1 := buildTarBytes(b, base)
+			layer2 := makeMergeLayer2(b, base)
+			// Throughput is reported over the combined size of both layers.
+			b.SetBytes(int64(len(layer1) + len(layer2)))
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				dw := &discardWriter{}
+				w := erofs.Create(dw)
+				// Both layers go into the same writer, base first.
+				if err := tarconv.Apply(w, bytes.NewReader(layer1), tarconv.WithMerge()); err != nil {
+					b.Fatalf("Apply(WithMerge) layer1: %v", err)
+				}
+				if err := tarconv.Apply(w, bytes.NewReader(layer2), tarconv.WithMerge()); err != nil {
+					b.Fatalf("Apply(WithMerge) layer2: %v", err)
+				}
+				if err := w.Close(); err != nil {
+					b.Fatalf("Close: %v", err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkMkfsConvert benchmarks mkfs.erofs --tar=f as a reference point.
+// Skipped if mkfs.erofs is not in PATH. Uses the same fixed timestamp as
+// other tests for fair comparison. Reports throughput in MB/s of tar input.
+//
+// Note: mkfs.erofs writes to a real file on disk and has fork/exec overhead
+// per iteration. The throughput figures will be lower than tarconv.Apply for
+// small inputs due to spawn cost, but converge as tar size grows.
+func BenchmarkMkfsConvert(b *testing.B) {
+	if _, err := exec.LookPath("mkfs.erofs"); err != nil {
+		b.Skip("mkfs.erofs not found in PATH")
+	}
+	for _, wl := range workloads {
+		wl := wl
+		b.Run(wl.name, func(b *testing.B) {
+			tarData := buildTarBytes(b, wl.entries())
+			// Reserve an output path; only the name is needed, so the handle
+			// is closed right away.
+			outFile, err := os.CreateTemp("", "mkfs-bench-*.erofs")
+			if err != nil {
+				b.Fatal(err)
+			}
+			outPath := outFile.Name()
+			_ = outFile.Close()
+			defer os.Remove(outPath)
+
+			b.SetBytes(int64(len(tarData)))
+			b.ResetTimer()
+
+			ctx := context.Background()
+			for i := 0; i < b.N; i++ {
+				if err := convertTarMkfs(ctx, b, tarData, outPath, nil); err != nil {
+					b.Fatalf("mkfs.erofs: %v", err)
+				}
+				// Remove the image so the next iteration starts without it.
+				_ = os.Remove(outPath)
+			}
+		})
+	}
+}
diff --git a/tarconv/compare_test.go b/tarconv/compare_test.go
new file mode 100644
index 0000000..ea39dab
--- /dev/null
+++ b/tarconv/compare_test.go
@@ -0,0 +1,1217 @@
+package tarconv_test
+
+// Image comparison tests.
+//
+// These tests build the same tar with both tarconv.Apply and mkfs.erofs, then
+// walk both images with the go-erofs reader and assert that every entry has
+// identical: type, permissions (rawMode), uid, gid, mtime, size, file
+// content, symlink target, rdev, and xattrs.
+//
+// Inode numbers (nid) and block layout are deliberately excluded: they are
+// implementation-specific and will legitimately differ.
+//
+// All builds use a fixed timestamp (-T / WithBuildTime) so mtime values are
+// deterministic.
+
+import (
+	"archive/tar"
+	"bytes"
+	"context"
+	"io"
+	"io/fs"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+
+	erofs "github.com/erofs/go-erofs"
+
+	"github.com/containerd/continuity/tarconv"
+)
+
+// fixedBuildTime is used for all comparison builds so mtime is deterministic.
+var fixedBuildTime = time.Unix(1700000000, 0)
+
+// fixedBuildTimeStr is the same instant as decimal Unix seconds, the form
+// appended to mkfs.erofs's -T flag.
+var fixedBuildTimeStr = "1700000000"
+
+// lstater is the interface for lstat on the erofs image.
+type lstater interface {
+	Lstat(name string) (fs.FileInfo, error)
+}
+
+// readLinker is the interface for reading symlink targets.
+type readLinker interface {
+	ReadLink(name string) (string, error)
+}
+
+// readDirer is the interface for reading directory contents.
+type readDirer interface {
+	ReadDir(name string) ([]fs.DirEntry, error)
+}
+
+// buildGoImage builds an EROFS image using tarconv.Apply (default convert-whiteouts mode).
+// The build time is set to fixedBuildTime so compact inodes match mkfs.erofs -T output.
+// Any failure aborts the test via t.Fatalf.
+func buildGoImage(t testing.TB, tarData []byte) []byte {
+	t.Helper()
+	out := &buf{}
+	w := erofs.Create(out,
+		erofs.WithBuildTime(uint64(fixedBuildTime.Unix()), 0),
+	)
+	if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil {
+		t.Fatalf("Convert: %v", err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatalf("Writer.Close: %v", err)
+	}
+	return out.b
+}
+
+// buildMkfsImage builds an EROFS image using mkfs.erofs and returns its bytes.
+// Skips the test if mkfs.erofs is not in PATH.
+// Uses -T (fixed build time), --aufs, --tar=f, --quiet and -Enoinline_data.
+func buildMkfsImage(t testing.TB, tarData []byte) []byte {
+	t.Helper()
+	if _, err := exec.LookPath("mkfs.erofs"); err != nil {
+		t.Skip("mkfs.erofs not found in PATH")
+	}
+
+	// Write the tar to a temp file and rewind it; the open file is handed to
+	// mkfs.erofs as its stdin below.
+	tarFile, err := os.CreateTemp("", "compare-*.tar")
+	if err != nil {
+		t.Fatalf("create tar temp: %v", err)
+	}
+	defer os.Remove(tarFile.Name())
+	if _, err := tarFile.Write(tarData); err != nil {
+		tarFile.Close()
+		t.Fatalf("write tar temp: %v", err)
+	}
+	if _, err := tarFile.Seek(0, io.SeekStart); err != nil {
+		tarFile.Close()
+		t.Fatalf("seek tar temp: %v", err)
+	}
+
+	// Reserve the output image path; only the name is needed.
+	outFile, err := os.CreateTemp("", "compare-*.erofs")
+	if err != nil {
+		tarFile.Close()
+		t.Fatalf("create img temp: %v", err)
+	}
+	outPath := outFile.Name()
+	outFile.Close()
+	defer os.Remove(outPath)
+
+	// -T sets the EROFS image build time (compact-inode threshold).
+	// Do NOT pass --all-time: that would override per-entry mtimes from the tar,
+	// causing all entries to show the build time instead of their own mtime.
+	args := []string{
+		"--tar=f",
+		"--aufs",
+		"--quiet",
+		"-Enoinline_data",
+		"-T" + fixedBuildTimeStr,
+		outPath,
+	}
+	cmd := exec.CommandContext(context.Background(), "mkfs.erofs", args...)
+	cmd.Stdin = tarFile
+	out, err := cmd.CombinedOutput()
+	tarFile.Close()
+	if err != nil {
+		t.Fatalf("mkfs.erofs: %v\n%s", err, out)
+	}
+
+	imgBytes, err := os.ReadFile(outPath)
+	if err != nil {
+		t.Fatalf("read mkfs image: %v", err)
+	}
+	return imgBytes
+}
+
+// fsckImageBytes runs fsck.erofs on data (written to a temp file first).
+// It returns without checking anything if fsck.erofs is not in PATH. Failures
+// are reported with t.Errorf (not Fatal) so other checks can still run; label
+// prefixes every message to identify the image.
+func fsckImageBytes(t testing.TB, label string, data []byte) {
+	t.Helper()
+	if _, err := exec.LookPath("fsck.erofs"); err != nil {
+		return
+	}
+	f, err := os.CreateTemp("", "fsck-*.erofs")
+	if err != nil {
+		t.Errorf("%s fsck: create temp: %v", label, err)
+		return
+	}
+	defer os.Remove(f.Name())
+	if _, err := f.Write(data); err != nil {
+		f.Close()
+		t.Errorf("%s fsck: write: %v", label, err)
+		return
+	}
+	f.Close()
+	out, err := exec.Command("fsck.erofs", f.Name()).CombinedOutput()
+	if err != nil {
+		t.Errorf("%s fsck.erofs FAILED: %v\n%s", label, err, out)
+	}
+}
+
+// imageEntry is a fully normalized representation of one filesystem entry,
+// collected via the go-erofs reader. Every field that can be compared between
+// two images derived from the same tar source is stored here.
+type imageEntry struct { + path string + rawMode uint16 // Unix mode bits (type + perms + special) from erofs.Stat + uid uint32 + gid uint32 + mtime uint64 + mtimeNs uint32 + size int64 + rdev uint32 + nlink int // exact nlink value — must match between images + symlink string // target for symlinks, empty otherwise + xattrs map[string]string + // dirChildren holds the ordered list of child names as returned by ReadDir. + // Order matters: EROFS stores directory entries sorted, so both images + // should report identical order for the same directory contents. + dirChildren []string + // content holds the full file data for regular files. Always read in full + // regardless of size so content correctness is always verified. + content []byte +} + +// collectImage walks an EROFS image opened with erofs.Open and returns a +// sorted slice of imageEntry for every path including the root ("."). +// +// It uses Lstat (not Stat) for every entry so symlinks are captured as-is. +// It reads every regular file in full so content is always compared. +// It records the ReadDir order of every directory so ordering is compared. +func collectImage(t testing.TB, img fs.FS, label string) []imageEntry { + t.Helper() + + ls, ok := img.(lstater) + if !ok { + t.Fatalf("%s: image does not implement Lstat", label) + } + rl, _ := img.(readLinker) + rd, ok := img.(readDirer) + if !ok { + t.Fatalf("%s: image does not implement ReadDir", label) + } + + var entries []imageEntry + + // Collect one entry. path is the fs.FS path (relative, no leading slash). + // "." refers to the root directory. + collect := func(p string) { + var fi fs.FileInfo + var err error + if p == "." 
{ + fi, err = ls.Lstat(".") + } else { + fi, err = ls.Lstat(p) + } + if err != nil { + t.Errorf("%s Lstat %q: %v", label, p, err) + return + } + st, ok := fi.Sys().(*erofs.Stat) + if !ok { + t.Errorf("%s %q: Sys() is %T not *erofs.Stat", label, p, fi.Sys()) + return + } + + e := imageEntry{ + path: p, + rawMode: goModeToRaw(st.Mode), + uid: st.UID, + gid: st.GID, + mtime: st.Mtime, + mtimeNs: st.MtimeNs, + size: fi.Size(), + rdev: st.Rdev, + nlink: st.Nlink, + xattrs: st.Xattrs, + } + + if fi.Mode()&fs.ModeSymlink != 0 && rl != nil { + target, err := rl.ReadLink(p) + if err != nil { + t.Errorf("%s ReadLink %q: %v", label, p, err) + } + e.symlink = target + } + + if fi.Mode().IsDir() { + des, err := rd.ReadDir(p) + if err != nil { + t.Errorf("%s ReadDir %q: %v", label, p, err) + } else { + e.dirChildren = make([]string, len(des)) + for i, de := range des { + e.dirChildren[i] = de.Name() + } + } + } + + if fi.Mode().IsRegular() && fi.Size() > 0 { + f, err := img.Open(p) + if err != nil { + t.Errorf("%s Open %q: %v", label, p, err) + } else { + data, err := io.ReadAll(f) + f.Close() + if err != nil { + t.Errorf("%s ReadAll %q: %v", label, p, err) + } else { + e.content = data + } + } + } + + entries = append(entries, e) + } + + // Walk using fs.WalkDir which uses Stat (follows symlinks for type), but we + // want to visit symlinks as entries too. Use a manual recursive walk that + // calls Lstat directly so we see symlinks as-is. + var walk func(dir string) + walk = func(dir string) { + des, err := rd.ReadDir(dir) + if err != nil { + t.Errorf("%s ReadDir %q: %v", label, dir, err) + return + } + for _, de := range des { + var p string + if dir == "." { + p = de.Name() + } else { + p = dir + "/" + de.Name() + } + collect(p) + // Recurse into real directories only (not symlinks to dirs). + if de.Type().IsDir() { + walk(p) + } + } + } + + // Include the root itself. 
+ collect(".") + walk(".") + + sort.Slice(entries, func(i, j int) bool { return entries[i].path < entries[j].path }) + return entries +} + +// goModeToRaw converts a Go fs.FileMode (as returned by erofs.Stat.Mode, which +// uses EroFSModeToGoFileMode and correctly carries ModeSetuid/Sticky/etc) back +// to Unix mode bits for comparison. This is the inverse of EroFSModeToGoFileMode. +func goModeToRaw(m fs.FileMode) uint16 { + var raw uint16 + raw |= uint16(m.Perm()) + if m&fs.ModeSetuid != 0 { + raw |= 0o4000 + } + if m&fs.ModeSetgid != 0 { + raw |= 0o2000 + } + if m&fs.ModeSticky != 0 { + raw |= 0o1000 + } + switch m.Type() { + case fs.ModeDir: + raw |= 0o040000 + case fs.ModeSymlink: + raw |= 0o120000 + case fs.ModeDevice | fs.ModeCharDevice: + raw |= 0o020000 + case fs.ModeDevice: + raw |= 0o060000 + case fs.ModeNamedPipe: + raw |= 0o010000 + case fs.ModeSocket: + raw |= 0o140000 + default: // regular file + raw |= 0o100000 + } + return raw +} + +// isDeviceType returns true if rawMode describes a character or block device. +func isDeviceType(rawMode uint16) bool { + typ := rawMode & 0xF000 + return typ == 0o020000 || typ == 0o060000 +} + +// compareImages asserts that two EROFS images contain exactly the same +// filesystem: same paths, same metadata on every entry, same file content, +// same directory child order, same xattrs. Differences are reported via +// t.Errorf so all mismatches are collected before the test fails. +func compareImages(t testing.TB, goImg, mkfsImg []byte) { + t.Helper() + + goFS, err := erofs.Open(bytes.NewReader(goImg)) + if err != nil { + t.Fatalf("open go image: %v", err) + } + mkFS, err := erofs.Open(bytes.NewReader(mkfsImg)) + if err != nil { + t.Fatalf("open mkfs image: %v", err) + } + + goEntries := collectImage(t, goFS, "go") + mkEntries := collectImage(t, mkFS, "mkfs") + + // Build path-keyed maps for fast lookup. 
+ goMap := make(map[string]imageEntry, len(goEntries)) + for _, e := range goEntries { + goMap[e.path] = e + } + mkMap := make(map[string]imageEntry, len(mkEntries)) + for _, e := range mkEntries { + mkMap[e.path] = e + } + + // Every path in go image must exist in mkfs image with identical fields. + for _, ge := range goEntries { + me, ok := mkMap[ge.path] + if !ok { + t.Errorf("path %q: in go image but missing from mkfs image", ge.path) + continue + } + diffEntries(t, ge.path, ge, me) + } + + // Every path in mkfs image must exist in go image. + for _, me := range mkEntries { + if _, ok := goMap[me.path]; !ok { + t.Errorf("path %q: in mkfs image but missing from go image", me.path) + } + } +} + +// diffEntries reports every difference between two imageEntry values for the +// same path. All fields are compared exactly unless noted. +func diffEntries(t testing.TB, p string, got, want imageEntry) { + t.Helper() + + // Mode: compare full unix bits (type + perms + special bits). + if got.rawMode != want.rawMode { + t.Errorf("%s: mode: go=0o%o mkfs=0o%o", p, got.rawMode, want.rawMode) + } + if got.uid != want.uid { + t.Errorf("%s: uid: go=%d mkfs=%d", p, got.uid, want.uid) + } + if got.gid != want.gid { + t.Errorf("%s: gid: go=%d mkfs=%d", p, got.gid, want.gid) + } + if got.mtime != want.mtime { + t.Errorf("%s: mtime: go=%d mkfs=%d", p, got.mtime, want.mtime) + } + // mtimeNs: compare only when both are non-zero; mkfs.erofs may not + // preserve sub-second precision in all versions. + if got.mtimeNs != 0 && want.mtimeNs != 0 && got.mtimeNs != want.mtimeNs { + t.Errorf("%s: mtime_ns: go=%d mkfs=%d", p, got.mtimeNs, want.mtimeNs) + } + if got.size != want.size { + t.Errorf("%s: size: go=%d mkfs=%d", p, got.size, want.size) + } + if got.symlink != want.symlink { + t.Errorf("%s: symlink target: go=%q mkfs=%q", p, got.symlink, want.symlink) + } + // rdev: compare for device nodes only. 
+ if isDeviceType(got.rawMode) && got.rdev != want.rdev { + t.Errorf("%s: rdev: go=%d mkfs=%d", p, got.rdev, want.rdev) + } + // nlink: exact comparison. Both images are built from the same tar so every + // hard-link group must have the same nlink count. + if got.nlink != want.nlink { + t.Errorf("%s: nlink: go=%d mkfs=%d", p, got.nlink, want.nlink) + } + // xattrs: exact match — same keys, same values, no extras on either side. + for k, gv := range got.xattrs { + mv, ok := want.xattrs[k] + if !ok { + t.Errorf("%s: xattr %q in go image, absent in mkfs image", p, k) + } else if gv != mv { + t.Errorf("%s: xattr %q: go=%q mkfs=%q", p, k, gv, mv) + } + } + for k := range want.xattrs { + if _, ok := got.xattrs[k]; !ok { + t.Errorf("%s: xattr %q in mkfs image, absent in go image", p, k) + } + } + // Directory child order: EROFS always stores entries lexicographically, so + // both images must report the same order. + if len(got.dirChildren) != len(want.dirChildren) { + t.Errorf("%s: dir child count: go=%d mkfs=%d (%v vs %v)", + p, len(got.dirChildren), len(want.dirChildren), got.dirChildren, want.dirChildren) + } else { + for i := range got.dirChildren { + if got.dirChildren[i] != want.dirChildren[i] { + t.Errorf("%s: dir child[%d]: go=%q mkfs=%q", p, i, got.dirChildren[i], want.dirChildren[i]) + } + } + } + // File content: exact byte comparison. 
+ if !bytes.Equal(got.content, want.content) { + n := 64 + if len(got.content) < n { + n = len(got.content) + } + wn := n + if len(want.content) < wn { + wn = len(want.content) + } + t.Errorf("%s: content mismatch (len go=%d mkfs=%d); go[:%d]=%x mkfs[:%d]=%x", + p, len(got.content), len(want.content), n, got.content[:n], wn, want.content[:wn]) + } +} + +// buildComparisonTar creates a comprehensive deterministic tar that exercises +// every path through tar.Convert and every erofs.Writer call it makes: +// +// - Directories with varied uid/gid/mtime/mode including sticky bits +// (forces Mkdir + Chown + Chtimes + Chmod on dirs) +// - Regular files with varied uid/gid/mtime/mode including setuid/setgid +// (forces Create + Chown + Chtimes + Chmod on files) +// - Regular files with PAX xattrs on multiple entry types +// (forces Setxattr on files, dirs, symlinks, and device nodes) +// - A 3-way hard-link group (canonical + 2 aliases, nlink=3) +// (forces Link x2 and exact nlink=3 match) +// - A 2-way hard-link group in a different directory (cross-dir links) +// - Symlinks with non-root uid/gid and non-default mtime +// (forces Chown + Chtimes on symlinks) +// - An opaque directory (.wh..wh..opq) which must appear in both images +// as trusted.overlay.opaque=y + trusted.overlay.origin="" +// - A plain whiteout (.wh.) which must appear as a char device 0/0 +// - Char device (major/minor), block device (major/minor), FIFO +// (forces Mknod for all three types) +// - A multi-block file whose content spans more than one EROFS block +// - An empty regular file +// +// Every directory has an explicit entry in the tar so root metadata is +// deterministic across both converters. +func buildComparisonTar(t testing.TB) []byte { + t.Helper() + + // Use a single timestamp for all entries. mkfs.erofs 1.9 applies its -T + // build time to every entry regardless of per-entry tar mtime, so a + // deterministic comparison requires matching timestamps throughout. 
+ // Chown/Chmod/Setxattr are verified via uid/gid/mode/xattr fields, not mtime. + ts := fixedBuildTime // 1700000000 + + return makeTar(t, func(tw *tar.Writer) { + must := func(err error) { + t.Helper() + if err != nil { + t.Fatalf("write tar: %v", err) + } + } + hdr := func(h tar.Header) { must(tw.WriteHeader(&h)) } + data := func(b []byte) { _, err := tw.Write(b); must(err) } + + // --- Root and top-level directories --- + // Root: uid=0 gid=0, ts + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // bin/: uid=0 gid=0 — standard mode + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "bin/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // etc/: uid=0 gid=0 — different mtime to exercise Chtimes + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // usr/ and usr/bin/ owned by uid=0 gid=0 + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "usr/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "usr/bin/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // lib/ and lib/shared/: uid=0, gid=0, different timestamps + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "lib/shared/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // home/: uid=0, gid=0, ts + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "home/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + // home/user/: non-root uid/gid, restricted perms — exercises Chown on dir + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "home/user/", Mode: 0o700, Uid: 1000, Gid: 1000, ModTime: ts}) + + // tmp/: sticky bit (0o1777) — exercises Chmod for special bits on dir + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "tmp/", Mode: 0o1777, Uid: 0, Gid: 0, ModTime: ts}) + + // var/ and var/log/: gid=4 (adm), exercises Chown with non-standard gid + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "var/", Mode: 0o755, Uid: 0, 
Gid: 0, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "var/log/", Mode: 0o755, Uid: 0, Gid: 4, ModTime: ts}) + + // dev/: uid=0, gid=0 — must be explicit so metadata matches mkfs.erofs + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "dev/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // --- Regular files: varied uid/gid/mtime/mode --- + + // etc/hostname: uid=0, gid=0, ts, 0o644 + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "etc/hostname", Size: 8, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + data([]byte("myhost\n\n")) + + // etc/shadow: uid=0, gid=42 (shadow), ts, 0o640 — Chown with non-root gid + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "etc/shadow", Size: 5, Mode: 0o640, Uid: 0, Gid: 42, ModTime: ts}) + data([]byte("root:")) + + // etc/motd: empty file, different mtime + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "etc/motd", Size: 0, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + + // bin/sudo: setuid (0o4755) — exercises Chmod for setuid bit + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "bin/sudo", Size: 4, Mode: 0o4755, Uid: 0, Gid: 0, ModTime: ts}) + data([]byte("sudo")) + + // bin/wall: setgid (0o2755), gid=5 (tty) — exercises Chmod for setgid + Chown + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "bin/wall", Size: 4, Mode: 0o2755, Uid: 0, Gid: 5, ModTime: ts}) + data([]byte("wall")) + + // bin/ping: capability xattr + ts + uid=0 gid=0 — exercises Setxattr on regular file + hdr(tar.Header{ + Typeflag: tar.TypeReg, Name: "bin/ping", Size: 4, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts, + PAXRecords: map[string]string{ + "SCHILY.xattr.security.capability": "\x01\x00\x00\x02\x00\x20\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "SCHILY.xattr.user.role": "network-tool", + }, + }) + data([]byte("ping")) + + // usr/bin/env: uid=0 gid=0 ts — plain executable + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "usr/bin/env", Size: 3, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + data([]byte("env")) + + // home/user/notes.txt: uid=1000 gid=1000 ts 
— non-root owner + Chtimes + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "home/user/notes.txt", Size: 5, Mode: 0o600, Uid: 1000, Gid: 1000, ModTime: ts}) + data([]byte("hello")) + + // home/user/bigfile: multi-block (>4096 bytes), uid=1000 gid=1000 + bigData := make([]byte, 3*4096+512) + for i := range bigData { bigData[i] = byte(i % 251) } + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "home/user/bigfile", Size: int64(len(bigData)), Mode: 0o600, Uid: 1000, Gid: 1000, ModTime: ts}) + data(bigData) + + // var/log/syslog: uid=0 gid=4 ts — Chown with adm gid + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "var/log/syslog", Size: 0, Mode: 0o640, Uid: 0, Gid: 4, ModTime: ts}) + + // --- Symlinks --- + + // bin/sh → /bin/busybox: uid=0 gid=0 ts + hdr(tar.Header{Typeflag: tar.TypeSymlink, Name: "bin/sh", Linkname: "/bin/busybox", Mode: 0o777, Uid: 0, Gid: 0, ModTime: ts}) + + // etc/localtime → /usr/share/zoneinfo/UTC: uid=0 gid=0 ts — Chtimes on symlink + hdr(tar.Header{Typeflag: tar.TypeSymlink, Name: "etc/localtime", Linkname: "/usr/share/zoneinfo/UTC", Mode: 0o777, Uid: 0, Gid: 0, ModTime: ts}) + + // home/user/link → ../usr/bin/env: non-root uid/gid — Chown on symlink + hdr(tar.Header{Typeflag: tar.TypeSymlink, Name: "home/user/myenv", Linkname: "../../usr/bin/env", Mode: 0o777, Uid: 1000, Gid: 1000, ModTime: ts}) + + // --- Hard links --- + + // 3-way hard-link group: lib/shared/data.bin (canonical) + 2 aliases. + // nlink must be exactly 3 in both images. 
+ hdr(tar.Header{Typeflag: tar.TypeReg, Name: "lib/shared/data.bin", Size: 8, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + data([]byte("sharedXX")) + hdr(tar.Header{Typeflag: tar.TypeLink, Name: "lib/shared/data.bin.1", Linkname: "lib/shared/data.bin", Uid: 0, Gid: 0, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeLink, Name: "lib/shared/data.bin.2", Linkname: "lib/shared/data.bin", Uid: 0, Gid: 0, ModTime: ts}) + + // 2-way cross-directory hard link: canonical in etc/, alias in var/log/ + // exercises Link across directory boundaries. + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "etc/group", Size: 6, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + data([]byte("root:x")) + hdr(tar.Header{Typeflag: tar.TypeLink, Name: "var/log/group.bak", Linkname: "etc/group", Uid: 0, Gid: 0, ModTime: ts}) + + // --- Opaque directory --- + // app/ is opaque: it contains .wh..wh..opq which signals that any lower-layer + // contents of app/ are hidden. In Convert mode this sets + // trusted.overlay.opaque=y and trusted.overlay.origin="" on app/. + // The directory also has a file so the image is non-trivial. + hdr(tar.Header{Typeflag: tar.TypeDir, Name: "app/", Mode: 0o755, Uid: 1000, Gid: 1000, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "app/.wh..wh..opq", Size: 0, Uid: 0, Gid: 0, ModTime: ts}) + hdr(tar.Header{Typeflag: tar.TypeReg, Name: "app/main", Size: 4, Mode: 0o755, Uid: 1000, Gid: 1000, ModTime: ts}) + data([]byte("main")) + + // --- Plain whiteout --- + // etc/.wh.removed-file converts to a char device 0/0 (mode 0) at etc/removed-file. 
+ hdr(tar.Header{Typeflag: tar.TypeReg, Name: "etc/.wh.removed-file", Size: 0, Uid: 0, Gid: 0, ModTime: ts}) + + // --- Device nodes (Mknod) --- + + // char device: /dev/null (1,3) — standard whiteout device + hdr(tar.Header{Typeflag: tar.TypeChar, Name: "dev/null", Mode: 0o666, Uid: 0, Gid: 0, Devmajor: 1, Devminor: 3, ModTime: ts}) + + // char device: /dev/zero (1,5) + hdr(tar.Header{Typeflag: tar.TypeChar, Name: "dev/zero", Mode: 0o666, Uid: 0, Gid: 0, Devmajor: 1, Devminor: 5, ModTime: ts}) + + // char device with non-root uid/gid and ts — exercises Chown+Chtimes on mknod + hdr(tar.Header{Typeflag: tar.TypeChar, Name: "dev/tty1", Mode: 0o620, Uid: 0, Gid: 5, Devmajor: 4, Devminor: 1, ModTime: ts}) + + // block device: /dev/sda (8,0) — exercises Mknod with block type + hdr(tar.Header{Typeflag: tar.TypeBlock, Name: "dev/sda", Mode: 0o660, Uid: 0, Gid: 6, Devmajor: 8, Devminor: 0, ModTime: ts}) + + // block device: /dev/sda1 (8,1) + hdr(tar.Header{Typeflag: tar.TypeBlock, Name: "dev/sda1", Mode: 0o660, Uid: 0, Gid: 6, Devmajor: 8, Devminor: 1, ModTime: ts}) + + // FIFO: uid=1000 gid=1000 — exercises Mknod for fifo + Chown + hdr(tar.Header{Typeflag: tar.TypeFifo, Name: "tmp/pipe", Mode: 0o600, Uid: 1000, Gid: 1000, ModTime: ts}) + + // Another FIFO with different permissions — confirms mode bits for fifo + hdr(tar.Header{Typeflag: tar.TypeFifo, Name: "tmp/ctrl", Mode: 0o640, Uid: 0, Gid: 1000, ModTime: ts}) + + // --- Directory with xattrs (SELinux label) --- + // var/log/ has an xattr, exercising Setxattr on a directory. + // We set it here by re-emitting var/log/ — the duplicate Mkdir is handled + // by the idempotent addDir path, and applyMetadata sets the xattr. 
+ hdr(tar.Header{ + Typeflag: tar.TypeDir, Name: "var/log/", Mode: 0o755, Uid: 0, Gid: 4, ModTime: ts, + PAXRecords: map[string]string{ + "SCHILY.xattr.security.selinux": "system_u:object_r:var_log_t:s0\x00", + }, + }) + }) +} + +// ---------------------------------------------------------------------------- +// Comparison tests +// ---------------------------------------------------------------------------- + +// TestCompareWithMkfs builds the same tar with both tar.Convert and mkfs.erofs +// and asserts the resulting images are semantically identical. +func TestCompareWithMkfs(t *testing.T) { + tarData := buildComparisonTar(t) + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + + fsckImageBytes(t, "go", goImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, goImg, mkfsImg) +} + +// TestCompareWithMkfsSymlinkDir builds a tar containing a directory that is +// also a symlink target, to exercise the Lstat path. +func TestCompareWithMkfsSymlinkDir(t *testing.T) { + ts := fixedBuildTime + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "real/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "real/a", Size: 1, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("a")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "link", Linkname: "real", Mode: 0o777, Uid: 0, Gid: 0, ModTime: ts}) + }) + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + fsckImageBytes(t, "go", goImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, goImg, mkfsImg) +} + +// TestCompareWithMkfsHardLinksOutOfOrder verifies that go-erofs produces a +// valid image for out-of-order hard links and runs fsck on it. 
mkfs.erofs 1.9 +// does not support hard links whose target appears later in the tar stream +// (it errors with ENOENT), so we only compare against our own image with fsck. +func TestCompareWithMkfsHardLinksOutOfOrder(t *testing.T) { + ts := fixedBuildTime + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "a/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + // Link before target — mkfs.erofs cannot handle this. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "a/link", Linkname: "a/target", Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "a/target", Size: 5, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("hello")) + }) + goImg := buildGoImage(t, tarData) + fsckImageBytes(t, "go", goImg) + + // Verify content via the go-erofs reader. + imgFS, err := erofs.Open(bytes.NewReader(goImg)) + if err != nil { + t.Fatalf("Open: %v", err) + } + got, err := fs.ReadFile(imgFS, "a/target") + if err != nil { + t.Fatalf("ReadFile a/target: %v", err) + } + if string(got) != "hello" { + t.Errorf("a/target: got %q want hello", got) + } + got2, err := fs.ReadFile(imgFS, "a/link") + if err != nil { + t.Fatalf("ReadFile a/link: %v", err) + } + if string(got2) != "hello" { + t.Errorf("a/link: got %q want hello", got2) + } +} + +// TestCompareWithMkfsWhiteouts builds a tar with OCI whiteout entries. +// mkfs.erofs --aufs converts them to char devices too, so the outputs +// should match. 
+func TestCompareWithMkfsWhiteouts(t *testing.T) { + ts := fixedBuildTime + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh.removed.so", Size: 0, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh..wh..opq", Size: 0, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/present.so", Size: 4, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("lib!")) + }) + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + fsckImageBytes(t, "go", goImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, goImg, mkfsImg) +} + +// TestCompareWithMkfsUbuntuLike runs the full Ubuntu-shaped workload through +// both converters and diffs the results. 
+func TestCompareWithMkfsUbuntuLike(t *testing.T) { + ts := fixedBuildTime + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + dirs := []string{"bin/", "sbin/", "lib/", "lib/x86_64-linux-gnu/", + "etc/", "etc/apt/", "usr/", "usr/bin/", "usr/lib/", "var/", "var/log/"} + for _, d := range dirs { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: d, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + } + for _, name := range []string{"bin/sh", "bin/ls", "sbin/init"} { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: name, Size: 4, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("fake")) + } + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/x86_64-linux-gnu/libc.so.6", Size: 8, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("libcdata")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "lib/libc.so.6", Linkname: "lib/x86_64-linux-gnu/libc.so.6", Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "lib64", Linkname: "lib/x86_64-linux-gnu", Mode: 0o777, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "dev/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeChar, Name: "dev/null", Mode: 0o666, Uid: 0, Gid: 0, Devmajor: 1, Devminor: 3, ModTime: ts}) + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeReg, Name: "usr/bin/ping", Size: 4, Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts, + PAXRecords: map[string]string{"SCHILY.xattr.security.capability": "\x01\x00\x00\x02\x00 \x00\x00"}, + }) + tw.Write([]byte("ping")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "var/log/syslog", Size: 0, Mode: 0o640, Uid: 0, Gid: 4, ModTime: ts}) + }) + + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + fsckImageBytes(t, "go", goImg) + fsckImageBytes(t, "mkfs", mkfsImg) + 
compareImages(t, goImg, mkfsImg) +} + +// TestCompareFSWalk is the definitive filesystem equality test. +// +// It builds the same comprehensive tar with both tar.Convert and mkfs.erofs, +// then walks both resulting images as fs.FS from root to leaves and asserts +// exact equality at every node. This goes beyond the targeted metadata checks +// above by also verifying: +// - total entry count is identical +// - directory child order is identical (EROFS sorts lexicographically) +// - every file's full byte content matches +// - the root directory (".") itself matches +// - every xattr present on either side is present on the other +// - nlink is exactly equal (not just ">= 2") +// - rdev is exactly equal for device nodes +// - the complete unix mode word (type + special + perm) matches +func TestCompareFSWalk(t *testing.T) { + tarData := buildComparisonTar(t) + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + + // fsck both images first. + fsckImageBytes(t, "go", goImg) + fsckImageBytes(t, "mkfs", mkfsImg) + + goFS, err := erofs.Open(bytes.NewReader(goImg)) + if err != nil { + t.Fatalf("open go image: %v", err) + } + mkFS, err := erofs.Open(bytes.NewReader(mkfsImg)) + if err != nil { + t.Fatalf("open mkfs image: %v", err) + } + + goEntries := collectImage(t, goFS, "go") + mkEntries := collectImage(t, mkFS, "mkfs") + + // The sorted entry slices must have the same length. + if len(goEntries) != len(mkEntries) { + t.Errorf("entry count mismatch: go=%d mkfs=%d", len(goEntries), len(mkEntries)) + // Still print which paths differ. 
+ goSet := make(map[string]bool, len(goEntries)) + for _, e := range goEntries { + goSet[e.path] = true + } + mkSet := make(map[string]bool, len(mkEntries)) + for _, e := range mkEntries { + mkSet[e.path] = true + } + for _, e := range goEntries { + if !mkSet[e.path] { + t.Errorf(" go-only path: %q", e.path) + } + } + for _, e := range mkEntries { + if !goSet[e.path] { + t.Errorf(" mkfs-only path: %q", e.path) + } + } + } + + // Walk in parallel sorted order and compare entry by entry. + i, j := 0, 0 + for i < len(goEntries) && j < len(mkEntries) { + ge := goEntries[i] + me := mkEntries[j] + switch { + case ge.path == me.path: + diffEntries(t, ge.path, ge, me) + i++ + j++ + case ge.path < me.path: + t.Errorf("path %q: in go image only", ge.path) + i++ + default: + t.Errorf("path %q: in mkfs image only", me.path) + j++ + } + } + for ; i < len(goEntries); i++ { + t.Errorf("path %q: in go image only (tail)", goEntries[i].path) + } + for ; j < len(mkEntries); j++ { + t.Errorf("path %q: in mkfs image only (tail)", mkEntries[j].path) + } +} + +// TestCompareWithMkfsHardLinks builds a single-layer tar with a variety of +// hard-link configurations, converts it with both tarconv.Apply (default mode) +// and mkfs.erofs, and asserts the resulting images are identical. 
+// +// Covered cases: +// - 2-way hard link (canonical + 1 alias), nlink=2 +// - 3-way hard link (canonical + 2 aliases), nlink=3 +// - Cross-directory hard link (alias in a different dir from canonical) +// - Hard link to a file with non-root uid/gid (Chown applied to canonical +// must be reflected on all aliases) +func TestCompareWithMkfsHardLinks(t *testing.T) { + ts := fixedBuildTime + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "a/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "b/", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + + // 2-way: a/one → a/one-link + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "a/one", Size: 3, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("one")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "a/one-link", Linkname: "a/one", Uid: 0, Gid: 0, ModTime: ts}) + + // 3-way: a/three, a/three-1, a/three-2 — nlink must be exactly 3 + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "a/three", Size: 5, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("three")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "a/three-1", Linkname: "a/three", Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "a/three-2", Linkname: "a/three", Uid: 0, Gid: 0, ModTime: ts}) + + // cross-directory: canonical in a/, alias in b/ + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "a/shared", Size: 6, Mode: 0o755, Uid: 1000, Gid: 1000, ModTime: ts}) + tw.Write([]byte("shared")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "b/shared", Linkname: "a/shared", Uid: 1000, Gid: 1000, ModTime: ts}) + }) + + goImg := buildGoImage(t, tarData) + mkfsImg := buildMkfsImage(t, tarData) + fsckImageBytes(t, "go", goImg) + 
fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, goImg, mkfsImg) +} + +// TestCompareMergeHardLinksWithMkfs verifies that tarconv.Apply(WithMerge) +// produces the same result as mkfs.erofs operating on the equivalent +// pre-merged tar. +// +// Three sub-cases are tested: +// +// 1. Both canonical and alias in the same layer. +// 2. Canonical in layer 1, alias in layer 2 (cross-layer hard link). +// The pre-merged tar for mkfs includes both the canonical file and the +// hard-link entry in one stream. +// 3. Canonical in layer 1, alias in layer 2, with the canonical file +// updated (overwritten) in layer 2 — alias must reflect the update +// (nlink=2, new content). +func TestCompareMergeHardLinksWithMkfs(t *testing.T) { + ts := fixedBuildTime + + t.Run("SameLayer", func(t *testing.T) { + // Both canonical and alias land in the same layer — identical to the + // non-merge case, but exercised through WithMerge. + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 4, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("data")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "link", Linkname: "data", Uid: 0, Gid: 0, ModTime: ts}) + }) + + // merged image via WithMerge + out := &buf{} + w := erofs.Create(out, erofs.WithBuildTime(uint64(ts.Unix()), 0)) + if err := tarconv.Apply(w, bytes.NewReader(layer1), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + mergedImg := out.b + + // equivalent single-layer mkfs image + mkfsImg := buildMkfsImage(t, layer1) + + fsckImageBytes(t, "merged", mergedImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, mergedImg, mkfsImg) + }) + + t.Run("CrossLayer", func(t *testing.T) { + // Canonical in layer 1, alias in layer 2. 
+ layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 4, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("data")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "link", Linkname: "data", Uid: 0, Gid: 0, ModTime: ts}) + }) + + // merged image via two Apply(WithMerge) calls + out := &buf{} + w := erofs.Create(out, erofs.WithBuildTime(uint64(ts.Unix()), 0)) + if err := tarconv.Apply(w, bytes.NewReader(layer1), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply layer1: %v", err) + } + if err := tarconv.Apply(w, bytes.NewReader(layer2), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply layer2: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + mergedImg := out.b + + // equivalent pre-merged tar for mkfs: canonical + hard link in one stream + preMerged := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 4, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("data")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "link", Linkname: "data", Uid: 0, Gid: 0, ModTime: ts}) + }) + mkfsImg := buildMkfsImage(t, preMerged) + + fsckImageBytes(t, "merged", mergedImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, mergedImg, mkfsImg) + }) + + t.Run("CrossLayerWithUpdate", func(t *testing.T) { + // Canonical in layer 1, overwritten in layer 2, alias also in layer 2. + // The final image should have nlink=2 and the new content. 
+ layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 3, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("old")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Overwrite with new content. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 3, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("new")) + // Hard link to the new version. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "link", Linkname: "data", Uid: 0, Gid: 0, ModTime: ts}) + }) + + out := &buf{} + w := erofs.Create(out, erofs.WithBuildTime(uint64(ts.Unix()), 0)) + if err := tarconv.Apply(w, bytes.NewReader(layer1), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply layer1: %v", err) + } + if err := tarconv.Apply(w, bytes.NewReader(layer2), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply layer2: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close: %v", err) + } + mergedImg := out.b + + preMerged := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "./", Mode: 0o755, Uid: 0, Gid: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 3, Mode: 0o644, Uid: 0, Gid: 0, ModTime: ts}) + tw.Write([]byte("new")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "link", Linkname: "data", Uid: 0, Gid: 0, ModTime: ts}) + }) + mkfsImg := buildMkfsImage(t, preMerged) + + fsckImageBytes(t, "merged", mergedImg) + fsckImageBytes(t, "mkfs", mkfsImg) + compareImages(t, mergedImg, mkfsImg) + }) +} + +// TestFsckConvert validates all Convert test outputs against fsck.erofs. +// This runs fsck on every image produced in the main convert_test.go suite. 
+func TestFsckConvert(t *testing.T) { + if _, err := exec.LookPath("fsck.erofs"); err != nil { + t.Skip("fsck.erofs not in PATH") + } + ts := fixedBuildTime + cases := []struct { + name string + tar func(tw *tar.Writer) + }{ + {"BasicFiles", func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/hosts", Size: 9, Mode: 0o644, ModTime: ts}) + tw.Write([]byte("127.0.0.1")) + }}, + {"HardLinksOutOfOrder", func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "early", Linkname: "actual", ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "actual", Size: 4, Mode: 0o644, ModTime: ts}) + tw.Write([]byte("data")) + }}, + {"DeviceNodes", func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeChar, Name: "dev/null", Mode: 0o666, Devmajor: 1, Devminor: 3, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeFifo, Name: "tmp/pipe", Mode: 0o644, ModTime: ts}) + }}, + {"SetuidSticky", func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "su", Size: 2, Mode: 0o4755, ModTime: ts}) + tw.Write([]byte("su")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "tmp/", Mode: 0o1777, ModTime: ts}) + }}, + {"Whiteouts", func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh.gone", Size: 0, ModTime: ts}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh..wh..opq", Size: 0, ModTime: ts}) + }}, + } + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + tarData := makeTar(t, tc.tar) + imgData := buildGoImage(t, tarData) + fsckImageBytes(t, tc.name, imgData) + }) + } +} + +// ---------------------------------------------------------------------------- +// Comparison benchmark: walk both images and 
verify matching stats. +// ---------------------------------------------------------------------------- + +// BenchmarkImageRoundtrip builds a medium workload, converts it, and reads +// back every entry — measuring end-to-end throughput including image reads. +func BenchmarkImageRoundtrip(b *testing.B) { + entries := mediumWorkload() + tarData := buildTarBytes(b, entries) + b.SetBytes(int64(len(tarData))) + b.ResetTimer() + for i := 0; i < b.N; i++ { + out := &buf{} + w := erofs.Create(out, erofs.WithBuildTime(uint64(fixedBuildTime.Unix()), 0)) + if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil { + b.Fatalf("Convert: %v", err) + } + if err := w.Close(); err != nil { + b.Fatalf("Close: %v", err) + } + img, err := erofs.Open(bytes.NewReader(out.b)) + if err != nil { + b.Fatalf("Open: %v", err) + } + rd, _ := img.(readDirer) + ls, _ := img.(lstater) + var walkCount int + var walkDir func(string) + walkDir = func(dir string) { + des, _ := rd.ReadDir(dir) + for _, de := range des { + var p string + if dir == "." { + p = de.Name() + } else { + p = dir + "/" + de.Name() + } + ls.Lstat(p) + walkCount++ + if de.IsDir() { + walkDir(p) + } + } + } + walkDir(".") + _ = walkCount + } +} + +// ---------------------------------------------------------------------------- +// Helpers used only in this file. +// ---------------------------------------------------------------------------- + +// mediumSyntheticTar returns tar bytes for the medium workload. +// Reused from bench_test.go workload definitions. +func mediumSyntheticTar(t testing.TB) []byte { + t.Helper() + return buildTarBytes(t, mediumWorkload()) +} + +// pathBase returns the last element of a /-separated path. +func pathBase(p string) string { + if i := strings.LastIndex(p, "/"); i >= 0 { + return p[i+1:] + } + return p +} + +// pathDir returns all but the last element of a /-separated path. +func pathDir(p string) string { + if i := strings.LastIndex(p, "/"); i >= 0 { + return p[:i] + } + return "." 
+} + +// writeTarFile writes a tar.Header plus optional data to a temporary file, +// returns the path. Caller must remove. +func writeTarToFile(t testing.TB, tarData []byte) string { + t.Helper() + f, err := os.CreateTemp("", "cmp-*.tar") + if err != nil { + t.Fatalf("create tar file: %v", err) + } + defer f.Close() + if _, err := f.Write(tarData); err != nil { + t.Fatalf("write tar file: %v", err) + } + return f.Name() +} + +// readMkfsImage runs mkfs.erofs on a tar file and returns the image bytes. +func readMkfsImageFromFile(t testing.TB, tarPath, outDir string) []byte { + t.Helper() + outPath := filepath.Join(outDir, "out.erofs") + f, err := os.Open(tarPath) + if err != nil { + t.Fatalf("open tar: %v", err) + } + defer f.Close() + args := []string{"--tar=f", "--aufs", "--quiet", "-Enoinline_data", + "-T" + fixedBuildTimeStr, "--all-time", outPath} + cmd := exec.CommandContext(context.Background(), "mkfs.erofs", args...) + cmd.Stdin = f + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("mkfs.erofs: %v\n%s", err, out) + } + data, err := os.ReadFile(outPath) + if err != nil { + t.Fatalf("read image: %v", err) + } + return data +} + +// unused – kept to avoid "imported and not used" in pathBase/pathDir. +var _ = pathBase +var _ = pathDir +var _ = writeTarToFile +var _ = readMkfsImageFromFile +var _ = mediumSyntheticTar diff --git a/tarconv/convert_test.go b/tarconv/convert_test.go new file mode 100644 index 0000000..c4301c3 --- /dev/null +++ b/tarconv/convert_test.go @@ -0,0 +1,922 @@ +package tarconv_test + +import ( + "archive/tar" + "bytes" + "errors" + "io" + "io/fs" + "os" + "os/exec" + "testing" + "time" + + erofs "github.com/erofs/go-erofs" + + "github.com/containerd/continuity/tarconv" +) + +// ---------------------------------------------------------------------------- +// Helpers +// ---------------------------------------------------------------------------- + +// buf is a simple in-memory io.WriteSeeker. 
+type buf struct { + b []byte + off int +} + +func (b *buf) Write(p []byte) (int, error) { + end := b.off + len(p) + if end > len(b.b) { + b.b = append(b.b, make([]byte, end-len(b.b))...) + } + copy(b.b[b.off:], p) + b.off = end + return len(p), nil +} + +func (b *buf) Seek(offset int64, whence int) (int64, error) { + var abs int64 + switch whence { + case io.SeekStart: + abs = offset + case io.SeekCurrent: + abs = int64(b.off) + offset + case io.SeekEnd: + abs = int64(len(b.b)) + offset + } + if abs < 0 { + return 0, errors.New("negative seek") + } + b.off = int(abs) + return abs, nil +} + +func (b *buf) ReadAt(p []byte, off int64) (int, error) { + if int(off) >= len(b.b) { + return 0, io.EOF + } + n := copy(p, b.b[off:]) + if n < len(p) { + return n, io.EOF + } + return n, nil +} + +// makeTar builds an in-memory tar stream from entries defined by f. +func makeTar(t testing.TB, f func(tw *tar.Writer)) []byte { + t.Helper() + var out bytes.Buffer + tw := tar.NewWriter(&out) + f(tw) + if err := tw.Close(); err != nil { + t.Fatalf("tar close: %v", err) + } + return out.Bytes() +} + +// buildImage applies a single tar layer using the default (convert-whiteouts) mode. +func buildImage(t *testing.T, tarData []byte) []byte { + t.Helper() + out := &buf{} + w := erofs.Create(out) + if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil { + t.Fatalf("Apply: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Writer.Close: %v", err) + } + return out.b +} + +// buildMergedImage applies layers in order using WithMerge and returns the final image. 
+func buildMergedImage(t *testing.T, layers ...[]byte) []byte { + t.Helper() + out := &buf{} + w := erofs.Create(out) + for i, layer := range layers { + if err := tarconv.Apply(w, bytes.NewReader(layer), tarconv.WithMerge()); err != nil { + t.Fatalf("Apply(WithMerge) layer %d: %v", i, err) + } + } + if err := w.Close(); err != nil { + t.Fatalf("Writer.Close: %v", err) + } + return out.b +} + +// openImage opens an EROFS image from bytes for reading. +func openImage(t *testing.T, data []byte) fs.FS { + t.Helper() + img, err := erofs.Open(bytes.NewReader(data)) + if err != nil { + t.Fatalf("erofs.Open: %v", err) + } + return img +} + +// checkFile verifies a file's content. +func checkFile(t *testing.T, fsys fs.FS, name, want string) { + t.Helper() + got, err := fs.ReadFile(fsys, name) + if err != nil { + t.Fatalf("ReadFile %s: %v", name, err) + } + if string(got) != want { + t.Errorf("%s: got %q want %q", name, got, want) + } +} + +// checkStat retrieves stat for name. +func checkStat(t *testing.T, fsys fs.FS, name string) fs.FileInfo { + t.Helper() + info, err := fs.Stat(fsys, name) + if err != nil { + t.Fatalf("Stat %s: %v", name, err) + } + return info +} + +// checkNotExist asserts the path does not exist. +func checkNotExist(t *testing.T, fsys fs.FS, name string) { + t.Helper() + _, err := fs.Stat(fsys, name) + if !errors.Is(err, fs.ErrNotExist) { + t.Errorf("%s should not exist but Stat returned: %v", name, err) + } +} + +// checkDirNames asserts a directory has exactly the given child names. 
func checkDirNames(t *testing.T, fsys fs.FS, dir string, want ...string) {
	t.Helper()
	entries, err := fs.ReadDir(fsys, dir)
	if err != nil {
		t.Fatalf("ReadDir %s: %v", dir, err)
	}
	got := make(map[string]bool, len(entries))
	for _, e := range entries {
		got[e.Name()] = true
	}
	wantSet := make(map[string]bool, len(want))
	for _, n := range want {
		wantSet[n] = true
	}
	// Missing children are reported in the order the caller listed them.
	for _, n := range want {
		if !got[n] {
			t.Errorf("%s: missing child %q", dir, n)
		}
	}
	// Unexpected children are reported by walking the ReadDir result instead
	// of ranging over a map, so failure output has a stable order between runs.
	for _, e := range entries {
		if n := e.Name(); !wantSet[n] {
			t.Errorf("%s: unexpected child %q", dir, n)
		}
	}
}

// fsckImage validates data with the external fsck.erofs tool.
//
// When fsck.erofs is not found in PATH the function returns without checking
// anything. Otherwise the image is written to a temporary file (removed when
// the function returns), fsck.erofs is run on it, and a non-zero exit fails
// the test with the tool's combined output.
func fsckImage(t *testing.T, data []byte) {
	t.Helper()
	if _, err := exec.LookPath("fsck.erofs"); err != nil {
		return
	}
	f, err := os.CreateTemp("", "erofs-*.img")
	if err != nil {
		t.Fatal(err)
	}
	defer os.Remove(f.Name())
	if _, err := f.Write(data); err != nil {
		_ = f.Close() // the write error is the one worth reporting
		t.Fatal(err)
	}
	// fsck.erofs reads the file next, so a failed Close must fail the test
	// rather than let the tool inspect a possibly incomplete image.
	if err := f.Close(); err != nil {
		t.Fatal(err)
	}
	out, err := exec.Command("fsck.erofs", f.Name()).CombinedOutput()
	if err != nil {
		t.Fatalf("fsck.erofs: %v\n%s", err, out)
	}
}

// epoch is the fixed modification time stamped on tar entries in these tests.
var epoch = time.Unix(1700000000, 0)

// ----------------------------------------------------------------------------
// Convert tests
// ----------------------------------------------------------------------------

// TestConvertBasicFiles exercises a simple tar with files and directories.
+func TestConvertBasicFiles(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/hostname", Size: 10, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("myhost\n ")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/passwd", Size: 5, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("root\n")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "etc/hostname", "myhost\n ") + checkFile(t, fsys, "etc/passwd", "root\n") + info := checkStat(t, fsys, "etc") + if !info.IsDir() { + t.Error("etc should be a directory") + } +} + +// TestConvertMetadata checks uid/gid/mtime/mode are preserved. +func TestConvertMetadata(t *testing.T) { + mt := time.Unix(1600000000, 123456789) + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeReg, + Name: "secret", + Size: 3, + Mode: 0o600, + Uid: 1000, + Gid: 2000, + ModTime: mt, + }) + tw.Write([]byte("abc")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + info := checkStat(t, fsys, "secret") + if info.Mode().Perm() != 0o600 { + t.Errorf("mode: got %o want %o", info.Mode().Perm(), 0o600) + } + st, ok := info.Sys().(*erofs.Stat) + if !ok { + t.Fatalf("Sys() is %T, want *erofs.Stat", info.Sys()) + } + if st.UID != 1000 { + t.Errorf("uid: got %d want 1000", st.UID) + } + if st.GID != 2000 { + t.Errorf("gid: got %d want 2000", st.GID) + } + if st.Mtime != uint64(mt.Unix()) { + t.Errorf("mtime: got %d want %d", st.Mtime, mt.Unix()) + } +} + +// TestConvertSymlink checks symlinks are preserved. 
+func TestConvertSymlink(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "usr/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "usr/bin/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "usr/bin/sh", Size: 4, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("#!/s")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "bin", Linkname: "usr/bin", Mode: 0o777, ModTime: epoch}) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + imgFS := openImage(t, img) + // Use Lstat to avoid following the symlink. + lstater, ok := imgFS.(interface{ Lstat(string) (fs.FileInfo, error) }) + if !ok { + t.Skip("image FS does not implement Lstat") + } + info, err := lstater.Lstat("bin") + if err != nil { + t.Fatalf("Lstat bin: %v", err) + } + if info.Mode()&fs.ModeSymlink == 0 { + t.Errorf("bin: expected symlink, got %v", info.Mode()) + } +} + +// TestConvertHardLinks exercises in-order hard links. +func TestConvertHardLinks(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "data", Size: 5, Mode: 0o644, ModTime: epoch, Uid: 100}) + tw.Write([]byte("hello")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "data-link", Linkname: "data", ModTime: epoch, Uid: 100}) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "data", "hello") + checkFile(t, fsys, "data-link", "hello") + // Verify shared inode (nlink >= 2). + info, _ := fs.Stat(fsys, "data") + st := info.Sys().(*erofs.Stat) + if st.Nlink < 2 { + t.Errorf("data: nlink = %d, want >= 2", st.Nlink) + } +} + +// TestConvertHardLinksOutOfOrder exercises hard links that appear before their +// target in the tar stream. 
+func TestConvertHardLinksOutOfOrder(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + // Hard link appears BEFORE the target. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "early-link", Linkname: "actual", ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "actual", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("data")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "actual", "data") + checkFile(t, fsys, "early-link", "data") + info, _ := fs.Stat(fsys, "actual") + st := info.Sys().(*erofs.Stat) + if st.Nlink < 2 { + t.Errorf("actual: nlink = %d, want >= 2", st.Nlink) + } +} + +// TestConvertUnresolvedHardLink verifies that a hard link whose target never +// appears returns an error. +func TestConvertUnresolvedHardLink(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "broken", Linkname: "ghost", ModTime: epoch}) + }) + out := &buf{} + w := erofs.Create(out) + err := tarconv.Apply(w, bytes.NewReader(tarData)) + if err == nil { + t.Fatal("expected error for unresolved hard link, got nil") + } +} + +// TestConvertDeviceNodes checks char and block devices. 
+func TestConvertDeviceNodes(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeChar, Name: "dev/null", + Mode: 0o666, Devmajor: 1, Devminor: 3, ModTime: epoch, + }) + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeBlock, Name: "dev/sda", + Mode: 0o660, Devmajor: 8, Devminor: 0, ModTime: epoch, + }) + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeFifo, Name: "tmp/pipe", + Mode: 0o644, ModTime: epoch, + }) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + + info := checkStat(t, fsys, "dev/null") + if info.Mode()&(fs.ModeDevice|fs.ModeCharDevice) != fs.ModeDevice|fs.ModeCharDevice { + t.Errorf("dev/null: mode %v should be char device", info.Mode()) + } + st := info.Sys().(*erofs.Stat) + // rdev encodes major/minor; just verify it's nonzero for a known device. + if st.Rdev == 0 { + t.Errorf("dev/null: rdev should be nonzero") + } + + info = checkStat(t, fsys, "dev/sda") + if info.Mode()&fs.ModeDevice == 0 || info.Mode()&fs.ModeCharDevice != 0 { + t.Errorf("dev/sda: mode %v should be block device", info.Mode()) + } + + info = checkStat(t, fsys, "tmp/pipe") + if info.Mode()&fs.ModeNamedPipe == 0 { + t.Errorf("tmp/pipe: mode %v should be named pipe", info.Mode()) + } +} + +// TestConvertXattrs checks PAX xattrs survive the round-trip. 
+func TestConvertXattrs(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + hdr := &tar.Header{ + Typeflag: tar.TypeReg, + Name: "bin/ping", + Size: 4, + Mode: 0o755, + ModTime: epoch, + PAXRecords: map[string]string{ + "SCHILY.xattr.security.capability": "AQIDBA==", + "SCHILY.xattr.user.comment": "hello", + }, + } + tw.WriteHeader(hdr) + tw.Write([]byte("ping")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + info := checkStat(t, fsys, "bin/ping") + st, ok := info.Sys().(*erofs.Stat) + if !ok { + t.Fatalf("Sys() is %T", info.Sys()) + } + if st.Xattrs["security.capability"] != "AQIDBA==" { + t.Errorf("security.capability: got %q", st.Xattrs["security.capability"]) + } + if st.Xattrs["user.comment"] != "hello" { + t.Errorf("user.comment: got %q", st.Xattrs["user.comment"]) + } +} + +// TestConvertWhiteouts checks that whiteout entries become overlayfs char +// device 0/0 entries (Convert mode). +func TestConvertWhiteouts(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + // Create the directory so the opaque xattr has somewhere to land. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: epoch}) + // Opaque whiteout on lib/. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh..wh..opq", Size: 0, ModTime: epoch}) + // Regular whiteout: removes lib/removed.so. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh.removed.so", Size: 0, ModTime: epoch}) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + + // lib/removed.so should exist as a char device 0/0. 
+ info := checkStat(t, fsys, "lib/removed.so") + if info.Mode()&(fs.ModeDevice|fs.ModeCharDevice) != fs.ModeDevice|fs.ModeCharDevice { + t.Errorf("lib/removed.so: expected char device whiteout, got mode %v", info.Mode()) + } + st := info.Sys().(*erofs.Stat) + if st.Rdev != 0 { + t.Errorf("lib/removed.so: rdev should be 0 for whiteout, got %d", st.Rdev) + } + + // lib itself should have trusted.overlay.opaque=y (from .wh..wh..opq) and + // trusted.overlay.origin="" (from the regular .wh.removed.so whiteout). + info = checkStat(t, fsys, "lib") + st = info.Sys().(*erofs.Stat) + if st.Xattrs[overlayOpaqueXattr] != "y" { + t.Errorf("lib: expected opaque xattr, got xattrs=%v", st.Xattrs) + } + if _, ok := st.Xattrs["trusted.overlay.origin"]; !ok { + t.Errorf("lib: expected trusted.overlay.origin from regular whiteout, got xattrs=%v", st.Xattrs) + } +} + +// TestConvertOpaqueBeforeDir tests that the opaque xattr is applied even when +// the .wh..wh..opq entry appears before the directory entry itself. +func TestConvertOpaqueBeforeDir(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + // opaque marker BEFORE the directory entry. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "newdir/.wh..wh..opq", Size: 0, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "newdir/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "newdir/file.txt", Size: 3, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("hi!")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + + info := checkStat(t, fsys, "newdir") + st := info.Sys().(*erofs.Stat) + if st.Xattrs[overlayOpaqueXattr] != "y" { + t.Errorf("newdir: expected opaque xattr, got xattrs=%v", st.Xattrs) + } + // Opaque directories get trusted.overlay.opaque=y only (not origin). + // trusted.overlay.origin is set on directories containing regular whiteouts. 
+ if _, ok := st.Xattrs["trusted.overlay.origin"]; ok { + t.Errorf("newdir: unexpected trusted.overlay.origin on opaque dir, xattrs=%v", st.Xattrs) + } + checkFile(t, fsys, "newdir/file.txt", "hi!") +} + +// TestConvertEmptyFile verifies empty regular files work. +func TestConvertEmptyFile(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "empty", Size: 0, Mode: 0o644, ModTime: epoch}) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "empty", "") +} + +// TestConvertLargeFile exercises a file that spans multiple EROFS blocks. +func TestConvertLargeFile(t *testing.T) { + const size = 4*4096 + 7 + data := make([]byte, size) + for i := range data { + data[i] = byte(i) + } + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "big", Size: size, Mode: 0o644, ModTime: epoch}) + tw.Write(data) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + got, err := fs.ReadFile(fsys, "big") + if err != nil { + t.Fatalf("ReadFile: %v", err) + } + if !bytes.Equal(got, data) { + t.Errorf("large file content mismatch: got %d bytes, want %d", len(got), len(data)) + } +} + +// TestConvertSetuidBit verifies that setuid/setgid/sticky bits survive. +func TestConvertSetuidBit(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "su", Size: 2, Mode: 0o4755, ModTime: epoch}) + tw.Write([]byte("su")) + }) + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + info := checkStat(t, fsys, "su") + // erofs.Stat carries raw unix mode; check setuid via Sys(). + st, ok := info.Sys().(*erofs.Stat) + if !ok { + t.Fatalf("Sys() is %T, want *erofs.Stat", info.Sys()) + } + // In Go's fs.FileMode, ModeSetuid is set when the unix setuid bit is present. + // erofs.Stat.Mode is a Go fs.FileMode. 
+ if st.Mode&fs.ModeSetuid == 0 { + t.Errorf("su: setuid bit missing, mode=%v", st.Mode) + } +} + +// ---------------------------------------------------------------------------- +// Merge tests +// ---------------------------------------------------------------------------- + +// TestMergeBasic applies two layers and checks the final state. +func TestMergeBasic(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/hosts", Size: 9, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("127.0.0.1")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/passwd", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("root")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Overwrite hosts. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/hosts", Size: 9, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("127.0.0.2")) + // Whiteout passwd. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/.wh.passwd", Size: 0, ModTime: epoch}) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "etc/hosts", "127.0.0.2") + checkNotExist(t, fsys, "etc/passwd") +} + +// TestMergeOpaqueDir checks that .wh..wh..opq removes existing children in +// Merge mode. The merged image must be a clean flattened result: no overlay +// xattrs (trusted.overlay.opaque, trusted.overlay.origin) should appear +// anywhere, and only the upper layer's children should remain. 
+func TestMergeOpaqueDir(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/libc.so", Size: 4, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("libc")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/libm.so", Size: 4, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("libm")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Opaque: clear lib's children, then add only the new lib. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh..wh..opq", Size: 0, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/libz.so", Size: 4, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("libz")) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + + // Old children must be gone. + checkNotExist(t, fsys, "lib/libc.so") + checkNotExist(t, fsys, "lib/libm.so") + + // New child must be present with correct content. + checkFile(t, fsys, "lib/libz.so", "libz") + + // lib/ must not carry any overlay xattrs — the merged image is flat. + info := checkStat(t, fsys, "lib") + st := info.Sys().(*erofs.Stat) + if v, ok := st.Xattrs[overlayOpaqueXattr]; ok { + t.Errorf("lib: Merge should not leave %q=%q in merged image", overlayOpaqueXattr, v) + } + if v, ok := st.Xattrs["trusted.overlay.origin"]; ok { + t.Errorf("lib: Merge should not leave trusted.overlay.origin=%q in merged image", v) + } +} + +// TestMergeOpaqueDeeplyNested verifies that an opaque marker on a directory +// removes all descendants at every depth, not just direct children. 
+func TestMergeOpaqueDeeplyNested(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "app/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "app/a/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "app/a/b/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/a/b/deep.txt", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("deep")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/a/mid.txt", Size: 3, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("mid")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/top.txt", Size: 3, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("top")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "app/link", Linkname: "a/mid.txt", ModTime: epoch}) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Opaque wipes every descendant of app/. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "app/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/.wh..wh..opq", Size: 0, ModTime: epoch}) + // Only newfile.txt from this layer should be present. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/newfile.txt", Size: 3, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("new")) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + + // All layer-1 descendants must be gone — including multi-level nesting. + checkNotExist(t, fsys, "app/top.txt") + checkNotExist(t, fsys, "app/link") + checkNotExist(t, fsys, "app/a") + checkNotExist(t, fsys, "app/a/mid.txt") + checkNotExist(t, fsys, "app/a/b") + checkNotExist(t, fsys, "app/a/b/deep.txt") + + // Layer-2 content must be present. + checkFile(t, fsys, "app/newfile.txt", "new") + + // No overlay xattrs on the merged directory. 
+ info := checkStat(t, fsys, "app") + st := info.Sys().(*erofs.Stat) + if v, ok := st.Xattrs[overlayOpaqueXattr]; ok { + t.Errorf("app: Merge should not leave %q=%q in merged image", overlayOpaqueXattr, v) + } +} + +// TestMergeOpaqueNoXattrs verifies that neither regular whiteouts nor opaque +// markers leave any overlay xattrs in the merged image. Merge mode produces a +// flat filesystem; xattrs are an overlay-layer concept that belongs only in +// Convert mode output. +func TestMergeOpaqueNoXattrs(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/old.conf", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("old!")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/keep.conf", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("keep")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/old.so", Size: 3, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("old")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Regular whiteout removes etc/old.conf. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/.wh.old.conf", Size: 0, ModTime: epoch}) + // Opaque wipes lib/ entirely and replaces with new.so. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "lib/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/.wh..wh..opq", Size: 0, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/new.so", Size: 3, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("new")) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + + // Structural assertions: correct merge behaviour. 
+ checkNotExist(t, fsys, "etc/old.conf") + checkFile(t, fsys, "etc/keep.conf", "keep") + checkNotExist(t, fsys, "lib/old.so") + checkFile(t, fsys, "lib/new.so", "new") + + // Walk the entire image and assert no overlay xattrs exist anywhere. + overlayXattrs := []string{overlayOpaqueXattr, "trusted.overlay.origin", "trusted.overlay.whiteout"} + err := fs.WalkDir(fsys, ".", func(p string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + fi, err := d.Info() + if err != nil { + return err + } + st, ok := fi.Sys().(*erofs.Stat) + if !ok { + return nil + } + for _, key := range overlayXattrs { + if v, found := st.Xattrs[key]; found { + t.Errorf("%s: Merge left overlay xattr %q=%q in merged image", p, key, v) + } + } + return nil + }) + if err != nil { + t.Fatalf("WalkDir: %v", err) + } +} + +// TestMergeWhiteoutMissingPath checks that whiteouts targeting non-existent +// paths are silently ignored in Merge mode. +func TestMergeWhiteoutMissingPath(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "exists", Size: 2, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("ok")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + // Whiteout for a path that was never created. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: ".wh.ghost", Size: 0, ModTime: epoch}) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + // Existing file should still be present. + checkFile(t, fsys, "exists", "ok") +} + +// TestMergeHardLinks exercises hard links across a merged image. 
+func TestMergeHardLinks(t *testing.T) { + layer1 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "bin/sh", Size: 5, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("shell")) + }) + layer2 := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "bin/bash", Linkname: "bin/sh", ModTime: epoch}) + }) + img := buildMergedImage(t, layer1, layer2) + fsckImage(t, img) + fsys := openImage(t, img) + checkFile(t, fsys, "bin/sh", "shell") + checkFile(t, fsys, "bin/bash", "shell") + info, _ := fs.Stat(fsys, "bin/sh") + st := info.Sys().(*erofs.Stat) + if st.Nlink < 2 { + t.Errorf("bin/sh: nlink = %d, want >= 2", st.Nlink) + } +} + +// TestMergeThreeLayers tests a three-layer scenario similar to real container +// images (base + deps + app). +func TestMergeThreeLayers(t *testing.T) { + // Layer 1: base OS skeleton. + base := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "bin/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "bin/sh", Size: 2, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("sh")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "etc/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "etc/os-release", Size: 6, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("alpine")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "usr/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "usr/lib/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "usr/lib/libc.so", Size: 4, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("libc")) + }) + + // Layer 2: install a package (adds files, removes some base files). 
+ deps := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "usr/bin/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "usr/bin/python3", Size: 6, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("python")) + // Remove bin/sh (replaced later by a symlink). + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "bin/.wh.sh", Size: 0, ModTime: epoch}) + }) + + // Layer 3: app layer. + app := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "app/", Mode: 0o755, ModTime: epoch}) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "app/main.py", Size: 4, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("main")) + // Re-add sh as a symlink. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "bin/sh", Linkname: "/bin/busybox", Mode: 0o777, ModTime: epoch}) + }) + + img := buildMergedImage(t, base, deps, app) + fsckImage(t, img) + fsys := openImage(t, img) + + // etc/os-release is from layer 1 and should still be present. + checkFile(t, fsys, "etc/os-release", "alpine") + checkFile(t, fsys, "usr/bin/python3", "python") + checkFile(t, fsys, "app/main.py", "main") + + // bin/sh was removed in layer 2 and replaced by a symlink in layer 3. + // Use Lstat to see the symlink itself. + lstater, ok := fsys.(interface{ Lstat(string) (fs.FileInfo, error) }) + if !ok { + t.Skip("image FS does not implement Lstat") + } + info, err := lstater.Lstat("bin/sh") + if err != nil { + t.Fatalf("Lstat bin/sh: %v", err) + } + if info.Mode()&fs.ModeSymlink == 0 { + t.Errorf("bin/sh: expected symlink, got %v", info.Mode()) + } +} + +// TestConvertNoTempFile verifies that Convert itself does not create a temp +// file for payload data. We set TMPDIR to a read-only dir and verify that +// Convert still succeeds (meaning it doesn't need TMPDIR for its own +// intermediate data). 
Note: erofs.Writer may create a spool file via +// WithTempDir; we are only verifying Convert's own behaviour, so we pass a +// writable tempDir to the writer explicitly. +func TestConvertNoTempFile(t *testing.T) { + if os.Getuid() == 0 { + t.Skip("running as root, cannot test read-only tmpdir") + } + readonly, err := os.MkdirTemp("", "ro-tmpdir-*") + if err != nil { + t.Skip("cannot create temp dir:", err) + } + defer os.RemoveAll(readonly) + if err := os.Chmod(readonly, 0o500); err != nil { + t.Skip("cannot chmod temp dir:", err) + } + // Make a separate writable temp dir for the writer spool. + writable, err := os.MkdirTemp("", "rw-tmpdir-*") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(writable) + + t.Setenv("TMPDIR", readonly) + + tarData := makeTar(t, func(tw *tar.Writer) { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "f", Size: 3, Mode: 0o644, ModTime: epoch}) + tw.Write([]byte("abc")) + }) + out := &buf{} + w := erofs.Create(out, erofs.WithTempDir(writable)) + if err := tarconv.Apply(w, bytes.NewReader(tarData)); err != nil { + t.Fatalf("Convert failed: %v", err) + } + _ = w.Close() +} + +// ---------------------------------------------------------------------------- +// Real-image-shape test: simulates Ubuntu base layer structure +// ---------------------------------------------------------------------------- + +// TestConvertUbuntuLikeLayer exercises a tar that resembles a real Ubuntu +// base layer: deep directory tree, many files, symlinks, a few device nodes. +func TestConvertUbuntuLikeLayer(t *testing.T) { + tarData := makeTar(t, func(tw *tar.Writer) { + dirs := []string{"bin/", "sbin/", "lib/", "lib/x86_64-linux-gnu/", + "etc/", "etc/apt/", "usr/", "usr/bin/", "usr/lib/", "var/", "var/log/", + "tmp/", "root/", "home/"} + for _, d := range dirs { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: d, Mode: 0o755, Uid: 0, Gid: 0, ModTime: epoch}) + } + // Typical binaries. 
+ for _, f := range []string{"bin/sh", "bin/ls", "bin/cat", "bin/echo", "sbin/init"} { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: f, Size: 8, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("fakebinx")) + } + // Typical libs — hard linked to each other (versioned .so). + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "lib/x86_64-linux-gnu/libc.so.6", Size: 8, Mode: 0o755, ModTime: epoch}) + tw.Write([]byte("libcdata")) + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeLink, Name: "lib/libc.so.6", Linkname: "lib/x86_64-linux-gnu/libc.so.6", ModTime: epoch}) + // Config files. + for _, f := range []string{"etc/hostname", "etc/hosts", "etc/resolv.conf"} { + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: f, Size: 1, Mode: 0o644, Uid: 0, Gid: 0, ModTime: epoch}) + tw.Write([]byte("\n")) + } + // Symlinks (common in Ubuntu). + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeSymlink, Name: "lib64", Linkname: "lib/x86_64-linux-gnu", Mode: 0o777, ModTime: epoch}) + // Device node. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeChar, Name: "dev/null", Mode: 0o666, Devmajor: 1, Devminor: 3, ModTime: epoch}) + // File with capability xattr (common for ping, etc.). + tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeReg, Name: "usr/bin/ping", Size: 4, Mode: 0o755, ModTime: epoch, + PAXRecords: map[string]string{"SCHILY.xattr.security.capability": "\x01\x00\x00\x02\x00 \x00\x00"}, + }) + tw.Write([]byte("ping")) + // Empty log file. + tw.WriteHeader(&tar.Header{Typeflag: tar.TypeReg, Name: "var/log/dpkg.log", Size: 0, Mode: 0o644, ModTime: epoch}) + // Sticky tmp. 
+ tw.WriteHeader(&tar.Header{Typeflag: tar.TypeDir, Name: "tmp/", Mode: 0o1777, ModTime: epoch}) + }) + + img := buildImage(t, tarData) + fsckImage(t, img) + fsys := openImage(t, img) + + checkFile(t, fsys, "bin/sh", "fakebinx") + checkFile(t, fsys, "lib/x86_64-linux-gnu/libc.so.6", "libcdata") + checkFile(t, fsys, "lib/libc.so.6", "libcdata") // hard link + + info, _ := fs.Stat(fsys, "lib/x86_64-linux-gnu/libc.so.6") + st := info.Sys().(*erofs.Stat) + if st.Nlink < 2 { + t.Errorf("libc.so.6: nlink=%d want >=2", st.Nlink) + } + + info = checkStat(t, fsys, "tmp") + // Sticky bit: check via erofs.Stat.Mode which uses the properly-decoded + // raw inode mode (fs.FileInfo.Mode() carries it unreliably due to the + // reader's inode decode path). + st2 := info.Sys().(*erofs.Stat) + if st2.Mode.Perm() != 0o777 || st2.Mode&fs.ModeSticky == 0 { + t.Errorf("tmp: erofs.Stat.Mode=%v want drwxrwxrwt", st2.Mode) + } + + info = checkStat(t, fsys, "usr/bin/ping") + st = info.Sys().(*erofs.Stat) + if st.Xattrs["security.capability"] == "" { + t.Error("ping: missing security.capability xattr") + } +} + +const overlayOpaqueXattr = "trusted.overlay.opaque" diff --git a/tarconv/helpers_test.go b/tarconv/helpers_test.go new file mode 100644 index 0000000..fcd0597 --- /dev/null +++ b/tarconv/helpers_test.go @@ -0,0 +1,263 @@ +package tarconv_test + +import ( + "archive/tar" + "context" + "fmt" + "io" + "os" + "os/exec" + "testing" + "time" +) + +// writerToTar is satisfied by any type that can emit entries to a tar.Writer. +type writerToTar interface { + writeTo(tw *tar.Writer) error +} + +// tarAll sequences multiple writerToTar entries. +type tarAll []writerToTar + +func (a tarAll) writeTo(tw *tar.Writer) error { + for _, w := range a { + if err := w.writeTo(tw); err != nil { + return err + } + } + return nil +} + +// tarFromWriterTo returns an io.ReadCloser streaming a tar built from wt. 
+func tarFromWriterTo(wt writerToTar) io.ReadCloser { + r, w := io.Pipe() + go func() { + tw := tar.NewWriter(w) + if err := wt.writeTo(tw); err != nil { + _ = w.CloseWithError(err) + return + } + _ = tw.Close() + _ = w.Close() + }() + return r +} + +// tarContext holds shared metadata for generated entries. +type tarContext struct { + uid int + gid int + modTime time.Time + xattrs map[string]string +} + +func (tc tarContext) withModTime(t time.Time) tarContext { + tc.modTime = t + return tc +} + +func (tc tarContext) withXattrs(xattrs map[string]string) tarContext { + tc.xattrs = xattrs + return tc +} + +// --- tarFile --- + +type tarFile struct { + name string + data []byte + mode int64 + uid int + gid int + modTime time.Time + xattrs map[string]string +} + +func (f *tarFile) writeTo(tw *tar.Writer) error { + hdr := &tar.Header{ + Typeflag: tar.TypeReg, Name: f.name, + Size: int64(len(f.data)), Mode: f.mode, + Uid: f.uid, Gid: f.gid, ModTime: f.modTime, + } + if len(f.xattrs) > 0 { + hdr.PAXRecords = make(map[string]string) + for k, v := range f.xattrs { + hdr.PAXRecords["SCHILY.xattr."+k] = v + } + } + if err := tw.WriteHeader(hdr); err != nil { + return err + } + if len(f.data) > 0 { + _, err := tw.Write(f.data) + return err + } + return nil +} + +func (tc tarContext) file(name string, data []byte, mode int64) writerToTar { + return &tarFile{name: name, data: data, mode: mode, uid: tc.uid, gid: tc.gid, modTime: tc.modTime, xattrs: tc.xattrs} +} + +// --- tarDir --- + +type tarDir struct { + name string + mode int64 + uid int + gid int + modTime time.Time + xattrs map[string]string +} + +func (d *tarDir) writeTo(tw *tar.Writer) error { + hdr := &tar.Header{ + Typeflag: tar.TypeDir, Name: d.name, Mode: d.mode, + Uid: d.uid, Gid: d.gid, ModTime: d.modTime, + } + if len(d.xattrs) > 0 { + hdr.PAXRecords = make(map[string]string) + for k, v := range d.xattrs { + hdr.PAXRecords["SCHILY.xattr."+k] = v + } + } + return tw.WriteHeader(hdr) +} + +func (tc tarContext) 
dir(name string, mode int64) writerToTar { + return &tarDir{name: name, mode: mode, uid: tc.uid, gid: tc.gid, modTime: tc.modTime, xattrs: tc.xattrs} +} + +// --- tarSymlink --- + +type tarSymlink struct { + name string + target string + uid int + gid int + modTime time.Time +} + +func (s *tarSymlink) writeTo(tw *tar.Writer) error { + return tw.WriteHeader(&tar.Header{ + Typeflag: tar.TypeSymlink, Name: s.name, Linkname: s.target, + Mode: 0o777, Uid: s.uid, Gid: s.gid, ModTime: s.modTime, + }) +} + +func (tc tarContext) symlink(name, target string) writerToTar { + return &tarSymlink{name: name, target: target, uid: tc.uid, gid: tc.gid, modTime: tc.modTime} +} + +// --- tarDevice --- + +type tarDevice struct { + name string + mode int64 + typeflag byte + devmajor int64 + devminor int64 + uid int + gid int + modTime time.Time +} + +func (d *tarDevice) writeTo(tw *tar.Writer) error { + return tw.WriteHeader(&tar.Header{ + Typeflag: d.typeflag, Name: d.name, Mode: d.mode, + Devmajor: d.devmajor, Devminor: d.devminor, + Uid: d.uid, Gid: d.gid, ModTime: d.modTime, + }) +} + +func (tc tarContext) charDevice(name string, mode int64, major, minor int64) writerToTar { + return &tarDevice{name: name, mode: mode, typeflag: tar.TypeChar, devmajor: major, devminor: minor, uid: tc.uid, gid: tc.gid, modTime: tc.modTime} +} + +func (tc tarContext) blockDevice(name string, mode int64, major, minor int64) writerToTar { + return &tarDevice{name: name, mode: mode, typeflag: tar.TypeBlock, devmajor: major, devminor: minor, uid: tc.uid, gid: tc.gid, modTime: tc.modTime} +} + +func (tc tarContext) fifo(name string, mode int64) writerToTar { + return &tarDevice{name: name, mode: mode, typeflag: tar.TypeFifo, uid: tc.uid, gid: tc.gid, modTime: tc.modTime} +} + +// --- tarHardLink --- + +type tarHardLink struct { + name string + target string + uid int + gid int + modTime time.Time +} + +func (h *tarHardLink) writeTo(tw *tar.Writer) error { + return tw.WriteHeader(&tar.Header{ + Typeflag: 
tar.TypeLink, Name: h.name, Linkname: h.target, + Uid: h.uid, Gid: h.gid, ModTime: h.modTime, + }) +} + +func (tc tarContext) hardLink(name, target string) writerToTar { + return &tarHardLink{name: name, target: target, uid: tc.uid, gid: tc.gid, modTime: tc.modTime} +} + +// --- mkfs.erofs helper --- + +// convertTarMkfs runs mkfs.erofs to convert a tar to an EROFS image. +// The flags --tar=f --aufs --quiet -Enoinline_data are always applied. +// Returns an error (and skips) if mkfs.erofs is not found in PATH. +func convertTarMkfs(ctx context.Context, t testing.TB, tarData []byte, outPath string, extraArgs []string) error { + t.Helper() + if _, err := exec.LookPath("mkfs.erofs"); err != nil { + t.Skip("mkfs.erofs not found in PATH") + } + f, err := os.CreateTemp("", "mkfs-bench-*.tar") + if err != nil { + return fmt.Errorf("create temp tar: %w", err) + } + defer os.Remove(f.Name()) + if _, err := f.Write(tarData); err != nil { + _ = f.Close() + return err + } + if _, err := f.Seek(0, io.SeekStart); err != nil { + _ = f.Close() + return err + } + + args := []string{"--tar=f", "--aufs", "--quiet", "-Enoinline_data"} + args = append(args, extraArgs...) + args = append(args, outPath) + cmd := exec.CommandContext(ctx, "mkfs.erofs", args...) + cmd.Stdin = f + out, err := cmd.CombinedOutput() + _ = f.Close() + if err != nil { + return fmt.Errorf("mkfs.erofs %v: %w\n%s", args, err, out) + } + return nil +} + +// fsckErofsBytes validates an EROFS image using fsck.erofs if available. 
+func fsckErofsBytes(t testing.TB, data []byte) { + t.Helper() + if _, err := exec.LookPath("fsck.erofs"); err != nil { + return + } + f, err := os.CreateTemp("", "erofs-*.img") + if err != nil { + t.Fatal(err) + } + defer os.Remove(f.Name()) + if _, err := f.Write(data); err != nil { + _ = f.Close() + t.Fatal(err) + } + _ = f.Close() + out, err := exec.Command("fsck.erofs", f.Name()).CombinedOutput() + if err != nil { + t.Fatalf("fsck.erofs: %v\n%s", err, out) + } +} diff --git a/vendor/github.com/erofs/go-erofs/.golangci.yml b/vendor/github.com/erofs/go-erofs/.golangci.yml new file mode 100644 index 0000000..d4f8fd0 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/.golangci.yml @@ -0,0 +1,25 @@ +version: "2" + +run: + timeout: 5m + +linters: + enable: + - misspell + - gocritic + - revive + - unconvert + - unparam + settings: + revive: + rules: + - name: exported + disabled: true + +formatters: + enable: + - goimports + settings: + goimports: + local-prefixes: + - github.com/erofs/go-erofs diff --git a/vendor/github.com/erofs/go-erofs/LICENSE b/vendor/github.com/erofs/go-erofs/LICENSE new file mode 100644 index 0000000..be4c94b --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright The go-erofs Authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + https://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/erofs/go-erofs/README.md b/vendor/github.com/erofs/go-erofs/README.md new file mode 100644 index 0000000..dde7676 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/README.md @@ -0,0 +1,82 @@ +# go-erofs + +A Go library for reading and creating [EROFS](https://erofs.docs.kernel.org/) filesystem images using the standard [fs.FS](https://pkg.go.dev/io/fs#FS) interface. + +## Features + +- **Read** EROFS images through Go's `fs.FS` interface +- **Create** EROFS images from directories or any `fs.FS` +- **Merge** multiple filesystem sources with overlay whiteout support +- **Metadata-only** mode for container layer indexing (chunk-based references to original data) +- Pure Go, no CGO — uses only the standard library + +### Status + +- [x] Read erofs files created with default `mkfs.erofs` options +- [x] Read chunk-based erofs files with indexes +- [x] Xattr support including long xattr prefixes +- [x] Extra devices for chunked data +- [x] Create erofs files from any `fs.FS` +- [x] Directory to erofs packing +- [x] AUFS whiteout to overlayfs conversion +- [x] Merge multiple filesystem layers with whiteout processing +- [ ] Read erofs files with compression + +## Reading an EROFS image + +```go +f, err := os.Open("image.erofs") +if err != nil { + log.Fatal(err) +} +defer f.Close() + +img, err := erofs.Open(f) +if err != nil { + log.Fatal(err) +} + +fs.WalkDir(img, ".", func(path string, d fs.DirEntry, err error) error { + fmt.Println(path) + return nil +}) +``` + +## Merging multiple layers + +Combine multiple filesystem sources into one image. 
The `Merge` option enables overlay semantics — AUFS-style whiteout files (`.wh.`) delete entries from prior layers: + +```go +outFile, _ := os.Create("merged.erofs") +w := erofs.Create(outFile) + +w.CopyFrom(baseLayer) +w.CopyFrom(overlayLayer, erofs.Merge()) +w.Close() +``` + +Merge can also be combined with `MetadataOnly` to build a merged index without copying data: + +```go +w := erofs.Create(outFile) +w.CopyFrom(layer1, erofs.MetadataOnly()) +w.CopyFrom(layer2, erofs.MetadataOnly(), erofs.Merge()) +w.Close() +``` + +## Building an image programmatically + +```go +outFile, _ := os.Create("image.erofs") +w := erofs.Create(outFile) + +f, _ := w.Create("/hello.txt") +f.Write([]byte("hello world\n")) +f.Close() + +w.Mkdir("/dir", 0o755) +w.Symlink("hello.txt", "/link") + +w.Close() +outFile.Close() +``` diff --git a/vendor/github.com/erofs/go-erofs/block.go b/vendor/github.com/erofs/go-erofs/block.go new file mode 100644 index 0000000..cea7da8 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/block.go @@ -0,0 +1,22 @@ +package erofs + +type block struct { + buf []byte + offset int32 + end int32 +} + +func (b *block) bytes() []byte { + if b.buf == nil || b.offset == -1 { + return nil + } + return b.buf[b.offset:b.end] +} + +func calculateBlocks(blockBits uint8, size int64) int { + blockNum := size >> blockBits + if size > blockNum<= 0. The bytes at [Offset, Offset+Size) in +// Device are the file's content verbatim — uncompressed, unreferenced +// by transformation. +// - A hole entry has Offset == -1. It represents Size bytes of zeros at the +// current logical position. Device is ignored for hole entries. +// +// Compressed data should not be represented as a DataRange. When a source +// FS contains compressed files, it should not provide DataRange() []DataRange +// for those files (or should return nil). 
In full-image mode CopyFrom will +// fall back to reading through Open(), which decompresses transparently, and +// write the decompressed data into the output image. In MetadataOnly mode +// there is no such fallback: files without DataRange() (or pre-built chunks) +// are stored as chunk-based inodes with no physical mappings (all holes). +type DataRange struct { + Device uint16 // device index (0 for the device assigned by CopyFrom); ignored for holes + Offset int64 // byte offset in the device, or -1 for a hole entry + Size int64 // byte length of this entry +} + +type options struct { + extraDevices []io.ReaderAt +} + +// OpenOpt is an option for configuring the EROFS reader +type OpenOpt func(*options) + +// Deprecated: Use [OpenOpt] instead, will be removed in 0.3 +type Opt = OpenOpt + +// WithExtraDevices specifies additional devices to read +// chunk data from +func WithExtraDevices(devices ...io.ReaderAt) OpenOpt { + return func(o *options) { + o.extraDevices = append(o.extraDevices, devices...) + } +} + +// Open returns a FileSystem reading from the given ReaderAt. +// The ReaderAt must be a valid EROFS block file. +// No additional memory mapping is done and must be handled by +// the caller. +func Open(r io.ReaderAt, opts ...OpenOpt) (fs.FS, error) { + o := options{} + for _, opt := range opts { + opt(&o) + } + var superBlock [disk.SizeSuperBlock]byte + n, err := r.ReadAt(superBlock[:], disk.SuperBlockOffset) + if err != nil { + return nil, err + } + + if n != disk.SizeSuperBlock { + return nil, fmt.Errorf("invalid super block: read %d bytes", n) + } + + i := image{ + meta: r, + } + if err = decodeSuperBlock(superBlock, &i.sb); err != nil { + return nil, err + } + // The maximum reasonable filesystem block size is 64k, which is + // the largest supported page size of aarch64 platforms. 
+ if i.sb.BlkSizeBits < 9 || i.sb.BlkSizeBits > 16 { + return nil, fmt.Errorf("unsupported block size bits %d: %w", i.sb.BlkSizeBits, ErrInvalidSuperblock) + } + unknownFeat := i.sb.FeatureIncompat &^ disk.FeatureIncompatAll + if unknownFeat != 0 { + return nil, fmt.Errorf("unsupported incompatible feature 0x%x: %w", unknownFeat, ErrNotImplemented) + } + ondiskExtraDevices := uint32(0) + if i.sb.FeatureIncompat&disk.FeatureIncompatDeviceTable != 0 { + ondiskExtraDevices = uint32(i.sb.ExtraDevices) + // Calculate device_id_mask + // sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + i.deviceIDMask = uint16(roundupPowerOfTwo(uint32(i.sb.ExtraDevices)+1) - 1) + } + + if int(ondiskExtraDevices) != len(o.extraDevices) { + // TODO: Provide options for skipping extra devices and error out later? + return nil, fmt.Errorf("invalid super block: extra devices count %d does not match provided %d", ondiskExtraDevices, len(o.extraDevices)) + } + + // Parse the device table if extra devices exist + if ondiskExtraDevices > 0 { + devTableOffset := int64(i.sb.DevtSlotOff) * disk.SizeDeviceSlot + i.devices = make([]deviceInfo, int(ondiskExtraDevices)) + for idx := range i.devices { + var slotBuf [disk.SizeDeviceSlot]byte + offset := devTableOffset + int64(idx)*disk.SizeDeviceSlot + if _, err := r.ReadAt(slotBuf[:], offset); err != nil { + return nil, fmt.Errorf("failed to read device slot %d at offset %d: %w", idx, offset, err) + } + var slot disk.DeviceSlot + if _, err := binary.Decode(slotBuf[:], binary.LittleEndian, &slot); err != nil { + return nil, fmt.Errorf("failed to decode device slot %d: %w", idx, err) + } + i.devices[idx] = deviceInfo{ + device: o.extraDevices[idx], + mappedBlkAddr: slot.MappedBlkAddr, + blocks: slot.Blocks, + } + } + } + + // Error out filesystems with unsupported compressed inodes + if i.sb.FeatureIncompat&disk.FeatureIncompatLZ4_0Padding != 0 || + i.sb.ComprAlgs != 0 { + return nil, fmt.Errorf("unsupported compressed filesystem 
(FeatureIncompat=0x%x, ComprAlgs=0x%x): %w", + i.sb.FeatureIncompat, i.sb.ComprAlgs, ErrNotImplemented) + } + + i.blkPool.New = func() any { + return &block{ + buf: make([]byte, 1<> 1 + v |= v >> 2 + v |= v >> 4 + v |= v >> 8 + v |= v >> 16 + v++ + return v +} + +// deviceInfo holds the parsed mapped address range for a device table entry. +type deviceInfo struct { + device io.ReaderAt + mappedBlkAddr uint32 // starting mapped block address + blocks uint32 // total block count for this device +} + +type image struct { + sb disk.SuperBlock + + meta io.ReaderAt + devices []deviceInfo // parsed device table entries + deviceIDMask uint16 + blkPool sync.Pool + longPrefixes []string // cached long xattr prefixes + prefixesOnce sync.Once + prefixesErr error +} + +// start physical offset of the separate metadata zone +func (img *image) metaStartPos() int64 { + return int64(img.sb.MetaBlkAddr) << int64(img.sb.BlkSizeBits) +} + +// maxReadFileSize is the maximum file size that ReadFile will allocate. +// ReadFile is intended for small files; for larger files, callers should +// use Open and io.Copy. 128 MiB is generous for typical use (configs, +// manifests, symlink targets, etc.) while guarding against +// unexpectedly large files. +const maxReadFileSize = 128 << 20 // 128 MiB + +// mapDev resolves map->m_bdev and map->m_pa mapping for go-erofs. +// It works similarly to erofs_map_dev in the linux kernel. 
+func (img *image) mapDev(deviceID uint16, pa int64) (io.ReaderAt, int64, error) { + if deviceID > 0 { + if int(deviceID) > len(img.devices) { + return nil, 0, fmt.Errorf("invalid device id %d", deviceID) + } + return img.devices[deviceID-1].device, pa, nil + } + + if len(img.devices) > 0 { + for _, dev := range img.devices { + if dev.mappedBlkAddr == 0 { + continue + } + + startOff := int64(dev.mappedBlkAddr) << img.sb.BlkSizeBits + length := int64(dev.blocks) << img.sb.BlkSizeBits + + if pa >= startOff && pa < startOff+length { + return dev.device, pa - startOff, nil + } + } + } + + return img.meta, pa, nil +} + +// blockSize returns the filesystem block size. +func (img *image) blockSize() uint32 { return 1 << img.sb.BlkSizeBits } + +// buildTime returns the build timestamp from the superblock. +func (img *image) buildTime() uint64 { return img.sb.BuildTime } + +// deviceBlocks returns the total block count across all extra devices. +// Each device's block count is reported at the device's native block size +// (matching the superblock block size). +func (img *image) deviceBlocks() []uint64 { + if len(img.devices) == 0 { + return nil + } + blocks := make([]uint64, len(img.devices)) + for i, d := range img.devices { + blocks[i] = uint64(d.blocks) + } + return blocks +} + +// openDirect returns an io.Reader for a file's data that reads directly +// from the underlying metadata reader, bypassing the block-at-a-time +// Read path. Returns nil if direct reading is not possible (e.g. +// chunk-based or compressed files). +func (img *image) openDirect(ino *inode) io.Reader { + if ino.size <= 0 { + return nil + } + blockSize := int64(1 << img.sb.BlkSizeBits) + switch ino.inodeLayout { + case disk.LayoutFlatPlain: + // Data is contiguous starting at dataBlkAddr. 
+ dataOffset := int64(ino.inodeData) << img.sb.BlkSizeBits + return io.NewSectionReader(img.meta, dataOffset, ino.size) + case disk.LayoutFlatInline: + // Last block is inline after the inode; earlier blocks at dataBlkAddr. + // Only use direct read for single-block files (all data inline). + if ino.size > blockSize { + return nil + } + inodeAddr := img.metaStartPos() + int64(ino.nid)*disk.SizeInodeCompact + trailingAddr := inodeAddr + ino.flatDataOffset() + return io.NewSectionReader(img.meta, trailingAddr, ino.size) + case disk.LayoutChunkBased: + // Chunk-based files store data at the physical block addresses + // listed in the chunk index. For contiguous single-device files, + // the data is laid out consecutively and can be read directly. + chunkFmt := uint16(ino.inodeData) + if chunkFmt&disk.LayoutChunkFormatIndexes == 0 { + return nil + } + chunkBits := img.sb.BlkSizeBits + uint8(chunkFmt&disk.LayoutChunkFormatBits) + nchunks := int((ino.size-1)>>chunkBits) + 1 + + // Read chunk index entries to check contiguity. + inodeStart := img.metaStartPos() + int64(ino.nid)*disk.SizeInodeCompact + baseOffset := inodeStart + ino.flatDataOffset() + if baseOffset%8 != 0 { + baseOffset = (baseOffset + 7) & ^int64(7) + } + needed := int64(nchunks) * int64(disk.SizeChunkIndex) + if needed > maxChunkIndexBytes { + return nil + } + idxBuf := make([]byte, needed) + if _, err := img.meta.ReadAt(idxBuf, baseOffset); err != nil { + return nil + } + + // Check that all chunks are contiguous on the same device. 
+ var startBlock uint64 + var deviceID uint16 + for i := range nchunks { + off := i * disk.SizeChunkIndex + blkLo := binary.LittleEndian.Uint32(idxBuf[off+4 : off+8]) + if ^blkLo == 0 { + return nil // hole + } + blkHi := binary.LittleEndian.Uint16(idxBuf[off : off+2]) + did := binary.LittleEndian.Uint16(idxBuf[off+2:off+4]) & img.deviceIDMask + phys := (uint64(blkHi) << 32) | uint64(blkLo) + + blocksPerChunk := uint64(1 << (chunkBits - img.sb.BlkSizeBits)) + if i == 0 { + startBlock = phys + deviceID = did + } else { + expected := startBlock + uint64(i)*blocksPerChunk + if phys != expected || did != deviceID { + return nil // not contiguous or different device + } + } + } + + // All chunks contiguous — resolve through the device. + dataOffset := int64(startBlock) << img.sb.BlkSizeBits + if deviceID > 0 && int(deviceID) <= len(img.devices) { + return io.NewSectionReader(img.devices[deviceID-1].device, dataOffset, ino.size) + } + return io.NewSectionReader(img.meta, dataOffset, ino.size) + default: + return nil + } +} + +func (img *image) readMetadata(r io.Reader) ([]byte, error) { + // - A 2-byte little-endian length field, which is aligned to a 4-byte boundary + // - The length bytes of payload data + var lenBuf [2]byte + if _, err := io.ReadFull(r, lenBuf[:]); err != nil { + return nil, fmt.Errorf("failed to read metadata length %v: %w", lenBuf, err) + } + + dataLen := int(binary.LittleEndian.Uint16(lenBuf[:])) + if dataLen < 1 { + dataLen = 65536 + } + + data := make([]byte, dataLen) + if _, err := io.ReadFull(r, data); err != nil { + return nil, fmt.Errorf("failed to read metadata payload: %w", err) + } + + // Align to 4-byte boundary except for hitting EOF + totalLen := 2 + dataLen + if rem := totalLen % 4; rem != 0 { + padding := int64(4 - rem) + if _, err := io.CopyN(io.Discard, r, padding); err != nil && + !errors.Is(err, io.EOF) && + !errors.Is(err, io.ErrUnexpectedEOF) { + return nil, fmt.Errorf("failed to discard padding of %d bytes: %w", padding, err) + 
} + } + return data, nil +} + +// loadLongPrefixes loads and caches the long xattr prefixes from the packed inode +// using the regular inode read logic to handle compressed/non-inline data. +// +// Long xattr name prefixes are used to optimize storage of xattrs with common +// prefixes. They are stored sequentially in a special "packed inode" or +// "meta inode". +// See: https://docs.kernel.org/filesystems/erofs.html#extended-attributes +func (img *image) loadLongPrefixes() error { + img.prefixesOnce.Do(func() { + if img.sb.XattrPrefixCount == 0 { + return + } + + var r io.Reader + + // Calculate the starting offset. XattrPrefixStart is defined in the + // superblock as being in units of 4 bytes from the start of the corresponding inode + startOffset := int64(img.sb.XattrPrefixStart) * 4 + + if (img.sb.FeatureIncompat&disk.FeatureIncompatFragments != 0) && img.sb.PackedNid > 0 { + // The packed inode (identified by PackedNid in the superblock) is a special + // inode used for shared data and metadata. + // We use ".packed" as a descriptive name for this internal inode. 
+ f := &file{ + img: img, + name: ".packed", + nid: img.sb.PackedNid, + ftype: 0, // regular file + } + + // Read inode info to determine size and layout + fi, err := f.readInfo() + if err != nil { + img.prefixesErr = fmt.Errorf("failed to read packed inode: %w", err) + return + } + + if startOffset > fi.size { + img.prefixesErr = fmt.Errorf("xattr prefix start offset %d exceeds packed inode size %d", startOffset, fi.size) + return + } + + // Set the read offset + f.offset = startOffset + r = bufio.NewReader(f) + } else { + // FIXME(hsiangkao): should avoid hacky 1<<32 here since we don't care about the end + r = io.NewSectionReader(img.meta, startOffset, 1<<32) + } + + img.longPrefixes = make([]string, img.sb.XattrPrefixCount) + for i := 0; i < int(img.sb.XattrPrefixCount); i++ { + data, err := img.readMetadata(r) + if err != nil { + img.prefixesErr = + fmt.Errorf("failed to read long xattr prefix %d: %w", i, err) + return + } + + // First byte is the base_index referencing a standard xattr prefix + baseIndex := xattrIndex(data[0]) + + // Remaining bytes are the infix to be appended to the base prefix + infix := string(data[1:]) + + // Construct full prefix: base prefix + infix + img.longPrefixes[i] = baseIndex.String() + infix + } + }) + return img.prefixesErr +} + +// getLongPrefix returns the long xattr prefix at the given index +func (img *image) getLongPrefix(index uint8) (string, error) { + if err := img.loadLongPrefixes(); err != nil { + return "", err + } + + if int(index) >= len(img.longPrefixes) { + return "", fmt.Errorf("long xattr prefix index %d out of range (max %d)", index, len(img.longPrefixes)-1) + } + + return img.longPrefixes[index], nil +} + +func (img *image) loadAt(addr, size int64) (*block, error) { + blkSize := int64(1 << img.sb.BlkSizeBits) + if size > blkSize { + size = blkSize + } + + b := img.getBlock() + if n, err := img.meta.ReadAt(b.buf[:size], addr); err != nil { + img.putBlock(b) + return nil, fmt.Errorf("failed to read %d bytes at 
%d: %w", size, addr, err) + } else { + b.offset = 0 + b.end = int32(n) + } + + return b, nil +} + +// loadBlock loads the block with the given data +func (img *image) loadBlock(fi *inode, pos int64) (*block, error) { + nblocks := calculateBlocks(img.sb.BlkSizeBits, fi.size) + bn := int(pos >> int(img.sb.BlkSizeBits)) + if bn >= nblocks { + return nil, fmt.Errorf("block position larger than number of blocks for inode: %w", io.EOF) + } + var addr int64 + blockSize := int(1 << img.sb.BlkSizeBits) + blockOffset := 0 + blockEnd := blockSize + switch fi.inodeLayout { + case disk.LayoutFlatPlain: + // flat plain has no holes + addr = int64(int(fi.inodeData)+bn) << img.sb.BlkSizeBits + blockOffset = int(pos % int64(blockSize)) + if bn == nblocks-1 { + blockEnd = int(fi.size - int64(bn)*int64(1< blockSize { + return nil, fmt.Errorf("inline data cross block boundary for nid %d: %w", fi.nid, ErrInvalid) + } + } else { + addr = int64(int(fi.inodeData)+bn) << img.sb.BlkSizeBits + blockOffset = int(pos % int64(blockSize)) + } + case disk.LayoutChunkBased: + // first 2 le bytes for format, second 2 bytes are reserved + format := uint16(fi.inodeData) + if format&disk.LayoutChunkFormat48Bit != 0 { + return nil, fmt.Errorf("48-bit chunk format for nid %d: %w", fi.nid, ErrNotImplemented) + } + if format&^(disk.LayoutChunkFormatBits|disk.LayoutChunkFormatIndexes) != 0 { + return nil, fmt.Errorf("unsupported chunk format %x for nid %d: %w", format, fi.nid, ErrInvalid) + } + + chunkbits := img.sb.BlkSizeBits + uint8(format&disk.LayoutChunkFormatBits) + chunkn := int((fi.size-1)>>chunkbits) + 1 + cn := int(pos >> chunkbits) + + if cn >= chunkn { + return nil, fmt.Errorf("chunk format does not fit into allocated bytes for nid %d: %w", fi.nid, ErrInvalid) + } + + inodeStart := img.metaStartPos() + int64(fi.nid*disk.SizeInodeCompact) + baseOffset := inodeStart + fi.flatDataOffset() + + unit := 4 + if format&disk.LayoutChunkFormatIndexes == disk.LayoutChunkFormatIndexes { + unit = 8 + // 
Align to 8 bytes + if baseOffset%8 != 0 { + baseOffset = (baseOffset + 7) & ^int64(7) + } + } + + entryPos := baseOffset + int64(cn*unit) + var entryBuf [8]byte + if n, err := img.meta.ReadAt(entryBuf[:unit], entryPos); err != nil { + return nil, fmt.Errorf("failed to read chunk entry at %d: %w", entryPos, err) + } else if n != unit { + return nil, fmt.Errorf("short read of chunk entry at %d: read %d bytes, expected %d", entryPos, n, unit) + } + + var addr int64 + var deviceID uint16 + + if unit == 8 { + startBlkLo := binary.LittleEndian.Uint32(entryBuf[4:8]) + if ^startBlkLo == 0 { + addr = -1 + } else { + addr = int64(startBlkLo) << img.sb.BlkSizeBits + deviceID = binary.LittleEndian.Uint16(entryBuf[2:4]) & img.deviceIDMask + } + } else { + rawAddr := binary.LittleEndian.Uint32(entryBuf[:4]) + if ^rawAddr == 0 { + addr = -1 + } else { + addr = int64(rawAddr) << img.sb.BlkSizeBits + } + } + + if bn == nblocks-1 { + blockEnd = int(fi.size - int64(bn)*int64(1< 0 { + addr += (blockPos - int64(cn< blockSize || blockOffset >= blockEnd { + return nil, fmt.Errorf("invalid chunk block bounds [%d:%d] for nid %d: %w", blockOffset, blockEnd, fi.nid, ErrInvalid) + } + b := img.getBlock() + if n, err := reader.ReadAt(b.buf[blockOffset:blockEnd], addr+int64(blockOffset)); err != nil { + img.putBlock(b) + return nil, fmt.Errorf("failed to read block for nid %d: %w", fi.nid, err) + } else if n != (blockEnd - blockOffset) { + img.putBlock(b) + return nil, fmt.Errorf("failed to read full block for nid %d: %w", fi.nid, ErrInvalid) + } + b.offset = int32(blockOffset) + b.end = int32(blockEnd) + return b, nil + case disk.LayoutCompressedFull, disk.LayoutCompressedCompact: + return nil, fmt.Errorf("inode layout (%d) for %d: %w", fi.inodeLayout, fi.nid, ErrNotImplemented) + default: + return nil, fmt.Errorf("inode layout (%d) for %d: %w", fi.inodeLayout, fi.nid, ErrInvalid) + } + if blockOffset < 0 || blockEnd > blockSize || blockOffset >= blockEnd { + return nil, fmt.Errorf("invalid 
block bounds [%d:%d] for nid %d: %w", blockOffset, blockEnd, fi.nid, ErrInvalid) + } + + b := img.getBlock() + b.offset = int32(blockOffset) + b.end = int32(blockEnd) + if n, err := img.meta.ReadAt(b.bytes(), addr+int64(blockOffset)); err != nil { + img.putBlock(b) + return nil, fmt.Errorf("failed to read block for nid %d: %w", fi.nid, err) + } else if n != blockEnd-blockOffset { + img.putBlock(b) + return nil, fmt.Errorf("failed to read full block for nid %d: %w, expected %d, actual %d", fi.nid, ErrInvalid, blockEnd-blockOffset, n) + } + return b, nil +} + +func (img *image) getBlock() *block { + return img.blkPool.Get().(*block) +} + +// putBlock returns a block after complete so its +// buffer can be put back into the buffer pool +func (img *image) putBlock(b *block) { + img.blkPool.Put(b) +} + +const maxSymlinks = 255 + +// maxSymlinkSize is the maximum size of a symlink target. +// Linux PATH_MAX is 4096; we use the same limit. +const maxSymlinkSize = 4096 + +// readLink reads the symlink target for the given nid. +func (i *image) readLink(nid uint64, name string) (string, error) { + f := &file{img: i, name: name, nid: nid, ftype: fs.ModeSymlink} + fi, err := f.readInfo() + if err != nil { + return "", err + } + if fi.size < 0 || fi.size > maxSymlinkSize { + return "", fmt.Errorf("symlink target size %d out of range: %w", fi.size, ErrInvalid) + } + buf := make([]byte, fi.size) + if fi.size > 0 { + if _, err = f.Read(buf); err != nil && err != io.EOF { + return "", err + } + } + return string(buf), nil +} + +// resolve cleans the path and walks directory entries to find the target inode. +// When follow is true, symlinks are followed (including the final component). +// When follow is false, the final component is not followed (for Lstat/ReadLink). +// Intermediate symlinks are always followed. 
+func (i *image) resolve(op, name string, follow bool) (nid uint64, ftype fs.FileMode, basename string, err error) { + original := name + if path.IsAbs(name) { + name = name[1:] + } + name = path.Clean(name) + if name == "." { + name = "" + } + + nid = uint64(i.sb.RootNid) + ftype = fs.ModeDir + + // curPath tracks the full resolved path of the current directory + // so that relative symlink targets can be resolved correctly. + linksFollowed := 0 + curPath := "" + basename = name + for name != "" { + var sep int + for sep < len(name) && name[sep] != '/' { + sep++ + } + var rest string + if sep < len(name) { + basename = name[:sep] + rest = name[sep+1:] + } else { + basename = name + rest = "" + } + + if ftype != fs.ModeDir { + return 0, 0, "", &fs.PathError{Op: op, Path: original, Err: ErrNotDirectory} + } + d := &dir{ + file: file{ + img: i, + name: basename, + nid: nid, + ftype: ftype, + }, + } + entNid, entFtype, err := d.lookup(basename) + if err != nil { + return 0, 0, "", &fs.PathError{Op: op, Path: original, Err: err} + } + nid = entNid + ftype = entFtype & fs.ModeType + + // Follow symlinks for intermediate components always, + // and for the final component only when follow is true. + isFinal := rest == "" + if ftype&fs.ModeSymlink != 0 && (follow || !isFinal) { + linksFollowed++ + if linksFollowed > maxSymlinks { + return 0, 0, "", &fs.PathError{Op: op, Path: original, Err: ErrLoop} + } + target, err := i.readLink(nid, basename) + if err != nil { + return 0, 0, "", err + } + // Prepend the symlink target to the remaining path + if rest != "" { + target = target + "/" + rest + } + // Resolve relative to the parent directory's full path + if !path.IsAbs(target) { + target = curPath + "/" + target + } + // Clean and re-resolve from root + target = path.Clean(target) + if len(target) > 0 && target[0] == '/' { + target = target[1:] + } + nid = uint64(i.sb.RootNid) + ftype = fs.ModeDir + curPath = "" + name = target + if name == "." 
{ + name = "" + } + basename = name + continue + } + + if curPath == "" { + curPath = basename + } else { + curPath = curPath + "/" + basename + } + name = rest + } + + if basename == "" { + basename = original + } + return nid, ftype, basename, nil +} + +func (i *image) Open(name string) (fs.File, error) { + nid, ftype, basename, err := i.resolve("open", name, true) + if err != nil { + return nil, err + } + b := file{img: i, name: basename, nid: nid, ftype: ftype} + if ftype.IsDir() { + return &dir{file: b}, nil + } + return &b, nil +} + +func (i *image) Stat(name string) (fs.FileInfo, error) { + nid, ftype, basename, err := i.resolve("stat", name, true) + if err != nil { + return nil, err + } + f := &file{img: i, name: basename, nid: nid, ftype: ftype} + return f.statInfo() +} + +// ReadFile reads the named file and returns its contents. +// Files larger than maxReadFileSize (128 MiB) are rejected; +// use Open and io.Copy for larger files. +func (i *image) ReadFile(name string) ([]byte, error) { + nid, ftype, basename, err := i.resolve("readfile", name, true) + if err != nil { + return nil, err + } + if ftype.IsDir() { + return nil, &fs.PathError{Op: "read", Path: name, Err: ErrIsDirectory} + } + f := &file{img: i, name: basename, nid: nid, ftype: ftype} + fi, err := f.readInfo() + if err != nil { + return nil, err + } + if fi.size < 0 || fi.size > maxReadFileSize { + return nil, fmt.Errorf("file size %d exceeds ReadFile limit %d; use Open and io.Copy for large files: %w", fi.size, int64(maxReadFileSize), ErrInvalid) + } + buf := make([]byte, fi.size) + if fi.size > 0 { + if _, err = f.Read(buf); err != nil && err != io.EOF { + return nil, err + } + } + return buf, nil +} + +func (i *image) ReadDir(name string) ([]fs.DirEntry, error) { + nid, ftype, basename, err := i.resolve("readdir", name, true) + if err != nil { + return nil, err + } + if !ftype.IsDir() { + return nil, &fs.PathError{Op: "readdir", Path: name, Err: ErrNotDirectory} + } + d := &dir{file: 
file{img: i, name: basename, nid: nid, ftype: ftype}} + entries, err := d.ReadDir(-1) + if err != nil { + return nil, err + } + slices.SortFunc(entries, func(a, b fs.DirEntry) int { + return cmp.Compare(a.Name(), b.Name()) + }) + return entries, nil +} + +func (i *image) ReadLink(name string) (string, error) { + nid, ftype, basename, err := i.resolve("readlink", name, false) + if err != nil { + return "", err + } + if ftype&fs.ModeSymlink == 0 { + return "", &fs.PathError{Op: "readlink", Path: name, Err: fs.ErrInvalid} + } + return i.readLink(nid, basename) +} + +func (i *image) Lstat(name string) (fs.FileInfo, error) { + nid, ftype, basename, err := i.resolve("lstat", name, false) + if err != nil { + return nil, err + } + f := &file{img: i, name: basename, nid: nid, ftype: ftype} + return f.statInfo() +} + +type file struct { + img *image + name string + nid uint64 + ftype fs.FileMode + + // Mutable fields, open file should not be accessed concurrently + offset int64 // current offset for read operations + info *inode // cached inode +} + +func (b *file) readInfo() (ino *inode, err error) { + if b.info != nil { + return b.info, nil + } + + addr := b.img.metaStartPos() + int64(b.nid*disk.SizeInodeCompact) + blkSize := int32(1 << b.img.sb.BlkSizeBits) + blk := b.img.getBlock() + blk.offset = int32(addr & int64(blkSize-1)) + blk.end = blkSize + if blk.end-blk.offset < disk.SizeInodeExtended { + // Use buffer starting from beginning of inode, do not use the position + // in the block since an extended inode may span multiple blocks + blk.offset = 0 + blk.end = disk.SizeInodeExtended + } + + defer func() { + v := recover() + if v != nil { + err = fmt.Errorf("file format error: %v", v) + } + if err != nil { + b.img.putBlock(blk) + } + + }() + + buf := blk.bytes() + _, err = b.img.meta.ReadAt(buf, addr) + if err != nil { + return nil, err + } + + var format, xcnt uint16 + if _, err = binary.Decode(buf[:2], binary.LittleEndian, &format); err != nil { + return nil, err + } 
+ + layout := uint8((format & 0x0E) >> 1) + if format&0x01 == 0 { + var di disk.InodeCompact + if _, err := binary.Decode(buf[:disk.SizeInodeCompact], binary.LittleEndian, &di); err != nil { + return nil, err + } + b.info = &inode{ + name: b.name, + nid: b.nid, + icsize: disk.SizeInodeCompact, + inodeLayout: layout, + inodeData: di.InodeData, + size: int64(di.Size), + mode: (fs.FileMode(di.Mode) & ^fs.ModeType) | b.ftype, + rawMode: di.Mode, + uid: uint32(di.UID), + gid: uint32(di.GID), + nlink: int(di.Nlink), + mtime: b.img.sb.BuildTime, + mtimeNs: b.img.sb.BuildTimeNs, + } + xcnt = di.XattrCount + } else { + var di disk.InodeExtended + if _, err = binary.Decode(buf[:disk.SizeInodeExtended], binary.LittleEndian, &di); err != nil { + return nil, err + } + b.info = &inode{ + name: b.name, + nid: b.nid, + icsize: disk.SizeInodeExtended, + inodeLayout: layout, + inodeData: di.InodeData, + size: int64(di.Size), + mode: (fs.FileMode(di.Mode) & ^fs.ModeType) | b.ftype, + rawMode: di.Mode, + uid: di.UID, + gid: di.GID, + nlink: int(di.Nlink), + mtime: di.Mtime, + mtimeNs: di.MtimeNs, + } + xcnt = di.XattrCount + } + + if xcnt > 0 { + b.info.xsize = int(xcnt-1)*disk.SizeXattrEntry + disk.SizeXattrBodyHeader + } + + switch { + case b.info.inodeLayout == disk.LayoutFlatPlain || b.info.size == 0 || blk.end != blkSize: + b.img.putBlock(blk) + default: + // If the inode has trailing data used later, cache it + b.info.cached = blk + } + return b.info, nil +} + +// statInfo reads the inode and builds a fileInfo with full stat data +// including extended attributes. The cached block is released since +// stat callers do not need inline data. 
+func (b *file) statInfo() (*fileInfo, error) { + ino, err := b.readInfo() + if err != nil { + return nil, err + } + fi := &fileInfo{ + name: ino.name, + size: ino.size, + mode: ino.mode, + mtime: ino.mtime, + mtimeNs: ino.mtimeNs, + stat: &Stat{ + Mode: disk.EroFSModeToGoFileMode(ino.rawMode), + Size: ino.size, + InodeLayout: ino.inodeLayout, + Ino: int64(ino.nid), + Rdev: disk.RdevFromMode(ino.rawMode, ino.inodeData), + UID: ino.uid, + GID: ino.gid, + Nlink: ino.nlink, + Mtime: ino.mtime, + MtimeNs: ino.mtimeNs, + }, + } + if ino.xsize > 0 { + if err := loadXattrs(b, fi.stat); err != nil { + return nil, err + } + } + // Build data ranges for regular files. + // Flat layouts are cheap (no I/O) — compute eagerly. + // Chunk-based layout requires a ReadAt on the image; defer until needed. + if ino.mode.IsRegular() && ino.size > 0 { + if ino.inodeLayout == disk.LayoutChunkBased { + // Capture a snapshot of the fields buildChunkDataRanges needs. + // We must not capture ino by pointer: the caller may reuse it, + // and cached block is released below. + inoCopy := *ino + inoCopy.cached = nil + img := b.img + fi.rangesLoader = func() []DataRange { + f := &file{img: img} + return f.buildChunkDataRanges(&inoCopy) + } + } else { + fi.dataRanges = b.buildDataRanges(ino) + } + } + // Release cached block - stat callers don't need inline data + if ino.cached != nil { + b.img.putBlock(ino.cached) + ino.cached = nil + } + return fi, nil +} + +// buildDataRanges computes the physical data ranges for a regular file. 
+func (b *file) buildDataRanges(ino *inode) []DataRange { + blockSize := int64(1 << b.img.sb.BlkSizeBits) + switch ino.inodeLayout { + case disk.LayoutFlatPlain: + dataOffset := int64(ino.inodeData) << b.img.sb.BlkSizeBits + return []DataRange{{Device: 0, Offset: dataOffset, Size: ino.size}} + case disk.LayoutFlatInline: + inodeAddr := b.img.metaStartPos() + int64(ino.nid)*disk.SizeInodeCompact + trailingAddr := inodeAddr + ino.flatDataOffset() + if ino.size <= blockSize { + return []DataRange{{Device: 0, Offset: trailingAddr, Size: ino.size}} + } + // Multi-block inline: earlier full blocks at dataBlkAddr, last block inline. + // headSize is the number of complete blocks before the inline tail, in bytes. + // ino.inodeData is the starting block address, not a block count. + headSize := ((ino.size - 1) / blockSize) * blockSize + tailSize := ino.size - headSize + var ranges []DataRange + if headSize > 0 { + dataOffset := int64(ino.inodeData) << b.img.sb.BlkSizeBits + ranges = append(ranges, DataRange{Device: 0, Offset: dataOffset, Size: headSize}) + } + ranges = append(ranges, DataRange{Device: 0, Offset: trailingAddr, Size: tailSize}) + return ranges + case disk.LayoutChunkBased: + return b.buildChunkDataRanges(ino) + } + return nil +} + +// maxChunkIndexBytes is an upper bound on the chunk-index table we will +// allocate for a single file. 64 MiB covers ~8 M chunks; no real EROFS image +// should approach this, and it prevents allocation bombs from corrupt images. +const maxChunkIndexBytes = 64 << 20 // 64 MiB + +// buildChunkDataRanges parses chunk indexes into DataRange entries covering +// the complete logical layout of the file. The returned slice satisfies the +// DataRange contract: entries are in logical-file order and their sizes sum +// to ino.size exactly. +// +// Null/hole chunks are emitted as DataRange{Offset: -1, Size: ...} entries. +// Consecutive null chunks coalesce into a single hole entry. 
+// Adjacent data chunks that are physically contiguous on the same device +// merge into one entry. Data chunks never merge across a hole boundary. +// +// The final entry (data or hole) has its Size trimmed to the file-tail length +// so the invariant sum(Size) == ino.size holds precisely. +func (b *file) buildChunkDataRanges(ino *inode) []DataRange { + chunkFmt := uint16(ino.inodeData) + if chunkFmt&disk.LayoutChunkFormatIndexes == 0 { + return nil + } + // 48-bit chunk addressing is not yet implemented; the null-chunk sentinel + // (blkLo == 0xFFFFFFFF) is only unambiguous in 32-bit address mode. + if chunkFmt&disk.LayoutChunkFormat48Bit != 0 { + return nil + } + chunkBits := b.img.sb.BlkSizeBits + uint8(chunkFmt&disk.LayoutChunkFormatBits) + nchunks := int((ino.size-1)>>chunkBits) + 1 + chunkSize := int64(1) << chunkBits + + inodeStart := b.img.metaStartPos() + int64(ino.nid)*disk.SizeInodeCompact + baseOffset := inodeStart + ino.flatDataOffset() + if baseOffset%8 != 0 { + baseOffset = (baseOffset + 7) & ^int64(7) + } + needed := int64(nchunks) * int64(disk.SizeChunkIndex) + if needed > maxChunkIndexBytes { + return nil + } + idxBuf := make([]byte, needed) + if _, err := b.img.meta.ReadAt(idxBuf, baseOffset); err != nil { + return nil + } + + var ranges []DataRange + for i := range nchunks { + // Size of this logical chunk: full chunkSize for all but the last. + size := chunkSize + if i == nchunks-1 { + size = ino.size - int64(i)*chunkSize + } + + off := i * disk.SizeChunkIndex + blkLo := binary.LittleEndian.Uint32(idxBuf[off+4 : off+8]) + if ^blkLo == 0 { + // Null/hole chunk: coalesce with a preceding hole if possible. 
+ if len(ranges) > 0 && ranges[len(ranges)-1].Offset == holeOffset { + ranges[len(ranges)-1].Size += size + } else { + ranges = append(ranges, DataRange{Offset: holeOffset, Size: size}) + } + continue + } + + blkHi := binary.LittleEndian.Uint16(idxBuf[off : off+2]) + deviceID := binary.LittleEndian.Uint16(idxBuf[off+2:off+4]) & b.img.deviceIDMask + phys := (uint64(blkHi) << 32) | uint64(blkLo) + byteOffset := int64(phys) << b.img.sb.BlkSizeBits + + // Merge with the previous entry if it is a data range that is + // physically contiguous on the same device. + if len(ranges) > 0 { + prev := &ranges[len(ranges)-1] + if prev.Offset != holeOffset && prev.Device == deviceID && prev.Offset+prev.Size == byteOffset { + prev.Size += size + continue + } + } + ranges = append(ranges, DataRange{Device: deviceID, Offset: byteOffset, Size: size}) + } + return ranges +} + +func (b *file) Stat() (fs.FileInfo, error) { + return b.statInfo() +} + +func (b *file) Read(p []byte) (int, error) { + fi, err := b.readInfo() + if err != nil { + return 0, err + } + + var n int + for len(p) > 0 { + if b.offset >= fi.size { + return n, io.EOF + } + blk, err := b.img.loadBlock(fi, b.offset) + if err != nil { + if errors.Is(err, io.EOF) { + err = io.EOF + b.offset += int64(n) + } + return n, err + } + buf := blk.bytes() + copied := copy(p, buf) + n += copied + p = p[copied:] + b.offset += int64(copied) + + b.img.putBlock(blk) + } + return n, nil +} + +func (b *file) Close() error { + if b.info != nil && b.info.cached != nil { + b.img.putBlock(b.info.cached) + b.info.cached = nil + } + return nil +} + +type direntry struct { + file +} + +func (d *direntry) Name() string { + return d.name +} + +func (d *direntry) IsDir() bool { + return d.ftype.IsDir() +} + +func (d *direntry) Type() fs.FileMode { + return d.ftype +} + +func (d *direntry) Info() (fs.FileInfo, error) { + return d.statInfo() +} + +type dir struct { + file + + // bn is the current block to read from (relative to file start) + bn int + 
+ // consumed is how many have been returned in the current block + consumed uint16 +} + +func (d *dir) ReadDir(n int) ([]fs.DirEntry, error) { + fi, err := d.readInfo() + if err != nil { + return nil, fmt.Errorf("readInfo failed: %w", err) + } + + var ents []fs.DirEntry + pos := int64(d.bn << d.img.sb.BlkSizeBits) + for pos < fi.size { + b, err := d.img.loadBlock(fi, pos) + if err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + buf := b.bytes() + if len(buf) < 12 { + d.img.putBlock(b) + break + } + + var dirents [2]disk.Dirent + + readN, err := binary.Decode(buf[:12], binary.LittleEndian, &dirents[0]) + if err != nil { + d.img.putBlock(b) + return nil, fmt.Errorf("decode failed: %w", err) + } + if readN != 12 { + d.img.putBlock(b) + return nil, errors.New("invalid dirent: not fully decoded") + } + + entryN := dirents[0].NameOff / disk.SizeDirent + bufLen := len(buf) + + // Validate that NameOff is within bounds and dirent entries fit. + if int(dirents[0].NameOff) > bufLen || entryN == 0 { + d.img.putBlock(b) + return ents, fmt.Errorf("invalid dirent name offset %d (buf size %d): %w", dirents[0].NameOff, bufLen, ErrInvalid) + } + + for i := uint16(0); i < entryN; i++ { + var name string + if i < entryN-1 { + start := int(disk.SizeDirent) * (int(i) + 1) + if start+int(disk.SizeDirent) > bufLen { + d.img.putBlock(b) + return ents, fmt.Errorf("dirent entry %d exceeds block: %w", i+1, ErrInvalid) + } + readN, err := binary.Decode(buf[start:start+int(disk.SizeDirent)], binary.LittleEndian, &dirents[1]) + if err != nil { + d.img.putBlock(b) + return nil, fmt.Errorf("decode failed: %w", err) + } + if readN != 12 { + d.img.putBlock(b) + return nil, errors.New("invalid dirent: not fully decoded") + } + if int(dirents[0].NameOff) > bufLen || int(dirents[1].NameOff) > bufLen || dirents[1].NameOff < dirents[0].NameOff { + d.img.putBlock(b) + return ents, fmt.Errorf("invalid dirent name offset range [%d:%d] (buf size %d): %w", + dirents[0].NameOff, 
dirents[1].NameOff, bufLen, ErrInvalid) + } + name = string(buf[dirents[0].NameOff:dirents[1].NameOff]) + } else { + if int(dirents[0].NameOff) > bufLen { + d.img.putBlock(b) + return ents, fmt.Errorf("invalid dirent name offset %d (buf size %d): %w", dirents[0].NameOff, bufLen, ErrInvalid) + } + // The last entry name extends to end of block; + // trim any NUL padding. + raw := buf[dirents[0].NameOff:] + if j := bytes.IndexByte(raw, 0); j >= 0 { + raw = raw[:j] + } + name = string(raw) + } + + if i >= d.consumed && name != "." && name != ".." { + f := file{ + img: d.img, + name: name, + nid: dirents[0].Nid, + ftype: disk.EroFSFtypeToFileMode(dirents[0].FileType), + } + ents = append(ents, &direntry{f}) + d.consumed = i + 1 + + if n > 0 && len(ents) == n { + if i == entryN-1 { + d.consumed = 0 + d.bn++ + } + d.img.putBlock(b) + return ents, nil + } + } + + // Rotate next to current + dirents[0] = dirents[1] + } + + d.img.putBlock(b) + d.consumed = 0 + d.bn++ + pos = int64(d.bn << d.img.sb.BlkSizeBits) + } + + // Per fs.ReadDirFile contract: when n > 0 and we've reached the end + // of the directory, return io.EOF. When n <= 0, return all entries + // without io.EOF. + if n > 0 { + return ents, io.EOF + } + return ents, nil +} + +// lookup searches for a directory entry by name using binary search. +// EROFS directories are sorted by name both within and across blocks. +// A cross-block binary search locates the correct block, then an +// intra-block binary search finds the entry. +// Returns the nid and file type if found, or fs.ErrNotExist if not. 
+func (d *dir) lookup(target string) (uint64, fs.FileMode, error) { + fi, err := d.readInfo() + if err != nil { + return 0, 0, fmt.Errorf("readInfo failed: %w", err) + } + + targetBytes := []byte(target) + blkSize := int64(1 << d.img.sb.BlkSizeBits) + nblocks := int((fi.size + blkSize - 1) / blkSize) + + // Binary search across blocks: compare target against the first + // entry of each block to find which block may contain the target. + // The last loaded block is retained to avoid reloading it for the + // intra-block search. + var lastBlk *block + lastIdx := -1 + lo, hi := 0, nblocks + for lo < hi { + mid := lo + (hi-lo)/2 + pos := int64(mid) * blkSize + b, err := d.img.loadBlock(fi, pos) + if err != nil { + if errors.Is(err, io.EOF) { + hi = mid + continue + } + if lastBlk != nil { + d.img.putBlock(lastBlk) + } + return 0, 0, err + } + buf := b.bytes() + firstName, err := blockFirstName(buf) + if err != nil { + d.img.putBlock(b) + if lastBlk != nil { + d.img.putBlock(lastBlk) + } + return 0, 0, err + } + + if bytes.Compare(firstName, targetBytes) <= 0 { + // This block's first entry <= target; keep it as candidate. + if lastBlk != nil { + d.img.putBlock(lastBlk) + } + lastBlk = b + lastIdx = mid + lo = mid + 1 + } else { + d.img.putBlock(b) + hi = mid + } + } + + // lastIdx is the last block whose first entry <= target. + // The target must be in that block if it exists. + if lastIdx < 0 { + return 0, 0, fs.ErrNotExist + } + + buf := lastBlk.bytes() + nid, ftype, err := lookupBlock(buf, targetBytes) + d.img.putBlock(lastBlk) + return nid, ftype, err +} + +// blockFirstName returns the name of the first entry in a directory block. 
+func blockFirstName(buf []byte) ([]byte, error) { + if len(buf) < disk.SizeDirent { + return nil, fmt.Errorf("directory block too small: %w", ErrInvalid) + } + var first disk.Dirent + if _, err := binary.Decode(buf[:disk.SizeDirent], binary.LittleEndian, &first); err != nil { + return nil, fmt.Errorf("decode failed: %w", err) + } + entryN := first.NameOff / disk.SizeDirent + if entryN == 0 || int(first.NameOff) > len(buf) { + return nil, fmt.Errorf("invalid name offset %d: %w", first.NameOff, ErrInvalid) + } + var nameEnd uint16 + if entryN > 1 { + nextOff := int(disk.SizeDirent) + 8 + if nextOff+2 > len(buf) { + return nil, fmt.Errorf("next dirent name offset out of range: %w", ErrInvalid) + } + nameEnd = binary.LittleEndian.Uint16(buf[nextOff:]) + } else { + nameEnd = uint16(len(buf)) + } + if first.NameOff > nameEnd || int(nameEnd) > len(buf) { + return nil, fmt.Errorf("name range [%d:%d] out of bounds: %w", first.NameOff, nameEnd, ErrInvalid) + } + name := buf[first.NameOff:nameEnd] + // Trim NUL terminator if present + if i := bytes.IndexByte(name, 0); i >= 0 { + name = name[:i] + } + return name, nil +} + +// blockDirent decodes the dirent at index i from buf and returns the +// name bytes for that entry. entryN is the total number of entries. 
+func blockDirent(buf []byte, i, entryN uint16) (disk.Dirent, []byte, error) { + var de disk.Dirent + off := int(disk.SizeDirent * i) + if off+disk.SizeDirent > len(buf) { + return de, nil, fmt.Errorf("dirent %d offset %d out of range: %w", i, off, ErrInvalid) + } + if _, err := binary.Decode(buf[off:off+disk.SizeDirent], binary.LittleEndian, &de); err != nil { + return de, nil, fmt.Errorf("decode dirent %d failed: %w", i, err) + } + var nameEnd uint16 + if i < entryN-1 { + nextOff := int(disk.SizeDirent*(i+1)) + 8 + if nextOff+2 > len(buf) { + return de, nil, fmt.Errorf("dirent %d next name offset out of range: %w", i, ErrInvalid) + } + nameEnd = binary.LittleEndian.Uint16(buf[nextOff:]) + } else { + nameEnd = uint16(len(buf)) + } + if de.NameOff > nameEnd || int(nameEnd) > len(buf) { + return de, nil, fmt.Errorf("dirent %d name range [%d:%d] out of bounds: %w", i, de.NameOff, nameEnd, ErrInvalid) + } + name := buf[de.NameOff:nameEnd] + // The last entry name may be NUL-terminated before the end of the block. + if i == entryN-1 { + if j := bytes.IndexByte(name, 0); j >= 0 { + name = name[:j] + } + } + return de, name, nil +} + +// lookupBlock searches a single directory block for the target name +// using binary search. 
+func lookupBlock(buf, target []byte) (uint64, fs.FileMode, error) { + if len(buf) < disk.SizeDirent { + return 0, 0, fmt.Errorf("directory block too small: %w", ErrInvalid) + } + var first disk.Dirent + if _, err := binary.Decode(buf[:disk.SizeDirent], binary.LittleEndian, &first); err != nil { + return 0, 0, fmt.Errorf("decode failed: %w", err) + } + if first.NameOff%disk.SizeDirent != 0 { + return 0, 0, fmt.Errorf("invalid name offset %d not aligned to dirent size: %w", first.NameOff, ErrInvalid) + } + entryN := first.NameOff / disk.SizeDirent + if int(first.NameOff) > len(buf) { + return 0, 0, fmt.Errorf("name offset %d exceeds block size %d: %w", first.NameOff, len(buf), ErrInvalid) + } + + lo, hi := uint16(0), entryN + for lo < hi { + mid := lo + (hi-lo)/2 + de, name, err := blockDirent(buf, mid, entryN) + if err != nil { + return 0, 0, err + } + switch bytes.Compare(name, target) { + case 0: + return de.Nid, disk.EroFSFtypeToFileMode(de.FileType), nil + case -1: + lo = mid + 1 + default: + hi = mid + } + } + return 0, 0, fs.ErrNotExist +} + +// inode holds the parsed on-disk inode data needed for I/O operations. +// It is an internal type and is not returned to callers directly. +type inode struct { + name string + nid uint64 + icsize int8 + xsize int + inodeLayout uint8 + inodeData uint32 + size int64 + mode fs.FileMode + rawMode uint16 + uid uint32 + gid uint32 + nlink int + mtime uint64 + mtimeNs uint32 + cached *block +} + +func (ino *inode) flatDataOffset() int64 { + // inode core size + xattr size + return int64(ino.icsize) + int64(ino.xsize) +} + +// fileInfo implements [fs.FileInfo] and provides extended metadata +// via type-assertable accessor methods. 
Callers can extract +// Unix-style metadata without importing this package: +// +// if u, ok := fi.(interface{ UID() uint32 }); ok { uid = u.UID() } +type fileInfo struct { + name string + size int64 + mode fs.FileMode + mtime uint64 + mtimeNs uint32 + stat *Stat + dataRanges []DataRange + + // rangesOnce and rangesLoader support lazy computation of data ranges + // for chunk-based files (LayoutChunkBased). The loader performs a ReadAt + // to parse the chunk index, so it is deferred until the caller actually + // calls DataRange(). For flat layouts (FlatPlain, FlatInline), ranges + // are computed eagerly at stat time since they require no I/O. + rangesOnce sync.Once + rangesLoader func() []DataRange +} + +func (fi *fileInfo) Name() string { return fi.name } +func (fi *fileInfo) Size() int64 { return fi.size } +func (fi *fileInfo) Mode() fs.FileMode { return fi.mode } +func (fi *fileInfo) IsDir() bool { return fi.mode.IsDir() } +func (fi *fileInfo) Sys() any { return fi.stat } +func (fi *fileInfo) ModTime() time.Time { return time.Unix(int64(fi.mtime), int64(fi.mtimeNs)) } +func (fi *fileInfo) UID() uint32 { return fi.stat.UID } +func (fi *fileInfo) GID() uint32 { return fi.stat.GID } +func (fi *fileInfo) Ino() uint64 { return uint64(fi.stat.Ino) } +func (fi *fileInfo) Nlink() uint64 { return uint64(fi.stat.Nlink) } +func (fi *fileInfo) Rdev() uint64 { return uint64(fi.stat.Rdev) } + +// DataRange returns the physical data ranges for this file's uncompressed +// content. Returns nil for compressed files, directories, symlinks, and +// other non-regular entries. +func (fi *fileInfo) DataRange() []DataRange { + if fi.rangesLoader != nil { + fi.rangesOnce.Do(func() { + fi.dataRanges = fi.rangesLoader() + }) + } + return fi.dataRanges +} + +// GetAllXattr returns all extended attributes. +func (fi *fileInfo) GetAllXattr() map[string]string { return fi.stat.Xattrs } + +// GetXattr returns the value of a single extended attribute. 
+func (fi *fileInfo) GetXattr(name string) (string, bool) { + v, ok := fi.stat.Xattrs[name] + return v, ok +} +func decodeSuperBlock(b [disk.SizeSuperBlock]byte, sb *disk.SuperBlock) error { + n, err := binary.Decode(b[:], binary.LittleEndian, sb) + if err != nil { + return err + } + if n != disk.SizeSuperBlock { + return fmt.Errorf("invalid super block: decoded %d bytes", n) + } + if sb.MagicNumber != disk.MagicNumber { + return fmt.Errorf("invalid super block: invalid magic number %x", sb.MagicNumber) + } + return nil +} diff --git a/vendor/github.com/erofs/go-erofs/format.go b/vendor/github.com/erofs/go-erofs/format.go new file mode 100644 index 0000000..cb1d9d9 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/format.go @@ -0,0 +1,137 @@ +package erofs + +import ( + "io/fs" + "sort" + "strings" + + "github.com/erofs/go-erofs/internal/disk" +) + +// Standard xattr name prefix table (index → on-disk NameIndex). +var xattrPrefixes = [...]struct { + index uint8 + prefix string +}{ + {1, "user."}, + {2, "system.posix_acl_access."}, + {3, "system.posix_acl_default."}, + {4, "trusted."}, + {5, "lustre."}, + {6, "security."}, +} + +// xattrSplit splits a full xattr name into (NameIndex, suffix). +func xattrSplit(name string) (uint8, string) { + for _, p := range xattrPrefixes { + if strings.HasPrefix(name, p.prefix) { + return p.index, name[len(p.prefix):] + } + } + return 0, name +} + +// xattrEntrySize returns the on-disk size of a single xattr entry, padded to 4 bytes. +func xattrEntrySize(name, value string) int { + _, suffix := xattrSplit(name) + sz := disk.SizeXattrEntry + len(suffix) + len(value) + if sz%4 != 0 { + sz = (sz + 3) & ^3 + } + return sz +} + +// calcXattrSize returns the total xattr area size (header + entries), or 0. 
+func calcXattrSize(e *erofsEntry) int { + if len(e.xattrs) == 0 { + return 0 + } + entriesSize := 0 + for name, value := range e.xattrs { + entriesSize += xattrEntrySize(name, value) + } + return disk.SizeXattrBodyHeader + entriesSize +} + +// xattrCount encodes the xattr area size into the inode XattrCount field. +func xattrCount(xattrSize int) uint16 { + if xattrSize == 0 { + return 0 + } + return uint16((xattrSize-disk.SizeXattrBodyHeader)/disk.SizeXattrEntry) + 1 +} + +// sortedXattrKeys returns xattr keys in deterministic order. +func sortedXattrKeys(m map[string]string) []string { + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +// inodeFormat builds the Format field: bit 0 = extended, bits 1-3 = layout. +func inodeFormat(layout uint8, compact bool) uint16 { + f := uint16(layout) << 1 + if !compact { + f |= 1 // bit 0 = extended + } + return f +} + +// goModeToUnixMode converts Go fs.FileMode to Unix mode bits. +func goModeToUnixMode(m fs.FileMode) uint16 { + mode := uint16(m.Perm()) + + if m&fs.ModeSetuid != 0 { + mode |= disk.StatTypeIsUID + } + if m&fs.ModeSetgid != 0 { + mode |= disk.StatTypeIsGID + } + if m&fs.ModeSticky != 0 { + mode |= disk.StatTypeIsVTX + } + + switch m.Type() { + case 0: // regular file + mode |= disk.StatTypeReg + case fs.ModeDir: + mode |= disk.StatTypeDir + case fs.ModeSymlink: + mode |= disk.StatTypeSymlink + case fs.ModeDevice | fs.ModeCharDevice: + mode |= disk.StatTypeChrdev + case fs.ModeDevice: + mode |= disk.StatTypeBlkdev + case fs.ModeNamedPipe: + mode |= disk.StatTypeFifo + case fs.ModeSocket: + mode |= disk.StatTypeSock + } + + return mode +} + +// modeToFileType converts Unix mode bits to an EROFS file type. 
+func modeToFileType(mode uint16) uint8 { + switch mode & disk.StatTypeMask { + case disk.StatTypeReg: + return disk.FileTypeReg + case disk.StatTypeDir: + return disk.FileTypeDir + case disk.StatTypeChrdev: + return disk.FileTypeChrdev + case disk.StatTypeBlkdev: + return disk.FileTypeBlkdev + case disk.StatTypeFifo: + return disk.FileTypeFifo + case disk.StatTypeSock: + return disk.FileTypeSock + case disk.StatTypeSymlink: + return disk.FileTypeSymlink + default: + return 0 + } +} diff --git a/vendor/github.com/erofs/go-erofs/internal/builder/entry.go b/vendor/github.com/erofs/go-erofs/internal/builder/entry.go new file mode 100644 index 0000000..abeaf97 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/internal/builder/entry.go @@ -0,0 +1,34 @@ +// Package builder provides shared types for the mkfs sub-packages. +package builder + +import "io" + +// Entry carries extended metadata for a filesystem entry. +// Mode and Size come from fs.FileInfo; everything else lives here. +type Entry struct { + UID, GID uint32 + Mtime uint64 + MtimeNs uint32 + Nlink uint32 + Rdev uint32 + Xattrs map[string]string + LinkTarget string + Data io.Reader // file content (full-image mode) + Chunks []Chunk // physical block refs (metadata-only mode) + Contiguous bool // data blocks are contiguous; flat-plain is sufficient + MetadataOnly bool // chunk-based layout even without chunks +} + +// NullPhysicalBlock is the sentinel value for Chunk.PhysicalBlock that marks +// a hole (a sparse region of zero bytes). It corresponds to the on-disk +// EROFS null chunk encoding (StartBlkHi=0xFFFF, StartBlkLo=0xFFFFFFFF). +const NullPhysicalBlock uint64 = ^uint64(0) + +// Chunk maps a range of logical blocks to physical blocks on a device. +// If PhysicalBlock == NullPhysicalBlock the chunk is a hole: Count logical +// blocks of zeros with no physical backing. DeviceID is ignored for holes. 
+type Chunk struct { + PhysicalBlock uint64 // physical block address, or NullPhysicalBlock for a hole + Count uint16 // number of contiguous blocks + DeviceID uint16 // 0 = primary, 1+ = extra device; ignored for holes +} diff --git a/vendor/github.com/erofs/go-erofs/internal/disk/ftypes.go b/vendor/github.com/erofs/go-erofs/internal/disk/ftypes.go new file mode 100644 index 0000000..e49621f --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/internal/disk/ftypes.go @@ -0,0 +1,88 @@ +package disk + +import "io/fs" + +const ( + FileTypeReg = 1 + FileTypeDir = 2 + FileTypeChrdev = 3 + FileTypeBlkdev = 4 + FileTypeFifo = 5 + FileTypeSock = 6 + FileTypeSymlink = 7 + + StatTypeMask = 0170000 // Mask for the type bits + StatTypeReg = 0100000 // Regular file + StatTypeDir = 0040000 // Directory + StatTypeChrdev = 0020000 // Character device + StatTypeBlkdev = 0060000 // Block device + StatTypeFifo = 0010000 // FIFO + StatTypeSock = 0140000 // Socket + StatTypeSymlink = 0120000 // Symlink + StatTypeIsUID = 0004000 // Setuid on execution + StatTypeIsGID = 0002000 // Setgid on execution + StatTypeIsVTX = 0001000 // Sticky bit +) + +// Converts EroFS filetypes to Go FileMode +func EroFSFtypeToFileMode(ftype uint8) fs.FileMode { + switch ftype { + case FileTypeDir: + return fs.ModeDir + case FileTypeChrdev: + return fs.ModeDevice | fs.ModeCharDevice + case FileTypeBlkdev: + return fs.ModeDevice + case FileTypeFifo: + return fs.ModeNamedPipe + case FileTypeSock: + return fs.ModeSocket + case FileTypeSymlink: + return fs.ModeSymlink + default: + return 0 + } +} + +func EroFSModeToGoFileMode(mode uint16) fs.FileMode { + var m fs.FileMode + m |= fs.FileMode(mode & 0777) + switch mode & StatTypeMask { + case StatTypeReg: + case StatTypeDir: + m |= fs.ModeDir + case StatTypeChrdev: + m |= fs.ModeDevice | fs.ModeCharDevice + case StatTypeBlkdev: + m |= fs.ModeDevice + case StatTypeFifo: + m |= fs.ModeNamedPipe + case StatTypeSock: + m |= fs.ModeSocket + case StatTypeSymlink: + m |= 
fs.ModeSymlink + default: + m |= fs.ModeIrregular // Unknown type, treat as irregular file + } + if mode&StatTypeIsUID != 0 { + m |= fs.ModeSetuid + } + if mode&StatTypeIsGID != 0 { + m |= fs.ModeSetgid + } + if mode&StatTypeIsVTX != 0 { + m |= fs.ModeSticky + } + + return m +} + +func RdevFromMode(mode uint16, inodeData uint32) uint32 { + switch mode & StatTypeMask { + case StatTypeChrdev, StatTypeBlkdev, StatTypeFifo, StatTypeSock: + // inodeData field is device number for some file types + return inodeData + default: + return 0 // Not a device type + } +} diff --git a/vendor/github.com/erofs/go-erofs/internal/disk/types.go b/vendor/github.com/erofs/go-erofs/internal/disk/types.go new file mode 100644 index 0000000..5f654c6 --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/internal/disk/types.go @@ -0,0 +1,155 @@ +package disk + +const ( + MagicNumber = 0xe0f5e1e2 + SuperBlockOffset = 1024 + + FeatureIncompatLZ4_0Padding = 0x1 + FeatureIncompatChunkedFile = 0x4 + FeatureIncompatDeviceTable = 0x8 + FeatureIncompatFragments = 0x20 + FeatureIncompatXattrPrefixes = 0x40 + FeatureIncompatAll uint32 = FeatureIncompatLZ4_0Padding | + FeatureIncompatChunkedFile | FeatureIncompatDeviceTable | + FeatureIncompatFragments | FeatureIncompatXattrPrefixes + + SizeSuperBlock = 128 + SizeInodeCompact = 32 + SizeInodeExtended = 64 + SizeDirent = 12 + SizeXattrBodyHeader = 12 + SizeXattrEntry = 4 + SizeDeviceSlot = 128 + SizeChunkIndex = 8 + + LayoutFlatPlain = 0 + LayoutCompressedFull = 1 + LayoutFlatInline = 2 + LayoutCompressedCompact = 3 + LayoutChunkBased = 4 + + LayoutChunkFormatBits = 0x001F + LayoutChunkFormatIndexes = 0x0020 + LayoutChunkFormat48Bit = 0x0040 +) + +// SuperBlock represents the EROFS on-disk superblock. 
+// See: https://docs.kernel.org/filesystems/erofs.html#on-disk-layout +type SuperBlock struct { + MagicNumber uint32 + Checksum uint32 + FeatureCompat uint32 + BlkSizeBits uint8 + ExtSlots uint8 + RootNid uint16 + Inos uint64 + BuildTime uint64 + BuildTimeNs uint32 + Blocks uint32 + MetaBlkAddr uint32 + XattrBlkAddr uint32 + UUID [16]uint8 + VolumeName [16]uint8 + FeatureIncompat uint32 + ComprAlgs uint16 + ExtraDevices uint16 + DevtSlotOff uint16 + DirBlkBits uint8 + XattrPrefixCount uint8 + XattrPrefixStart uint32 + PackedNid uint64 // Nid of the special "packed" inode for shared data/prefixes + XattrFilterRes uint8 + Reserved [23]uint8 +} + +// InodeCompact represents the 32-byte on-disk compact inode. +type InodeCompact struct { + Format uint16 // i_format + XattrCount uint16 // i_xattr_icount + Mode uint16 // i_mode + Nlink uint16 // i_nlink + Size uint32 // i_size + Reserved uint32 // i_reserved + InodeData uint32 // i_u (i_raw_blkaddr, i_rdev, etc.) + Inode uint32 // i_ino + UID uint16 // i_uid + GID uint16 // i_gid + Reserved2 uint32 // i_reserved2 +} + +// InodeExtended represents the 64-byte on-disk extended inode. +type InodeExtended struct { + Format uint16 // i_format + XattrCount uint16 // i_xattr_icount + Mode uint16 // i_mode + Reserved uint16 // i_reserved + Size uint64 // i_size + InodeData uint32 // i_u (i_raw_blkaddr, i_rdev, etc.) 
+ Inode uint32 // i_ino + UID uint32 // i_uid + GID uint32 // i_gid + Mtime uint64 // i_mtime + MtimeNs uint32 // i_mtime_nsec + Nlink uint32 // i_nlink + Reserved2 [16]uint8 +} + +type Dirent struct { + Nid uint64 + NameOff uint16 + FileType uint8 + Reserved uint8 +} + +// XattrHeader is the header after an inode containing xattr information +// +// Original definition: +// inline xattrs (n == i_xattr_icount): +// erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes +// +// 12 bytes / \ +// / \ +// /-----------------------\ +// | erofs_xattr_entries+ | +// +-----------------------+ +// +// inline xattrs must starts in erofs_xattr_ibody_header, +// for read-only fs, no need to introduce h_refcount +// Actual name is prefix | long prefix (prefix + infix) + name +type XattrHeader struct { + NameFilter uint32 // bit value 1 indicate not-present + SharedCount uint8 + Reserved [7]uint8 +} + +type XattrEntry struct { + NameLen uint8 // length of name + NameIndex uint8 // index of name in XattrHeader, 0x80 set indicates long prefix at index&0x7F + XattrPrefixStart + ValueLen uint16 // length of value + // Name+Value +} + +type XattrLongPrefixitem struct { + PrefixAddr uint32 // address of the long prefix + PrefixLen uint8 // length of the long prefix +} + +type XattrLongPrefix struct { + BaseIndex uint8 // short xattr name prefix index + // Infix part after short prefix +} + +type InodeChunkIndex struct { + StartBlkHi uint16 // part of 48-bit support (not yet implemented) + DeviceID uint16 + StartBlkLo uint32 +} + +// DeviceSlot represents the on-disk device table entry (erofs_deviceslot). +// See: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/erofs/erofs_fs.h +type DeviceSlot struct { + Tag [64]uint8 // digest(sha256), etc. 
+ Blocks uint32 // total fs blocks of this device + MappedBlkAddr uint32 // map starting at mapped_blkaddr + Reserved [56]uint8 +} diff --git a/vendor/github.com/erofs/go-erofs/layout.go b/vendor/github.com/erofs/go-erofs/layout.go new file mode 100644 index 0000000..677366d --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/layout.go @@ -0,0 +1,225 @@ +package erofs + +import ( + "sort" + + "github.com/erofs/go-erofs/internal/disk" +) + +// planLayout assigns NIDs and determines trailing data sizes for all entries. +func (w *erofsWriter) planLayout(root *erofsEntry) { + // Collect all entries in a deterministic order (DFS, pre-order). + // DFS keeps directory contents close to their parent inode, + // improving locality for operations like find and ls -lR. + // Hardlink alias entries (linkTo != nil) are skipped — they share the + // NID of the canonical entry and do not get their own inode slot. + w.entries = nil + var walk func(e *erofsEntry) + walk = func(e *erofsEntry) { + if e.linkTo != nil { + return // alias: no inode, NID comes from linkTo + } + w.entries = append(w.entries, e) + if e.mode&disk.StatTypeMask == disk.StatTypeDir { + sort.Slice(e.children, func(i, j int) bool { + return e.children[i].name < e.children[j].name + }) + for _, c := range e.children { + walk(c) + } + } + } + walk(root) + + w.totalInodes = uint64(len(w.entries)) + + // Block 0 holds: 1024-byte pad + 128-byte superblock + device slot(s) + padding + // MetaBlkAddr is set later by write() depending on the on-disk layout. + + // Assign NIDs sequentially. + // NID = byte offset from metaStartPos / 32. + // Each extended inode is 64 bytes = 2 NID slots. + // Trailing data follows and is padded to 32-byte boundary. + currentOff := 0 // byte offset from metaStartPos + for _, e := range w.entries { + e.nid = uint64(currentOff / 32) + e.xattrSize = calcXattrSize(e) + + // Decide compact (32B) vs extended (64B) inode. 
+ e.compact = e.uid <= 0xFFFF && e.gid <= 0xFFFF && + e.nlink <= 0xFFFF && e.size <= 0xFFFFFFFF && + e.mtime == w.buildTime && e.mtimeNs == 0 + + inodeSize := disk.SizeInodeExtended + if e.compact { + inodeSize = disk.SizeInodeCompact + } + + // The inode header region is inode core + xattr area. + // Trailing data (dirents, chunk indexes, inline data) follows. + headerSize := inodeSize + e.xattrSize + + // Determine layout + switch e.mode & disk.StatTypeMask { + case disk.StatTypeReg: + switch { + case e.size == 0 && len(e.chunks) == 0 && e.data == nil && !e.metadataOnly: + e.layout = disk.LayoutFlatPlain + case len(e.chunks) > 0 || e.metadataOnly: + e.layout = disk.LayoutChunkBased + if e.contiguous { + e.chunkBits = w.minChunkBits(e.size) + } + default: + // Full-image mode: decide inline vs plain + if int(e.size) <= w.blockSize-headerSize { + inBlockOff := (currentOff + headerSize) % w.blockSize + if inBlockOff+int(e.size) <= w.blockSize { + e.layout = disk.LayoutFlatInline + } else { + e.layout = disk.LayoutFlatPlain + } + } else { + e.layout = disk.LayoutFlatPlain + } + } + case disk.StatTypeDir: + direntDataSize := w.direntDataSize(e) + inBlockOff := (currentOff + headerSize) % w.blockSize + if direntDataSize > 0 && inBlockOff+direntDataSize <= w.blockSize { + e.layout = disk.LayoutFlatInline + } else { + e.layout = disk.LayoutFlatPlain + } + case disk.StatTypeSymlink: + inBlockOff := (currentOff + headerSize) % w.blockSize + if len(e.symTarget) > 0 && inBlockOff+len(e.symTarget) <= w.blockSize { + e.layout = disk.LayoutFlatInline + } else { + e.layout = disk.LayoutFlatPlain + } + default: + // Device files, fifos, sockets + e.layout = disk.LayoutFlatPlain + } + + // Recalculate trailing size now that layout is decided + e.trailingSize = w.calcTrailingSize(e) + + totalInodeSize := headerSize + e.trailingSize + // Pad to 32-byte boundary + if totalInodeSize%32 != 0 { + totalInodeSize = (totalInodeSize + 31) & ^31 + } + + // Check block boundary: inode core 
must not cross a block boundary + blockOff := currentOff % w.blockSize + if blockOff+inodeSize > w.blockSize { + // Align to next block + currentOff = (currentOff + w.blockSize - 1) & ^(w.blockSize - 1) + e.nid = uint64(currentOff / 32) + } + + // Also check that trailing data doesn't cross block boundary for inline layouts + if e.layout == disk.LayoutFlatInline { + blockOff = currentOff % w.blockSize + if blockOff+headerSize+e.trailingSize > w.blockSize { + // Fall back to flat-plain (data would cross block boundary) + e.layout = disk.LayoutFlatPlain + e.trailingSize = w.calcTrailingSize(e) + totalInodeSize = headerSize + e.trailingSize + if totalInodeSize%32 != 0 { + totalInodeSize = (totalInodeSize + 31) & ^31 + } + } + } + + currentOff += totalInodeSize + } + + w.rootNid = root.nid +} + +// calcTrailingSize returns the number of bytes following the 64-byte inode. +func (w *erofsWriter) calcTrailingSize(e *erofsEntry) int { + switch e.mode & disk.StatTypeMask { + case disk.StatTypeReg: + if e.layout == disk.LayoutChunkBased { + if e.size == 0 && len(e.chunks) == 0 { + return 0 + } + cs := w.entryChunkSize(e) + nchunks := (int(e.size) + cs - 1) / cs + return nchunks * disk.SizeChunkIndex + } + if e.layout == disk.LayoutFlatInline { + return int(e.size) + } + return 0 + case disk.StatTypeDir: + if e.layout == disk.LayoutFlatInline { + return w.direntDataSize(e) + } + return 0 + case disk.StatTypeSymlink: + if e.layout == disk.LayoutFlatInline { + return len(e.symTarget) + } + return 0 + default: + return 0 + } +} + +// direntNames returns the sorted list of dirent names for a directory, +// including "." and "..". EROFS requires dirents within each block to +// be sorted alphabetically. 
+func direntNames(e *erofsEntry) []string { + names := make([]string, 0, len(e.children)+2) + names = append(names, ".", "..") + for _, c := range e.children { + names = append(names, c.name) + } + sort.Strings(names) + return names +} + +// direntDataSize calculates the serialized EROFS dirent data size for a directory. +// For multi-block directories, this includes inter-block padding. +func (w *erofsWriter) direntDataSize(e *erofsEntry) int { + names := direntNames(e) + nEntries := len(names) + if len(e.children) == 0 { + // Empty dir still needs "." and ".." entries + return 2*disk.SizeDirent + 1 + 2 + } + + totalSize := 0 + i := 0 + for i < nEntries { + blockUsed := 0 + start := i + nameSize := 0 + for j := i; j < nEntries; j++ { + headerSize := (j - start + 1) * disk.SizeDirent + nameSize += len(names[j]) + needed := headerSize + nameSize + if needed > w.blockSize { + break + } + blockUsed = needed + i = j + 1 + } + if i == start { + blockUsed = disk.SizeDirent + len(names[i]) + i++ + } + // Pad non-final blocks to block boundary + if i < nEntries && blockUsed%w.blockSize != 0 { + blockUsed = (blockUsed + w.blockSize - 1) & ^(w.blockSize - 1) + } + totalSize += blockUsed + } + + return totalSize +} diff --git a/vendor/github.com/erofs/go-erofs/mkfs.go b/vendor/github.com/erofs/go-erofs/mkfs.go new file mode 100644 index 0000000..208950a --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/mkfs.go @@ -0,0 +1,1825 @@ +package erofs + +import ( + "errors" + "fmt" + "io" + "io/fs" + "math/bits" + "os" + "path" + "sort" + "strings" + "time" + + "github.com/erofs/go-erofs/internal/builder" + "github.com/erofs/go-erofs/internal/disk" +) + +// errDirNotEmpty is returned by Remove when the named path is a non-empty +// directory. Mirrors the behavior of os.Remove (which returns ENOTEMPTY). +var errDirNotEmpty = errors.New("directory not empty") + +// --- Exported types --- + +// Writer is a writable filesystem that produces an EROFS image on Close. 
// Files are added via Create, Mkdir, Symlink, and Mknod, then finalized
// by calling Close which serializes the complete EROFS image.
type Writer struct {
	out          io.WriteSeeker
	closed       bool
	blockSize    int    // 0 = unset, resolved to defaultBlockSize in Close
	buildTime    uint64 // from WithBuildTime or buildTimer
	buildTimeNs  uint32
	hasBuildTime bool
	wErr         error               // sticky error: once set, all subsequent ops return it
	root         *fsEntry            // root directory
	byPath       map[string]*fsEntry // path → entry (all types)

	// devices holds per-device block counts (one per MetadataOnly source).
	// When an external data file is configured, the constructor reserves
	// the first slot for it with a placeholder count of 0.
	devices []uint64 // per-device block counts (one per MetadataOnly source)

	// Per-CopyFrom state, reset at the start of each CopyFrom call.
	copyMetadataOnly bool   // metadata-only for current CopyFrom
	copyMerge        bool   // merge mode: apply whiteouts
	copyDeviceID     uint16 // device ID assigned to current MetadataOnly CopyFrom

	// File data goes either to the external data file or, when none is
	// set, to a temporary spool; each has its own running offset.
	dataFile *os.File // external data file (nil = spool mode)
	dataOff  int64    // current byte offset in data file
	spool    *os.File // temp spool (created lazily)
	spoolOff int64    // current byte offset in spool
	tempDir  string   // from WithTempDir
	cpBuf    []byte   // shared buffer for io.Copy into File
	padBuf   []byte   // shared zero buffer for padding (block-sized, lazy)
}

// File is a writable regular file returned by Writer.Create.
// Data is written via Write or ReadFrom, then committed with Close.
type File struct {
	fs           *Writer  // owning Writer
	entry        *fsEntry // tree entry this file's data belongs to
	dataStartOff int64    // byte offset where this file's data begins
	written      int64
	closed       bool
}

// CreateOpt configures EROFS image creation.
// Each option mutates the createOptions consumed by the Create constructor.
type CreateOpt func(*createOptions)

// CopyOpt configures a CopyFrom operation.
// Each option sets per-CopyFrom state directly on the Writer.
type CopyOpt func(*Writer)

// --- Constructor ---

// Create returns a Writer that produces an EROFS image on Close.
// Options configure block size, build time, data file, and temp directory.
+func Create(out io.WriteSeeker, opts ...CreateOpt) *Writer { + var o createOptions + for _, opt := range opts { + opt(&o) + } + + root := &fsEntry{ + path: "/", + mode: disk.StatTypeDir | 0o755, + } + fsys := &Writer{ + out: out, + buildTime: o.buildTime, + buildTimeNs: o.buildTimeNs, + hasBuildTime: o.hasBuildTime, + root: root, + byPath: map[string]*fsEntry{"/": root}, + dataFile: o.dataFile, + tempDir: o.tempDir, + } + + if o.blockSize != 0 { + if err := fsys.setBlockSize(o.blockSize); err != nil { + fsys.wErr = err + } + } + + if o.dataFile != nil { + // Reserve device slot 0 (DeviceID=1) for the data file. + // MetadataOnly CopyFrom device IDs will start at slot 1+. + // The reserved slot is filled in with the actual block count at Close. + fsys.devices = append(fsys.devices, 0) + off, err := o.dataFile.Seek(0, io.SeekEnd) + if err == nil { + fsys.dataOff = off + } + } + + return fsys +} + +// --- CopyOpt functions --- + +// MetadataOnly configures the current CopyFrom to emit only metadata. +// Regular files with pre-existing chunk mappings use chunk-based layout +// referencing an external device; file data is not copied. +func MetadataOnly() CopyOpt { + return func(w *Writer) { + w.copyMetadataOnly = true + } +} + +// Merge enables overlay merge semantics for the current CopyFrom. +// AUFS-style whiteout files (.wh.) delete the named entry from +// prior layers, and opaque markers (.wh..wh..opq) delete all children +// of their parent directory. The whiteout entries themselves are not +// added to the image. +// +// When using Merge with a source containing AUFS whiteout files, do not +// pre-convert them; the Writer processes raw whiteout entries directly. +func Merge() CopyOpt { + return func(w *Writer) { + w.copyMerge = true + } +} + +// --- CreateOpt functions --- + +// WithBlockSize sets the filesystem block size. The value must be a power +// of two between 512 and 64 KiB. When unset the default is 4096. 
+// An invalid size causes subsequent Writer operations to return an error. +// If CopyFrom is called with a source that declares a different block size, +// CopyFrom returns an error. +func WithBlockSize(n int) CreateOpt { + return func(o *createOptions) { + o.blockSize = n + } +} + +// WithBuildTime sets the filesystem build timestamp. +func WithBuildTime(sec uint64, nsec uint32) CreateOpt { + return func(o *createOptions) { + o.buildTime = sec + o.buildTimeNs = nsec + o.hasBuildTime = true + } +} + +// WithDataFile sets an external data file for metadata-only mode. +// File.Write appends to this file at block-aligned offsets; chunk +// indexes reference those blocks with DeviceID=1. +func WithDataFile(f *os.File) CreateOpt { + return func(o *createOptions) { + o.dataFile = f + } +} + +// WithTempDir overrides the temp directory for the spool file. +// Only used when no data file is provided. +func WithTempDir(dir string) CreateOpt { + return func(o *createOptions) { + o.tempDir = dir + } +} + +// --- Writer entry methods --- + +// Create creates a regular file with default mode 0644. The caller must +// Close the returned File. +func (fsys *Writer) Create(name string) (*File, error) { + if fsys.wErr != nil { + return nil, fsys.wErr + } + name = cleanPath(name) + if name == "/" { + return nil, fmt.Errorf("mkfs: cannot create file at root") + } + if err := fsys.checkPath(name); err != nil { + return nil, err + } + + fsys.ensureParent(name) + + e := &fsEntry{ + path: name, + mode: disk.StatTypeReg | 0o644, + } + fsys.addChild(e) + + f := &File{ + fs: fsys, + entry: e, + } + + if fsys.dataFile != nil { + f.dataStartOff = fsys.dataOff + e.dataStartOff = fsys.dataOff + } else { + if err := fsys.ensureSpool(); err != nil { + return nil, err + } + f.dataStartOff = fsys.spoolOff + e.spoolOff = fsys.spoolOff + e.dataStartOff = fsys.spoolOff + } + + return f, nil +} + +// Mkdir creates a directory. 
Only permission bits from perm are used; +// type bits are forced to directory. Mkdir("/", perm) sets root permissions. +func (fsys *Writer) Mkdir(name string, perm fs.FileMode) error { + if fsys.wErr != nil { + return fsys.wErr + } + name = cleanPath(name) + if name == "/" { + fsys.root.mode = disk.StatTypeDir | uint16(perm.Perm()) + return nil + } + if err := fsys.checkPath(name); err != nil { + return err + } + + fsys.ensureParent(name) + + e := &fsEntry{ + path: name, + mode: disk.StatTypeDir | uint16(perm.Perm()), + } + fsys.addChild(e) + + return nil +} + +// Symlink creates newname as a symbolic link to oldname (mode 0777). +func (fsys *Writer) Symlink(oldname, newname string) error { + if fsys.wErr != nil { + return fsys.wErr + } + newname = cleanPath(newname) + if newname == "/" { + return fmt.Errorf("mkfs: cannot create symlink at root") + } + if err := fsys.checkPath(newname); err != nil { + return err + } + + fsys.ensureParent(newname) + + e := &fsEntry{ + path: newname, + mode: disk.StatTypeSymlink | 0o777, + linkTarget: oldname, + } + fsys.addChild(e) + + return nil +} + +// Mknod creates a device, FIFO, or socket. mode must include type bits +// (e.g. disk.StatTypeChrdev | 0o666). +func (fsys *Writer) Mknod(name string, mode uint16, rdev uint32) error { + if fsys.wErr != nil { + return fsys.wErr + } + name = cleanPath(name) + if name == "/" { + return fmt.Errorf("mkfs: cannot mknod at root") + } + if err := fsys.checkPath(name); err != nil { + return err + } + + fsys.ensureParent(name) + + e := &fsEntry{ + path: name, + mode: mode, + rdev: rdev, + } + fsys.addChild(e) + + return nil +} + +// Link creates newname as a hard link to oldname. oldname must refer to an +// existing regular file, character device, block device, FIFO, or socket — +// directories and symlinks cannot be used as hard-link targets. +// +// Both paths may be in different directories; newname's parent directory must +// already exist. 
Link returns an error if oldname is not found, if newname +// already exists, or if the target is a directory or symlink. +// +// After Link, both paths share the same inode in the produced EROFS image. +// The computed nlink on oldname's inode equals 1 + the number of Link calls +// that targeted it (transitively). SetNlink must not be called on any path +// participating in a hardlink group. +func (fsys *Writer) Link(oldname, newname string) error { + if fsys.wErr != nil { + return fsys.wErr + } + oldname = cleanPath(oldname) + newname = cleanPath(newname) + + if oldname == newname { + return fmt.Errorf("mkfs: Link: oldname and newname are the same: %q", oldname) + } + if newname == "/" { + return fmt.Errorf("mkfs: Link: cannot create hardlink at root") + } + if fsys.closed { + return fmt.Errorf("mkfs: FS is closed") + } + + // Resolve the target. It may itself be a hardlink alias — in that case, + // follow to the canonical entry so all aliases share one fsEntry. + target, ok := fsys.byPath[oldname] + if !ok { + return fmt.Errorf("mkfs: Link: %q not found", oldname) + } + if target.linkedTo != nil { + target = target.linkedTo + } + + // Validate target type: no directories, no symlinks. + typ := target.mode & disk.StatTypeMask + if typ == disk.StatTypeDir { + return fmt.Errorf("mkfs: Link: %q is a directory", oldname) + } + if typ == disk.StatTypeSymlink { + return fmt.Errorf("mkfs: Link: %q is a symlink", oldname) + } + + // newname must not already exist. + if _, exists := fsys.byPath[newname]; exists { + return fmt.Errorf("mkfs: Link: %q already exists", newname) + } + + // Ensure newname's parent directory exists. + fsys.ensureParent(newname) + + // Register the alias. The alias fsEntry exists only as a byPath/tree entry; + // it does not duplicate data — it points back to the canonical entry. + alias := &fsEntry{ + path: newname, + linkedTo: target, + } + fsys.addChild(alias) + + // Record the alias path on the canonical entry and bump its nlink. 
+ target.hardlinks = append(target.hardlinks, newname) + // nlink is recomputed from len(hardlinks)+1 in buildErofsTree; clear any + // previously set nlink so it doesn't interfere. + target.nlinkSet = false + + return nil +} + +// --- Writer metadata methods --- + +// Chmod sets permission bits on the named path, preserving type bits. +func (fsys *Writer) Chmod(name string, mode fs.FileMode) error { + if fsys.wErr != nil { + return fsys.wErr + } + e, err := fsys.lookup(name) + if err != nil { + return err + } + perm := goModeToUnixMode(mode) & 0o7777 + e.mode = (e.mode & disk.StatTypeMask) | perm + return nil +} + +// Chown sets the owner UID and GID on the named path. +func (fsys *Writer) Chown(name string, uid, gid int) error { + if fsys.wErr != nil { + return fsys.wErr + } + e, err := fsys.lookup(name) + if err != nil { + return err + } + e.uid = uint32(uid) + e.gid = uint32(gid) + return nil +} + +// Chtimes sets the access and modification times on the named path. +// EROFS only stores mtime; atime is retained for read-back before Close. +func (fsys *Writer) Chtimes(name string, atime time.Time, mtime time.Time) error { + if fsys.wErr != nil { + return fsys.wErr + } + e, err := fsys.lookup(name) + if err != nil { + return err + } + e.atime = uint64(atime.Unix()) + e.atimeNs = uint32(atime.Nanosecond()) + e.mtime = uint64(mtime.Unix()) + e.mtimeNs = uint32(mtime.Nanosecond()) + return nil +} + +// Setxattr sets an extended attribute on the named path. +func (fsys *Writer) Setxattr(name, attr, value string) error { + if fsys.wErr != nil { + return fsys.wErr + } + e, err := fsys.lookup(name) + if err != nil { + return err + } + if e.xattrs == nil { + e.xattrs = make(map[string]string) + } + e.xattrs[attr] = value + return nil +} + +// SetNlink overrides the computed link count on the named path. +// SetNlink must not be called on any path that participates in a hardlink +// group created via Link; use Link to manage link counts in that case. 
+func (fsys *Writer) SetNlink(name string, nlink uint32) error { + if fsys.wErr != nil { + return fsys.wErr + } + e, err := fsys.lookup(name) + if err != nil { + return err + } + // Resolve alias → canonical so the check applies to the real entry. + if e.linkedTo != nil { + e = e.linkedTo + } + if len(e.hardlinks) > 0 { + return fmt.Errorf("mkfs: SetNlink: %q is part of a hardlink group; use Link() to manage link counts", name) + } + e.nlink = nlink + e.nlinkSet = true + return nil +} + +// Remove removes the named path from the writer's tree. It mirrors the +// semantics of [os.Root.Remove]: it is non-recursive, returns +// [fs.ErrNotExist] (wrapped in [fs.PathError]) if the path does not exist, +// and returns an error if the path is a non-empty directory. +// +// Removing a hardlink alias only removes the dirent at that path; the +// underlying inode and other aliases are preserved. Removing the canonical +// path of a hardlink group with surviving aliases promotes the first +// remaining alias to canonical (POSIX unlink semantics). +// +// Remove cannot be used to delete the root. +// +// Recursive removal can be implemented by the caller by listing the +// directory with [fs.ReadDir] (via [Writer.Open]) and calling Remove on +// each descendant before removing the directory itself. +func (fsys *Writer) Remove(name string) error { + if fsys.wErr != nil { + return fsys.wErr + } + if fsys.closed { + return fmt.Errorf("mkfs: FS is closed") + } + name = cleanPath(name) + if name == "/" { + return &fs.PathError{Op: "remove", Path: name, Err: fmt.Errorf("cannot remove root")} + } + e, ok := fsys.byPath[name] + if !ok { + return &fs.PathError{Op: "remove", Path: name, Err: fs.ErrNotExist} + } + // Non-empty directory check. 
+ if e.mode&disk.StatTypeMask == disk.StatTypeDir { + for _, c := range e.children { + if !c.removed { + return &fs.PathError{Op: "remove", Path: name, Err: errDirNotEmpty} + } + } + } + fsys.unlinkOne(e) + return nil +} + +// unlinkOne removes a single entry from the writer's tree, applying POSIX +// unlink semantics for hardlinks. The entry must already be located in +// byPath. Callers are responsible for any caller-visible preconditions +// (e.g. empty-directory check). +func (fsys *Writer) unlinkOne(e *fsEntry) { + switch { + case e.linkedTo != nil: + // Alias: drop the alias path from the canonical's hardlinks list. + canonical := e.linkedTo + for i, p := range canonical.hardlinks { + if p == e.path { + canonical.hardlinks = append(canonical.hardlinks[:i], canonical.hardlinks[i+1:]...) + break + } + } + case len(e.hardlinks) > 0: + // Canonical with surviving aliases: promote first alias. + newCanonicalPath := e.hardlinks[0] + remaining := e.hardlinks[1:] + newCanonical := fsys.byPath[newCanonicalPath] + if newCanonical != nil { + // Copy data-bearing fields from old canonical to the alias entry. + newCanonical.mode = e.mode + newCanonical.uid = e.uid + newCanonical.gid = e.gid + newCanonical.atime = e.atime + newCanonical.atimeNs = e.atimeNs + newCanonical.mtime = e.mtime + newCanonical.mtimeNs = e.mtimeNs + newCanonical.size = e.size + newCanonical.rdev = e.rdev + newCanonical.xattrs = e.xattrs + newCanonical.linkTarget = e.linkTarget + newCanonical.chunks = e.chunks + newCanonical.contiguous = e.contiguous + newCanonical.spoolOff = e.spoolOff + newCanonical.dataStartOff = e.dataStartOff + newCanonical.fileClosed = e.fileClosed + newCanonical.directData = e.directData + newCanonical.metadataOnly = e.metadataOnly + newCanonical.nlink = e.nlink + newCanonical.nlinkSet = e.nlinkSet + newCanonical.linkedTo = nil + newCanonical.hardlinks = remaining + // Repoint remaining aliases at the new canonical. 
+ for _, ap := range remaining { + if a := fsys.byPath[ap]; a != nil { + a.linkedTo = newCanonical + } + } + } + } + e.removed = true + delete(fsys.byPath, e.path) +} + +// --- Writer bulk copy --- + +// CopyFrom walks an fs.FS and adds all entries. +// Opens files for data when Entry.Data is nil. +// Reads symlink targets via readLinker interface when Entry.LinkTarget is empty. +// If src implements blockSizer, the image block size is set accordingly. +func (fsys *Writer) CopyFrom(src fs.FS, opts ...CopyOpt) error { + if fsys.wErr != nil { + return fsys.wErr + } + // Reset per-CopyFrom state. + fsys.copyMetadataOnly = false + fsys.copyMerge = false + fsys.copyDeviceID = 0 + for _, opt := range opts { + opt(fsys) + } + // Detect EROFS image source for direct metadata/chunk extraction. + // The fast path (copyFromImage) only applies to MetadataOnly mode + // where no file data needs to be read — just inodes, dirents, and + // chunk indexes. For non-MetadataOnly, fall through to the fs.WalkDir + // path which opens files for data. + if srcImg, ok := src.(*image); ok { + if err := fsys.setBlockSize(int(srcImg.blockSize())); err != nil { + return err + } + if !fsys.hasBuildTime { + fsys.buildTime = srcImg.buildTime() + fsys.hasBuildTime = true + } + if fsys.copyMetadataOnly { + devBlocks := srcImg.deviceBlocks() + fsys.devices = append(fsys.devices, devBlocks...) 
+ fsys.copyDeviceID = uint16(len(fsys.devices) - len(devBlocks) + 1) + return fsys.copyFromImage(srcImg) + } + } + if bs, ok := src.(blockSizer); ok { + if err := fsys.setBlockSize(int(bs.BlockSize())); err != nil { + return err + } + } + if fsys.copyMetadataOnly { + if db, ok := src.(deviceBlocker); ok { + fsys.devices = append(fsys.devices, db.DeviceBlocks()) + fsys.copyDeviceID = uint16(len(fsys.devices)) + } + } + if bt, ok := src.(buildTimer); ok && !fsys.hasBuildTime { + fsys.buildTime = bt.BuildTime() + fsys.hasBuildTime = true + } + + // seenIno tracks inode identity across the walk for sources that expose + // Stat.Ino (EROFS images) so that hardlinks (multiple paths sharing one + // NID with nlink > 1) are preserved via Link() rather than duplicated. + // Keyed by Ino; value is the first-seen destination path. + var seenIno map[int64]string + + return fs.WalkDir(src, ".", func(fpath string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + info, err := d.Info() + if err != nil { + return fmt.Errorf("stat %s: %w", fpath, err) + } + + // Normalize path to absolute + p := "/" + fpath + if fpath == "." { + p = "/" + } + + // Merge mode: process whiteout markers. + if fsys.copyMerge && p != "/" { + base := path.Base(p) + if strings.HasPrefix(base, whiteoutPrefix) { + if base == opaqueWhiteout { + // Opaque directory: remove all prior children of parent. + fsys.removeChildren(path.Dir(p)) + } else { + // File whiteout: remove the named entry. + target := path.Join(path.Dir(p), base[len(whiteoutPrefix):]) + fsys.remove(target) + } + return nil + } + } + + // Extract extended metadata from Sys(). + var be *builder.Entry + switch sys := info.Sys().(type) { + case *builder.Entry: + be = sys + case *Stat: + // EROFS image source: convert *Stat to *builder.Entry. + // Also detect hardlinks via Ino + Nlink. 
+ if !info.IsDir() && sys.Nlink > 1 && p != "/" { + if seenIno == nil { + seenIno = make(map[int64]string) + } + if firstPath, seen := seenIno[sys.Ino]; seen { + // Second (or later) path to this inode: create a hardlink. + if err := fsys.Link(firstPath, p); err != nil { + return fmt.Errorf("link %s → %s: %w", firstPath, p, err) + } + return nil + } + seenIno[sys.Ino] = p + } + be = &builder.Entry{ + UID: sys.UID, + GID: sys.GID, + Mtime: sys.Mtime, + MtimeNs: sys.MtimeNs, + Nlink: uint32(sys.Nlink), + Rdev: sys.Rdev, + Xattrs: sys.Xattrs, + } + } + + // For regular files, get a data reader. + if info.Mode().IsRegular() && info.Size() > 0 && (be == nil || be.Data == nil) { + // In metadata-only mode, data is referenced via chunk indexes + // from the source — no need to open the file. + if fsys.copyMetadataOnly { + if be == nil { + be = entryFromSys(info) + if be == nil { + be = &builder.Entry{} + } + } + // Generate chunks from DataRange if available. + if len(be.Chunks) == 0 { + if dr, ok := info.(dataRanger); ok { + if ranges := dr.DataRange(); len(ranges) > 0 { + chunks, err := fsys.chunksFromRanges(ranges, info.Size()) + if err != nil { + return fmt.Errorf("chunksFromRanges %s: %w", p, err) + } + be.Chunks = chunks + // Contiguous: a single non-hole range whose total-size + // invariant is satisfied (guaranteed by chunksFromRanges) + // means the file is fully covered by one contiguous extent. + be.Contiguous = len(ranges) == 1 && ranges[0].Offset != holeOffset + } + } + } + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + // For EROFS sources, use direct SectionReader (bypasses + // block-at-a-time reader for contiguous flat-plain data). 
+ if srcImg, ok := src.(*image); ok { + if st, ok := info.Sys().(*Stat); ok { + f := file{img: srcImg, nid: uint64(st.Ino)} + if ino, err := f.readInfo(); err == nil { + if dr := srcImg.openDirect(ino); dr != nil { + if be == nil { + be = &builder.Entry{} + } + be.Data = dr + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + } + } + } + f, err := src.Open(fpath) + if err != nil { + return fmt.Errorf("open %s: %w", fpath, err) + } + if be == nil { + be = entryFromSys(info) + if be == nil { + be = &builder.Entry{} + } + } + be.Data = f.(io.Reader) + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + + // For symlinks without LinkTarget, read via ReadLink interface. + if info.Mode()&fs.ModeSymlink != 0 && (be == nil || be.LinkTarget == "") { + if rl, ok := src.(readLinker); ok { + target, err := rl.ReadLink(fpath) + if err != nil { + return fmt.Errorf("readlink %s: %w", fpath, err) + } + if be == nil { + be = entryFromSys(info) + if be == nil { + be = &builder.Entry{} + } + } + be.LinkTarget = target + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + } + + // For directories, ensure nlink >= 2. + if info.Mode().IsDir() { + if be == nil { + be = entryFromSys(info) + if be == nil { + be = &builder.Entry{Nlink: 2} + } + } + if be.Nlink < 2 { + be.Nlink = 2 + } + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + + // General case: devices, fifos, sockets, etc. + // Wrap in entryFileInfo when be was extracted from Sys() + // so that add() sees the metadata. + if be != nil { + return fsys.add(p, &entryFileInfo{info: info, sys: be}) + } + return fsys.add(p, info) + }) +} + +// --- Writer finalization --- + +// Close writes the EROFS image. The FS must not be used after Close. 
+func (fsys *Writer) Close() error { + if fsys.wErr != nil { + return fsys.wErr + } + if fsys.closed { + return fmt.Errorf("mkfs: FS already closed") + } + fsys.closed = true + + if fsys.spool != nil { + defer func() { _ = fsys.spool.Close() }() + } + + fsys.resolveBlockSize() + + if fsys.dataFile != nil { + // Fill in the reserved device slot 0 with the actual block count. + blocks := (fsys.dataOff + int64(fsys.blockSize) - 1) / int64(fsys.blockSize) + fsys.devices[0] = uint64(blocks) + } + + buildTime := fsys.buildTime + if !fsys.hasBuildTime { + buildTime = uint64(time.Now().Unix()) + } + + // Build erofsEntry tree from the fsEntry tree via BFS. + root := fsys.buildErofsTree() + + var chunkBits uint8 + for cs := fsys.blockSize; cs < 4096; cs <<= 1 { + chunkBits++ + } + + ew := &erofsWriter{ + buildTime: buildTime, + buildTimeNs: fsys.buildTimeNs, + devices: fsys.devices, + blockSize: fsys.blockSize, + chunkBits: chunkBits, + zeroBuf: make([]byte, fsys.blockSize), + } + + ew.planLayout(root) + fixParentNids(root, root) + + return ew.write(fsys.out) +} + +// Stat returns file info for the named path. The name is cleaned the same +// way as other Writer methods (leading slash, no trailing slash). +func (fsys *Writer) Stat(name string) (fs.FileInfo, error) { + name = cleanPath(name) + e, ok := fsys.byPath[name] + if !ok { + return nil, &fs.PathError{Op: "stat", Path: name, Err: fs.ErrNotExist} + } + return &writerFileInfo{entry: e}, nil +} + +// Open opens the named file for reading. For regular files, the file must +// have been closed (data finalized) before it can be opened for reading. +// For directories, the returned file implements fs.ReadDirFile. 
+func (fsys *Writer) Open(name string) (fs.File, error) { + name = cleanPath(name) + e, ok := fsys.byPath[name] + if !ok { + return nil, &fs.PathError{Op: "open", Path: name, Err: fs.ErrNotExist} + } + + typ := e.mode & disk.StatTypeMask + switch typ { + case disk.StatTypeDir: + return &readDir{fsys: fsys, entry: e}, nil + + case disk.StatTypeReg: + if !e.fileClosed { + return nil, &fs.PathError{Op: "open", Path: name, Err: fmt.Errorf("file not yet closed for writing")} + } + var sr *io.SectionReader + if fsys.dataFile != nil { + sr = io.NewSectionReader(fsys.dataFile, e.dataStartOff, int64(e.size)) + } else if fsys.spool != nil && e.size > 0 { + sr = io.NewSectionReader(fsys.spool, e.dataStartOff, int64(e.size)) + } + return &readFile{entry: e, reader: sr}, nil + + default: + // Symlinks, devices, etc.: stat-only, no readable data. + return &readFile{entry: e}, nil + } +} + +// --- File methods --- + +// Write appends data to the file. +func (f *File) Write(p []byte) (int, error) { + if f.closed { + return 0, fmt.Errorf("mkfs: write to closed file") + } + + if f.fs.dataFile != nil { + n, err := f.fs.dataFile.Write(p) + f.written += int64(n) + f.fs.dataOff += int64(n) + return n, err + } + + n, err := f.fs.spool.Write(p) + f.written += int64(n) + f.fs.spoolOff += int64(n) + return n, err +} + +// ReadFrom implements io.ReaderFrom, allowing io.Copy(f, src) to use +// a shared buffer instead of allocating a new 32KB buffer per call. +func (f *File) ReadFrom(r io.Reader) (int64, error) { + buf := f.fs.copyBuf() + var written int64 + for { + nr, er := r.Read(buf) + if nr > 0 { + nw, ew := f.Write(buf[:nr]) + written += int64(nw) + if ew != nil { + return written, ew + } + } + if er != nil { + if er == io.EOF { + return written, nil + } + return written, er + } + } +} + +// Close commits the file entry. For data file mode, pads to block +// boundary and records chunk indexes. 
+func (f *File) Close() error { + if f.closed { + return fmt.Errorf("mkfs: file already closed") + } + f.closed = true + f.entry.fileClosed = true + f.entry.size = uint64(f.written) + + if f.fs.dataFile != nil { + return f.closeDataFile() + } + return nil +} + +// Chmod sets permission bits on the file, matching os.File.Chmod. +func (f *File) Chmod(mode fs.FileMode) error { + perm := goModeToUnixMode(mode) & 0o7777 + f.entry.mode = (f.entry.mode & disk.StatTypeMask) | perm + return nil +} + +// Chown sets the owner UID and GID on the file, matching os.File.Chown. +func (f *File) Chown(uid, gid int) error { + f.entry.uid = uint32(uid) + f.entry.gid = uint32(gid) + return nil +} + +// Chtimes sets access and modification times on the open file. EROFS only +// stores mtime on disk; atime is retained on the in-memory entry for +// read-back before [Writer.Close]. +func (f *File) Chtimes(atime, mtime time.Time) error { + f.entry.atime = uint64(atime.Unix()) + f.entry.atimeNs = uint32(atime.Nanosecond()) + f.entry.mtime = uint64(mtime.Unix()) + f.entry.mtimeNs = uint32(mtime.Nanosecond()) + return nil +} + +// --- Internal types --- + +// fsEntry is the in-memory representation of a filesystem entry held by Writer. +type fsEntry struct { + path string + mode uint16 + uid, gid uint32 + atime uint64 + atimeNs uint32 + mtime uint64 + mtimeNs uint32 + nlink uint32 + nlinkSet bool // true if SetNlink was called + size uint64 + rdev uint32 + xattrs map[string]string + linkTarget string + chunks []builder.Chunk + contiguous bool // data blocks are contiguous; flat-plain is sufficient + + // Hardlink support: linkedTo points to the canonical fsEntry this entry + // is a hardlink of. hardlinks collects alias paths on the canonical entry. + // Only one of these is non-nil/non-empty per entry. 
+ linkedTo *fsEntry // non-nil if this is an alias (hardlink) of another entry + hardlinks []string // alias paths on the canonical entry; nil if no hardlinks + + // Tree structure — maintained during add/remove. + parent *fsEntry + children []*fsEntry + + // data location in spool file + spoolOff int64 + dataStartOff int64 // byte offset where file data begins (spool or data file) + fileClosed bool // true after File.Close() is called + directData io.Reader // bypasses spool; set by add() for source-provided data + + removed bool // true if removed by a whiteout in a merge layer + metadataOnly bool // from a metadata-only CopyFrom; use chunk-based layout +} + +// createOptions holds the parsed option values for Create. +type createOptions struct { + buildTime uint64 + buildTimeNs uint32 + hasBuildTime bool + blockSize int // 0 = use default + dataFile *os.File // external data file for metadata-only mode + tempDir string // temp directory for spool file +} + +// blockSizer may be implemented by an fs.FS to declare its block size. +// Writer.CopyFrom uses this to set the image block size automatically. +type blockSizer interface { + BlockSize() uint32 +} + +// buildTimer may be implemented by an fs.FS to suggest a build timestamp. +// If the caller hasn't set WithBuildTime, CopyFrom uses this value. +// Entries whose mtime matches the build time can use compact (32-byte) inodes. +type buildTimer interface { + BuildTime() uint64 +} + +// deviceBlocker may be implemented by an fs.FS to declare the total +// block count of its backing device. Writer.CopyFrom uses this to +// configure the device slot for metadata-only mode. +type deviceBlocker interface { + DeviceBlocks() uint64 +} + +// readLinker is an interface for filesystems that support reading symlink targets. 
+type readLinker interface { + ReadLink(name string) (string, error) +} + +// dataRanger may be implemented by fs.FileInfo to provide the physical +// location of uncompressed file data in backing devices. CopyFrom checks +// this via type assertion in metadata-only mode to build chunk indexes +// without requiring the caller to construct internal chunk types. +// +// This interface should only be implemented for files whose device data +// is stored verbatim (uncompressed). For compressed files, return nil or +// do not implement the interface. In full-image mode CopyFrom then falls +// back to reading through Open(), which decompresses transparently. In +// MetadataOnly mode there is no such fallback: the file is stored as a +// chunk-based inode with no physical mappings (all holes). +type dataRanger interface { + DataRange() []DataRange +} + +// --- Internal types --- + +// erofsEntry is the internal representation of a file/dir/symlink used by the builder. +type erofsEntry struct { + mode uint16 + uid uint32 + gid uint32 + mtime uint64 + mtimeNs uint32 + nlink uint32 + size uint64 + rdev uint32 + + name string + path string + children []*erofsEntry + symTarget string + + // linkTo is non-nil for hardlink alias entries. These entries are only + // emitted as dirents pointing at linkTo's NID; no inode is written for them. 
+ linkTo *erofsEntry + + // For regular files — metadata-only mode + chunks []builder.Chunk + contiguous bool // data blocks are contiguous; use large chunk size + chunkBits uint8 // per-entry chunk bits (0 = use global) + metadataOnly bool // chunk-based layout even without chunks + + // For regular files — full-image mode + data io.Reader + + // Extended attributes + xattrs map[string]string + + // EROFS layout (assigned during planning) + nid uint64 + parentNid uint64 + erofsFileType uint8 + layout uint8 + compact bool // true = 32-byte compact inode; false = 64-byte extended + xattrSize int // bytes of xattr area (0 if no xattrs) + trailingSize int + + // Data block address for flat-plain files (full-image mode) + dataBlkAddr uint32 +} + +// --- Internal helpers --- + +// cleanPath normalizes a filesystem path to an absolute rooted form. +func cleanPath(p string) string { + if p == "" || p == "." || p == "/" { + return "/" + } + p = path.Clean(p) + if !strings.HasPrefix(p, "/") { + p = "/" + p + } + return p +} + +// fixParentNids sets the parent NID in the ".." dirent for all directories. +// This must be called after planLayout has assigned NIDs. +func fixParentNids(e *erofsEntry, parent *erofsEntry) { + e.parentNid = parent.nid + for _, c := range e.children { + if c.mode&disk.StatTypeMask == disk.StatTypeDir { + fixParentNids(c, e) + } + } +} + +// entryFileInfo wraps an fs.FileInfo but overrides Sys() to return a *builder.Entry. +type entryFileInfo struct { + info fs.FileInfo + sys *builder.Entry +} + +func (fi *entryFileInfo) Name() string { return fi.info.Name() } +func (fi *entryFileInfo) Size() int64 { return fi.info.Size() } +func (fi *entryFileInfo) Mode() fs.FileMode { return fi.info.Mode() } +func (fi *entryFileInfo) ModTime() time.Time { return fi.info.ModTime() } +func (fi *entryFileInfo) IsDir() bool { return fi.info.IsDir() } +func (fi *entryFileInfo) Sys() any { return fi.sys } + +// writerFileInfo implements fs.FileInfo for an fsEntry. 
+type writerFileInfo struct { + entry *fsEntry +} + +func (fi *writerFileInfo) Name() string { return path.Base(fi.entry.path) } +func (fi *writerFileInfo) Size() int64 { return int64(fi.entry.size) } +func (fi *writerFileInfo) Mode() fs.FileMode { return disk.EroFSModeToGoFileMode(fi.entry.mode) } +func (fi *writerFileInfo) ModTime() time.Time { + return time.Unix(int64(fi.entry.mtime), int64(fi.entry.mtimeNs)) +} +func (fi *writerFileInfo) IsDir() bool { return fi.entry.mode&disk.StatTypeMask == disk.StatTypeDir } +func (fi *writerFileInfo) Sys() any { return nil } + +// readFile implements fs.File for reading back a finalized file's data. +type readFile struct { + entry *fsEntry + reader *io.SectionReader // nil for empty files or non-regular types + closed bool +} + +func (f *readFile) Stat() (fs.FileInfo, error) { + return &writerFileInfo{entry: f.entry}, nil +} + +func (f *readFile) Read(p []byte) (int, error) { + if f.closed { + return 0, fmt.Errorf("mkfs: read from closed file") + } + if f.reader == nil { + return 0, io.EOF + } + return f.reader.Read(p) +} + +func (f *readFile) Close() error { + if f.closed { + return fmt.Errorf("mkfs: file already closed") + } + f.closed = true + return nil +} + +// readDir implements fs.ReadDirFile for a directory in Writer. 
+type readDir struct { + fsys *Writer + entry *fsEntry + children []fs.DirEntry // lazily populated + offset int + closed bool +} + +func (d *readDir) Stat() (fs.FileInfo, error) { + return &writerFileInfo{entry: d.entry}, nil +} + +func (d *readDir) Read([]byte) (int, error) { + return 0, &fs.PathError{Op: "read", Path: d.entry.path, Err: fmt.Errorf("is a directory")} +} + +func (d *readDir) Close() error { + if d.closed { + return fmt.Errorf("mkfs: dir already closed") + } + d.closed = true + return nil +} + +func (d *readDir) ReadDir(n int) ([]fs.DirEntry, error) { + if d.closed { + return nil, fmt.Errorf("mkfs: read from closed dir") + } + if d.children == nil { + d.children = d.collectChildren() + } + + if n <= 0 { + entries := d.children[d.offset:] + d.offset = len(d.children) + return entries, nil + } + + remaining := d.children[d.offset:] + if len(remaining) == 0 { + return nil, io.EOF + } + if n > len(remaining) { + n = len(remaining) + } + entries := remaining[:n] + d.offset += n + if d.offset >= len(d.children) { + return entries, io.EOF + } + return entries, nil +} + +func (d *readDir) collectChildren() []fs.DirEntry { + children := make([]fs.DirEntry, 0, len(d.entry.children)) + for _, e := range d.entry.children { + if e.removed { + continue + } + children = append(children, &dirEntry{entry: e}) + } + sort.Slice(children, func(i, j int) bool { + return children[i].Name() < children[j].Name() + }) + return children +} + +// dirEntry implements fs.DirEntry for an fsEntry. +type dirEntry struct { + entry *fsEntry +} + +func (de *dirEntry) Name() string { return path.Base(de.entry.path) } +func (de *dirEntry) IsDir() bool { return de.entry.mode&disk.StatTypeMask == disk.StatTypeDir } +func (de *dirEntry) Type() fs.FileMode { return disk.EroFSModeToGoFileMode(de.entry.mode).Type() } +func (de *dirEntry) Info() (fs.FileInfo, error) { return &writerFileInfo{entry: de.entry}, nil } + +// add adds a single entry. 
Mode and Size come from info; extended metadata +// comes from info.Sys(). Checks Sys() for *builder.Entry first, then +// platform-specific stat types as a fallback for plain fs.FS sources. +func (fsys *Writer) add(p string, info fs.FileInfo) error { + p = cleanPath(p) + mode := goModeToUnixMode(info.Mode()) + size := uint64(info.Size()) + typ := mode & disk.StatTypeMask + + be := entryFromSys(info) + if be == nil { + be = &builder.Entry{} + } + + if p == "/" { + root := fsys.root + root.mode = mode + root.uid = be.UID + root.gid = be.GID + root.mtime = be.Mtime + root.mtimeNs = be.MtimeNs + if be.Nlink > 0 { + root.nlink = be.Nlink + root.nlinkSet = true + } + root.xattrs = be.Xattrs + return nil + } + + fsys.ensureParent(p) + + fe := &fsEntry{ + path: p, + mode: mode, + uid: be.UID, + gid: be.GID, + mtime: be.Mtime, + mtimeNs: be.MtimeNs, + size: size, + rdev: be.Rdev, + xattrs: be.Xattrs, + linkTarget: be.LinkTarget, + chunks: be.Chunks, + contiguous: be.Contiguous, + } + if be.Nlink > 0 { + fe.nlink = be.Nlink + fe.nlinkSet = true + } + + // Handle duplicate paths (overwrite semantics). + if existing, ok := fsys.byPath[p]; ok { + // Preserve tree linkage when overwriting. + savedParent := existing.parent + savedChildren := existing.children + *existing = *fe + existing.parent = savedParent + existing.children = savedChildren + fe = existing + } else { + fsys.addChild(fe) + } + + if fsys.copyMetadataOnly { + fe.metadataOnly = true + // Remap chunk DeviceIDs from source-relative to absolute. + // For single-device sources, all chunks use DeviceID=1 + // and get mapped to copyDeviceID. + // For multi-device sources (e.g. EROFS images), chunks have + // DeviceIDs 1..N that get offset by copyDeviceID-1. + if fsys.copyDeviceID > 0 { + offset := fsys.copyDeviceID - 1 + for i := range fe.chunks { + fe.chunks[i].DeviceID += offset + } + } + } + + // Write regular file data. + // Skip entirely in metadata-only mode. 
+ needData := typ == disk.StatTypeReg && size > 0 && be.Data != nil && + !fsys.copyMetadataOnly + if needData { + // Data is stored locally; clear any source chunk mappings. + fe.chunks = nil + fe.contiguous = false + if fsys.dataFile != nil { + // Data file mode: copy through File for block-aligned padding and chunk recording. + f := &File{fs: fsys, entry: fe} + f.dataStartOff = fsys.dataOff + fe.dataStartOff = fsys.dataOff + if _, err := f.ReadFrom(be.Data); err != nil { + return err + } + if err := f.Close(); err != nil { + return err + } + } else { + // Spool mode: keep a direct reference to avoid copying. + fe.directData = be.Data + fe.fileClosed = true + } + } else { + fe.fileClosed = true + } + + return nil +} + +// checkPath validates that a path hasn't already been registered. +func (fsys *Writer) checkPath(name string) error { + if fsys.closed { + return fmt.Errorf("mkfs: FS is closed") + } + if _, ok := fsys.byPath[name]; ok { + return fmt.Errorf("mkfs: duplicate path %q", name) + } + return nil +} + +// ensureParent creates implicit parent directories for name. +func (fsys *Writer) ensureParent(name string) { + dir := path.Dir(name) + if dir == "/" { + return + } + // Walk up to find existing ancestors. + var missing []string + for d := dir; d != "/"; d = path.Dir(d) { + if _, ok := fsys.byPath[d]; ok { + break + } + missing = append(missing, d) + } + // Create in top-down order. + for i := len(missing) - 1; i >= 0; i-- { + d := missing[i] + e := &fsEntry{ + path: d, + mode: disk.StatTypeDir | 0o755, + } + fsys.addChild(e) + } +} + +// addChild registers an entry in the tree and byPath map. +// The entry's parent is resolved from its path. +func (fsys *Writer) addChild(e *fsEntry) { + parent := fsys.byPath[path.Dir(e.path)] + if parent == nil { + parent = fsys.root + } + e.parent = parent + parent.children = append(parent.children, e) + fsys.byPath[e.path] = e +} + +// remove marks an entry and all its descendants as removed. 
+// Used by Merge to process whiteout deletions. +func (fsys *Writer) remove(p string) { + p = cleanPath(p) + e, ok := fsys.byPath[p] + if !ok { + return + } + e.removed = true + delete(fsys.byPath, p) + if e.mode&disk.StatTypeMask == disk.StatTypeDir { + fsys.removeSubtree(e) + } +} + +// removeChildren marks all descendants of a directory as removed. +// The directory itself is not removed. +func (fsys *Writer) removeChildren(dir string) { + dir = cleanPath(dir) + e, ok := fsys.byPath[dir] + if !ok { + return + } + fsys.removeSubtree(e) +} + +// removeSubtree recursively marks all descendants of e as removed. +func (fsys *Writer) removeSubtree(e *fsEntry) { + for _, c := range e.children { + if !c.removed { + c.removed = true + delete(fsys.byPath, c.path) + if c.mode&disk.StatTypeMask == disk.StatTypeDir { + fsys.removeSubtree(c) + } + } + } +} + +// buildErofsTree converts the fsEntry tree into an erofsEntry tree via BFS. +// Children are sorted for deterministic output. The Writer is consumed. +// +// Hardlink aliases (fsEntry.linkedTo != nil) do not produce their own inode. +// Instead they contribute a dirent in their parent directory that points at +// the canonical entry's erofsEntry (via erofsEntry.linkTo). +func (fsys *Writer) buildErofsTree() *erofsEntry { + type pair struct { + fs *fsEntry + er *erofsEntry + } + + // Map from canonical fsEntry to its erofsEntry, for hardlink alias resolution. + canonical := make(map[*fsEntry]*erofsEntry) + + rootEr := fsys.fsToErofs(fsys.root) + canonical[fsys.root] = rootEr + queue := []pair{{fsys.root, rootEr}} + + for len(queue) > 0 { + cur := queue[0] + queue = queue[1:] + + // Count child directories for nlink (aliases are never dirs). 
+ var childDirs uint32 + for _, c := range cur.fs.children { + if !c.removed && c.linkedTo == nil && c.mode&disk.StatTypeMask == disk.StatTypeDir { + childDirs++ + } + } + if !cur.fs.nlinkSet && cur.fs.mode&disk.StatTypeMask == disk.StatTypeDir { + cur.er.nlink = 2 + childDirs + } + + // Convert and enqueue children. + if len(cur.fs.children) > 0 { + cur.er.children = make([]*erofsEntry, 0, len(cur.fs.children)) + } + for _, c := range cur.fs.children { + if c.removed { + continue + } + if c.linkedTo != nil { + // Hardlink alias: create a stub erofsEntry that references the + // canonical erofsEntry. The canonical entry may not be converted + // yet (it lives in a different directory), so we resolve lazily + // after the full BFS using the canonical map. + alias := &erofsEntry{ + name: path.Base(c.path), + path: c.path, + } + // Store the canonical fsEntry pointer in a side-channel so we + // can patch alias.linkTo after the BFS. + // We use a temporary trick: store it in linkTo as *erofsEntry + // only after the canonical has been created. + // For now, remember (alias, c.linkedTo) to patch later. + cur.er.children = append(cur.er.children, alias) + // We need the canonical erofsEntry — look it up or defer. + if ce, ok := canonical[c.linkedTo]; ok { + alias.linkTo = ce + alias.erofsFileType = ce.erofsFileType + } else { + // The canonical entry hasn't been created yet (it's in a + // directory later in the BFS). We'll patch it in a second + // pass below. Temporarily stash the fsEntry in a map. + _ = c // patched below via patchList + } + continue + } + ent := fsys.fsToErofs(c) + canonical[c] = ent + cur.er.children = append(cur.er.children, ent) + if c.mode&disk.StatTypeMask == disk.StatTypeDir { + queue = append(queue, pair{c, ent}) + } + } + + // Sort children for deterministic output. 
+ sort.Slice(cur.er.children, func(i, j int) bool { + return cur.er.children[i].name < cur.er.children[j].name + }) + } + + // Second pass: patch any alias entries whose canonical erofsEntry was not + // yet available during BFS (cross-directory hardlinks where the target + // directory appears later in BFS order). + fsys.patchHardlinkAliases(rootEr, canonical) + + // Third pass: set nlink on canonical entries that have hardlink aliases. + for fs, er := range canonical { + if len(fs.hardlinks) > 0 && !fs.nlinkSet { + er.nlink = uint32(len(fs.hardlinks) + 1) + } + } + + return rootEr +} + +// patchHardlinkAliases resolves any alias erofsEntry nodes whose linkTo was +// not yet known during the BFS (because the canonical entry was in a later +// directory). It does a DFS over the erofsEntry tree. +func (fsys *Writer) patchHardlinkAliases(e *erofsEntry, canonical map[*fsEntry]*erofsEntry) { + for _, c := range e.children { + if c.linkTo == nil && c.mode == 0 && len(c.children) == 0 { + // This is an unpatched alias stub: look up via byPath. + if fe, ok := fsys.byPath[c.path]; ok && fe.linkedTo != nil { + if ce, ok := canonical[fe.linkedTo]; ok { + c.linkTo = ce + c.erofsFileType = ce.erofsFileType + } + } + } + if c.mode&disk.StatTypeMask == disk.StatTypeDir { + fsys.patchHardlinkAliases(c, canonical) + } + } +} + +// fsToErofs converts a single fsEntry to an erofsEntry, resolving data readers. 
+func (fsys *Writer) fsToErofs(e *fsEntry) *erofsEntry { + var nlink uint32 + switch { + case e.nlinkSet: + nlink = e.nlink + case e.mode&disk.StatTypeMask == disk.StatTypeDir: + nlink = 2 // adjusted by buildErofsTree + default: + nlink = 1 + } + + var data io.Reader + if fsys.dataFile == nil && len(e.chunks) == 0 && !e.metadataOnly && + e.mode&disk.StatTypeMask == disk.StatTypeReg && e.size > 0 { + if e.directData != nil { + data = e.directData + } else if fsys.spool != nil { + data = io.NewSectionReader(fsys.spool, e.spoolOff, int64(e.size)) + } + } + + return &erofsEntry{ + mode: e.mode, + uid: e.uid, + gid: e.gid, + mtime: e.mtime, + mtimeNs: e.mtimeNs, + nlink: nlink, + size: e.size, + rdev: e.rdev, + name: path.Base(e.path), + path: e.path, + symTarget: e.linkTarget, + chunks: e.chunks, + contiguous: e.contiguous, + metadataOnly: e.metadataOnly, + data: data, + xattrs: e.xattrs, + erofsFileType: modeToFileType(e.mode), + } +} + +// setBlockSize sets the image block size. If already set to a different +// value, it returns an error. Safe to call multiple times with the same value. +func (fsys *Writer) setBlockSize(n int) error { + if n < minBlockSize || n > maxBlockSize { + return fmt.Errorf("mkfs: invalid block size %d: must be between %d and %d", n, minBlockSize, maxBlockSize) + } + if bits.OnesCount(uint(n)) != 1 { + return fmt.Errorf("mkfs: invalid block size %d: must be a power of two", n) + } + if fsys.blockSize == 0 { + fsys.blockSize = n + return nil + } + if fsys.blockSize != n { + return fmt.Errorf("mkfs: block size conflict: already %d, requested %d", fsys.blockSize, n) + } + return nil +} + +// resolveBlockSize returns the block size, defaulting to 4096 if unset. +func (fsys *Writer) resolveBlockSize() int { + if fsys.blockSize == 0 { + fsys.blockSize = defaultBlockSize + } + return fsys.blockSize +} + +// copyBuf returns a shared 32KB buffer for io.Copy operations. 
func (fsys *Writer) copyBuf() []byte {
	// Allocated on first use and cached on the Writer for later calls.
	if fsys.cpBuf == nil {
		fsys.cpBuf = make([]byte, 32*1024)
	}
	return fsys.cpBuf
}

// zeroPad returns a shared zero buffer sized to the resolved block size.
// The buffer is allocated on first use; callers slice it to the pad length
// they need.
func (fsys *Writer) zeroPad() []byte {
	if fsys.padBuf == nil {
		fsys.padBuf = make([]byte, fsys.resolveBlockSize())
	}
	return fsys.padBuf
}

// chunksFromRanges converts DataRange entries into internal chunk entries.
// fileSize is the logical size of the file; the sum of all range Sizes must
// equal fileSize exactly, or an error is returned.
//
// The block size used is the Writer's resolved block size. DataRange.Device
// values are offset by 1 to produce chunk DeviceIDs: DataRange Device 0
// becomes chunk DeviceID 1 (the first extra device), matching the EROFS
// convention where DeviceID 0 is the primary image.
//
// Validation rules:
//   - sum(Size) == fileSize; a mismatch is rejected.
//   - r.Size > 0 for every entry.
//   - Hole entries (Offset == -1) emit [builder.NullPhysicalBlock] chunks.
//     Hole Size must be block-aligned for non-final entries; the final entry
//     may end mid-block to match the file tail.
//   - For data entries: r.Offset >= 0 and block-aligned; r.Device == 0.
//   - For non-final data entries: r.Size must be a multiple of blockSize.
//     The final entry may have a partial last block to match the file tail.
//
// A range spanning more than 65535 blocks is split into several chunks
// because the chunk Count is stored as a uint16.
func (fsys *Writer) chunksFromRanges(ranges []DataRange, fileSize int64) ([]builder.Chunk, error) {
	blockSize := uint64(fsys.resolveBlockSize())

	// Validate total coverage first.
	var total int64
	for _, r := range ranges {
		total += r.Size
	}
	if total != fileSize {
		return nil, fmt.Errorf("DataRange total size %d does not match file size %d", total, fileSize)
	}

	last := len(ranges) - 1
	var chunks []builder.Chunk
	for i, r := range ranges {
		if r.Size <= 0 {
			return nil, fmt.Errorf("DataRange[%d]: non-positive Size %d", i, r.Size)
		}
		// Non-final entries must be block-aligned in size; the final entry may
		// end mid-block to match the file tail.
		if i < last && uint64(r.Size)%blockSize != 0 {
			return nil, fmt.Errorf("DataRange[%d]: non-final Size %d is not block-aligned (block size %d)", i, r.Size, blockSize)
		}
		if r.Offset == holeOffset {
			// Hole: emit NullPhysicalBlock chunks covering the hole span.
			// The block count is rounded up so a partial tail block is covered.
			totalBlocks := (uint64(r.Size) + blockSize - 1) / blockSize
			for totalBlocks > 0 {
				count := totalBlocks
				if count > 65535 {
					count = 65535
				}
				chunks = append(chunks, builder.Chunk{
					PhysicalBlock: builder.NullPhysicalBlock,
					Count:         uint16(count),
				})
				totalBlocks -= count
			}
			continue
		}
		if r.Offset < 0 {
			return nil, fmt.Errorf("DataRange[%d]: negative Offset %d", i, r.Offset)
		}
		if uint64(r.Offset)%blockSize != 0 {
			return nil, fmt.Errorf("DataRange[%d]: Offset %d is not block-aligned (block size %d)", i, r.Offset, blockSize)
		}
		// Non-EROFS sources register exactly one device via DeviceBlocks();
		// only Device=0 is valid. Device=0xFFFF would also wrap deviceID to 0
		// (the primary image), producing an invalid mapping.
		if r.Device != 0 {
			return nil, fmt.Errorf("DataRange[%d]: Device %d out of range (source declared one device, only Device=0 is valid)", i, r.Device)
		}
		deviceID := r.Device + 1
		startBlock := uint64(r.Offset) / blockSize
		totalBlocks := (uint64(r.Size) + blockSize - 1) / blockSize
		// Emit consecutive chunks of at most 65535 blocks each.
		for totalBlocks > 0 {
			count := totalBlocks
			if count > 65535 {
				count = 65535
			}
			chunks = append(chunks, builder.Chunk{
				PhysicalBlock: startBlock,
				Count:         uint16(count),
				DeviceID:      deviceID,
			})
			startBlock += count
			totalBlocks -= count
		}
	}
	return chunks, nil
}

// ensureSpool lazily creates the spool temp file.
// The file is created in fsys.tempDir and is a no-op once a spool exists.
func (fsys *Writer) ensureSpool() error {
	if fsys.spool != nil {
		return nil
	}
	tmp, err := os.CreateTemp(fsys.tempDir, "erofs-mkfs-*")
	if err != nil {
		return fmt.Errorf("mkfs: create spool: %w", err)
	}
	_ = os.Remove(tmp.Name()) // unlink immediately; fd keeps data accessible
	fsys.spool = tmp
	return nil
}

// lookup returns the registered entry for name after cleaning the path, or
// an error when no such path is registered.
func (fsys *Writer) lookup(name string) (*fsEntry, error) {
	name = cleanPath(name)
	e, ok := fsys.byPath[name]
	if !ok {
		return nil, fmt.Errorf("mkfs: path not found %q", name)
	}
	return e, nil
}

// closeDataFile pads the data file to a block boundary and records chunks.
// Nothing is done for a file with no written bytes.
func (f *File) closeDataFile() error {
	if f.written == 0 {
		return nil
	}

	// Pad to block boundary.
	bs := int64(f.fs.resolveBlockSize())
	rem := f.fs.dataOff % bs
	if rem != 0 {
		padSize := bs - rem
		n, err := f.fs.dataFile.Write(f.fs.zeroPad()[:padSize])
		// Advance by what was actually written, even on a failed write.
		f.fs.dataOff += int64(n)
		if err != nil {
			return fmt.Errorf("mkfs: pad data file: %w", err)
		}
	}

	// Compute chunks from the start offset and written bytes.
	startBlock := uint64(f.dataStartOff) / uint64(f.fs.resolveBlockSize())
	totalBlocks := (uint64(f.written) + uint64(f.fs.resolveBlockSize()) - 1) / uint64(f.fs.resolveBlockSize())

	// Split into chunks of at most 65535 blocks (Count is a uint16).
	for totalBlocks > 0 {
		count := totalBlocks
		if count > 65535 {
			count = 65535
		}
		f.entry.chunks = append(f.entry.chunks, builder.Chunk{
			PhysicalBlock: startBlock,
			Count:         uint16(count),
			DeviceID:      1,
		})
		startBlock += count
		totalBlocks -= count
	}

	return nil
}

// --- Constants ---

const (
	minBlockSize     = 512  // smallest block size accepted by setBlockSize
	defaultBlockSize = 4096 // used by resolveBlockSize when none was set
	nullAddr         = 0xFFFFFFFF // marks a hole/sparse chunk

	// Overlay whiteout markers (AUFS convention used by OCI layers).
	whiteoutPrefix = ".wh."
	opaqueWhiteout = ".wh..wh..opq"
)

// blkBits returns log2(blockSize).
// It counts trailing zero bits, so the result is exact only when blockSize
// is a power of two.
func blkBits(blockSize int) uint8 {
	return uint8(bits.TrailingZeros(uint(blockSize)))
}
diff --git a/vendor/github.com/erofs/go-erofs/mkfs_darwin.go b/vendor/github.com/erofs/go-erofs/mkfs_darwin.go
new file mode 100644
index 0000000..dbb418a
--- /dev/null
+++ b/vendor/github.com/erofs/go-erofs/mkfs_darwin.go
@@ -0,0 +1,26 @@
package erofs

import (
	"io/fs"
	"syscall"

	"github.com/erofs/go-erofs/internal/builder"
)

// entryFromSys extracts metadata from info.Sys(). A *builder.Entry is
// returned as-is; a *syscall.Stat_t is converted field by field. Any other
// type yields nil.
func entryFromSys(info fs.FileInfo) *builder.Entry {
	switch sys := info.Sys().(type) {
	case *builder.Entry:
		return sys
	case *syscall.Stat_t:
		return &builder.Entry{
			UID:     sys.Uid,
			GID:     sys.Gid,
			Mtime:   uint64(sys.Mtimespec.Sec),
			MtimeNs: uint32(sys.Mtimespec.Nsec),
			Nlink:   uint32(sys.Nlink),
			Rdev:    uint32(sys.Rdev),
		}
	default:
		return nil
	}
}
diff --git a/vendor/github.com/erofs/go-erofs/mkfs_image.go b/vendor/github.com/erofs/go-erofs/mkfs_image.go
new file mode 100644
index 0000000..0c33b5b
--- /dev/null
+++ b/vendor/github.com/erofs/go-erofs/mkfs_image.go
@@ -0,0 +1,526 @@
package erofs

import (
	"encoding/binary"
	"fmt"
	"io"
	"path"

	"github.com/erofs/go-erofs/internal/builder"
	"github.com/erofs/go-erofs/internal/disk"
)

//
// newMetaReader returns an at() function backed by an eagerly-read
// metadata buffer plus an on-demand block cache for data blocks
// outside the metadata region.
//
// The metadata region is [metaStart, totalBytes) of ra and is read once, up
// front. at(off) returns the bytes from off to the end of the metadata region
// when off lies inside it; for an offset below metaStart it returns the bytes
// from off to the end of the blockSize-aligned block containing off (clipped
// to totalBytes), loading and caching that block on first use. at returns nil
// for offsets outside [0, totalBytes), when a read fails, or when blockSize
// is not positive. If the metadata region is empty or cannot be read in full,
// the returned function always yields nil.
//
// Returned slices alias the internal buffers and must not be modified.
func newMetaReader(ra io.ReaderAt, metaStart, totalBytes int64, blockSize int) func(int64) []byte {
	unavailable := func(int64) []byte { return nil }

	// readFull fills buf from ra at off. io.ReaderAt allows a read that ends
	// exactly at the end of the source to report io.EOF together with a full
	// buffer, so only short reads and other errors count as failures.
	readFull := func(buf []byte, off int64) bool {
		n, err := ra.ReadAt(buf, off)
		return n == len(buf) && (err == nil || err == io.EOF)
	}

	metaSize := totalBytes - metaStart
	if metaSize <= 0 {
		return unavailable
	}
	metaBuf := make([]byte, metaSize)
	if !readFull(metaBuf, metaStart) {
		return unavailable
	}

	cache := make(map[int64][]byte)
	bs := int64(blockSize)

	return func(off int64) []byte {
		// Fast path: offset in metadata region.
		if off >= metaStart {
			o := off - metaStart
			if o >= int64(len(metaBuf)) {
				return nil
			}
			return metaBuf[o:]
		}
		// Outside metadata — flat-plain data block. Load on demand.
		// A non-positive block size would divide by zero below.
		if off < 0 || off >= totalBytes || bs <= 0 {
			return nil
		}
		blkAddr := off - off%bs
		if cached, ok := cache[blkAddr]; ok {
			return cached[off-blkAddr:]
		}
		// The last block may be cut short by the end of the image.
		sz := bs
		if blkAddr+sz > totalBytes {
			sz = totalBytes - blkAddr
		}
		buf := make([]byte, sz)
		if !readFull(buf, blkAddr) {
			return nil
		}
		cache[blkAddr] = buf
		return buf[off-blkAddr:]
	}
}

// imgQEntry is a BFS queue entry for the image metadata walk.
type imgQEntry struct {
	nid  uint64 // inode number (NID) to parse
	path string // absolute destination path for that inode
}

// copyFromImage is a fast path for CopyFrom when the source is an *image.
// Instead of walking via the fs.FS interface (which does per-inode ReadAt
// syscalls), it reads the entire metadata area into memory and parses
// inodes, directory entries, xattrs, and chunk indexes directly from the
// buffer. This reduces thousands of syscalls to a single ReadAt.
//
// Hardlinks are preserved: when two directory entries share the same NID and
// the inode is not a directory, the second (and subsequent) paths are
// registered via Writer.Link rather than as independent inodes.
func (fsys *Writer) copyFromImage(img *image) error {
	metaStart := img.metaStartPos()
	totalBytes := int64(img.sb.Blocks) << img.sb.BlkSizeBits
	if totalBytes <= 0 {
		return nil
	}

	blkBits := img.sb.BlkSizeBits
	// Compact inodes carry no timestamp of their own; the superblock build
	// time is used for them below.
	buildTime := img.sb.BuildTime
	buildTimeNs := img.sb.BuildTimeNs

	blockSize := int(1 << blkBits)

	// Get an accessor for image data. Reads the metadata region eagerly
	// and loads flat-plain data blocks on demand.
	at := newMetaReader(img.meta, metaStart, totalBytes, blockSize)

	// Shared xattr block address (if present). The at() function
	// will load the block on demand when xattrs are parsed.
	var sharedXattrOff int64
	if img.sb.XattrBlkAddr > 0 {
		sharedXattrOff = int64(img.sb.XattrBlkAddr) << blkBits
	}

	// Pre-allocate based on inode count from superblock.
	inodeCount := int(img.sb.Inos)
	if inodeCount == 0 {
		inodeCount = 64
	}
	queue := make([]imgQEntry, 0, inodeCount)
	queue = append(queue, imgQEntry{nid: uint64(img.sb.RootNid), path: "/"})

	// seenNID tracks the first destination path for each source NID that has
	// nlink > 1 and is not a directory. When a NID is seen a second time, we
	// call Writer.Link instead of creating a new inode, preserving hardlinks.
	seenNID := make(map[uint64]string)

	for len(queue) > 0 {
		cur := queue[0]
		queue = queue[1:]

		// Merge mode: process whiteout markers.
		// The decision is made from the name alone, before the inode is read.
		if fsys.copyMerge && cur.path != "/" {
			base := path.Base(cur.path)
			if len(base) > len(whiteoutPrefix) && base[:len(whiteoutPrefix)] == whiteoutPrefix {
				if base == opaqueWhiteout {
					fsys.removeChildren(path.Dir(cur.path))
				} else {
					target := path.Dir(cur.path) + "/" + base[len(whiteoutPrefix):]
					// Avoid a doubled slash for entries directly under the root.
					if path.Dir(cur.path) == "/" {
						target = "/" + base[len(whiteoutPrefix):]
					}
					fsys.remove(target)
				}
				continue
			}
		}

		// The inode's byte address is the NID multiplied by the compact
		// inode size, relative to metaStart.
		inodeAddr := metaStart + int64(cur.nid*disk.SizeInodeCompact)
		buf := at(inodeAddr)
		if len(buf) < disk.SizeInodeCompact {
			return fmt.Errorf("inode %d out of range", cur.nid)
		}

		// Format word: bit 0 clear means compact inode, bits 1-3 hold the
		// data layout.
		format := binary.LittleEndian.Uint16(buf[:2])
		layout := uint8((format & 0x0E) >> 1)
		compact := format&0x01 == 0

		if compact && len(buf) < disk.SizeInodeCompact {
			return fmt.Errorf("compact inode %d out of range", cur.nid)
		}
		if !compact && len(buf) < disk.SizeInodeExtended {
			return fmt.Errorf("extended inode %d out of range", cur.nid)
		}

		var (
			mode    uint16
			uid     uint32
			gid     uint32
			nlink   uint32
			size    uint64
			idata   uint32
			mtime   uint64
			mtimeNs uint32
			xcnt    uint16
			icSize  int
		)

		if compact {
			var ino disk.InodeCompact
			if _, err := binary.Decode(buf[:disk.SizeInodeCompact], binary.LittleEndian, &ino); err != nil {
				return fmt.Errorf("decode compact inode %d: %w", cur.nid, err)
			}
			mode = ino.Mode
			uid = uint32(ino.UID)
			gid = uint32(ino.GID)
			nlink = uint32(ino.Nlink)
			size = uint64(ino.Size)
			idata = ino.InodeData
			mtime = buildTime
			mtimeNs = buildTimeNs
			xcnt = ino.XattrCount
			icSize = disk.SizeInodeCompact
		} else {
			var ino disk.InodeExtended
			if _, err := binary.Decode(buf[:disk.SizeInodeExtended], binary.LittleEndian, &ino); err != nil {
				return fmt.Errorf("decode extended inode %d: %w", cur.nid, err)
			}
			mode = ino.Mode
			uid = ino.UID
			gid = ino.GID
			nlink = ino.Nlink
			size = ino.Size
			idata = ino.InodeData
			mtime = ino.Mtime
			mtimeNs = ino.MtimeNs
			xcnt = ino.XattrCount
			icSize = disk.SizeInodeExtended
		}

		// Parse xattr area.
		// A non-zero count covers the body header plus (count-1) entry-sized
		// units; the area follows the inode core directly.
		xattrSize := 0
		if xcnt > 0 {
			xattrSize = int(xcnt-1)*disk.SizeXattrEntry + disk.SizeXattrBodyHeader
		}
		var xattrs map[string]string
		if xattrSize > 0 {
			xattrAddr := inodeAddr + int64(icSize)
			xb := at(xattrAddr)
			// xattrs stay nil when the area is not fully available.
			if len(xb) >= xattrSize {
				xattrs = parseXattrsFromBuf(xb[:xattrSize], at, sharedXattrOff, img.getLongPrefix)
			}
		}

		// trailingAddr is where data following the inode core and xattr area
		// starts (inline data or chunk indexes).
		trailingAddr := inodeAddr + int64(icSize) + int64(xattrSize)
		typ := mode & disk.StatTypeMask

		// Hardlink detection: if this is a non-directory inode with nlink > 1
		// that we've already registered under a different path, call Link()
		// to share the inode rather than creating a duplicate.
		if typ != disk.StatTypeDir && nlink > 1 {
			if firstPath, seen := seenNID[cur.nid]; seen {
				// Second (or later) path to this inode: emit a hardlink.
				if cur.path != "/" {
					if err := fsys.Link(firstPath, cur.path); err != nil {
						return fmt.Errorf("link %s → %s: %w", firstPath, cur.path, err)
					}
				}
				continue
			}
			// First time we see this NID; record it for future aliases.
			seenNID[cur.nid] = cur.path
		}

		// Build fsEntry directly, bypassing builder.Entry + add() overhead.
		fe := &fsEntry{
			path:    cur.path,
			mode:    mode,
			uid:     uid,
			gid:     gid,
			mtime:   mtime,
			mtimeNs: mtimeNs,
			size:    size,
			xattrs:  xattrs,
		}
		if nlink > 0 {
			fe.nlink = nlink
			fe.nlinkSet = true
		}
		fe.fileClosed = true
		if fsys.copyMetadataOnly {
			fe.metadataOnly = true
		}

		switch typ {
		case disk.StatTypeDir:
			dirSize := int(size)
			if dirSize > 0 {
				var dirData []byte
				switch layout {
				case disk.LayoutFlatPlain:
					dataAddr := int64(idata) << blkBits
					d := at(dataAddr)
					if d != nil && len(d) >= dirSize {
						dirData = d[:dirSize]
					} else {
						// The accessor could not supply the whole directory;
						// read it straight from the image instead.
						dirData = make([]byte, dirSize)
						if _, err := img.meta.ReadAt(dirData, dataAddr); err != nil {
							return fmt.Errorf("read dir data for nid %d: %w", cur.nid, err)
						}
					}
				case disk.LayoutFlatInline:
					d := at(trailingAddr)
					if d != nil && len(d) >= dirSize {
						dirData = d[:dirSize]
					}
				}
				// Children are enqueued only when directory data was obtained.
				if dirData != nil {
					fsys.parseDirBlock(dirData, dirSize, blockSize, cur.path, &queue)
				}
			}

		case disk.StatTypeSymlink:
			if size > 0 {
				var linkData []byte
				if layout == disk.LayoutFlatPlain {
					linkData = make([]byte, size)
					// NOTE(review): the source text is truncated inside the
					// next statement in this view. The rest of the symlink
					// case and the start of what appears to be the
					// regular-file case are missing; restore them from the
					// upstream file before relying on this block.
					if _, err := img.meta.ReadAt(linkData, int64(idata)< 0 {
				chunkFmt := uint16(idata)
				if chunkFmt&disk.LayoutChunkFormatIndexes != 0 {
					// Chunk indexes start at the next 8-byte boundary.
					chunkAddr := trailingAddr
					if chunkAddr%8 != 0 {
						chunkAddr = (chunkAddr + 7) & ^int64(7)
					}
					fe.chunks = fsys.parseChunks(at(chunkAddr), chunkFmt, size, blkBits, img.deviceIDMask)
					fe.contiguous = true
				}
			}

		case disk.StatTypeChrdev, disk.StatTypeBlkdev:
			fe.rdev = disk.RdevFromMode(mode, idata)
		}

		// Remap chunk DeviceIDs for metadata-only sources.
		if fsys.copyMetadataOnly && fsys.copyDeviceID > 0 {
			offset := fsys.copyDeviceID - 1
			for i := range fe.chunks {
				fe.chunks[i].DeviceID += offset
			}
		}

		// Register in the tree.
		if cur.path == "/" {
			// Update root metadata.
			fsys.root.mode = fe.mode
			fsys.root.uid = fe.uid
			fsys.root.gid = fe.gid
			fsys.root.mtime = fe.mtime
			fsys.root.mtimeNs = fe.mtimeNs
			fsys.root.nlink = fe.nlink
			fsys.root.nlinkSet = fe.nlinkSet
			fsys.root.xattrs = fe.xattrs
		} else if existing, ok := fsys.byPath[cur.path]; ok {
			// Merge overwrites: preserve tree linkage.
			savedParent := existing.parent
			savedChildren := existing.children
			*existing = *fe
			existing.parent = savedParent
			existing.children = savedChildren
		} else {
			fsys.addChild(fe)
		}
	}
	return nil
}

// parseDirBlock extracts directory entries from dirent data and enqueues
// child inodes for BFS traversal.
//
// data holds dirSize bytes of directory content laid out in blockSize-sized
// blocks. Each block starts with a dirent table followed by the names; the
// table length is derived from the first dirent's name offset. Parsing stops
// at the first block whose table is empty or does not fit, and entries with
// inconsistent name offsets are skipped. "." and ".." and empty names are
// not enqueued.
func (fsys *Writer) parseDirBlock(data []byte, dirSize, blockSize int, parentPath string, queue *[]imgQEntry) {
	pos := 0
	for pos < dirSize {
		// The final block may be shorter than blockSize.
		blockEnd := pos + blockSize
		if blockEnd > dirSize {
			blockEnd = dirSize
		}
		blk := data[pos:blockEnd]
		if len(blk) < disk.SizeDirent {
			break
		}

		// Names begin right after the dirent table, so the first name
		// offset divided by the dirent size gives the entry count.
		firstNameOff := binary.LittleEndian.Uint16(blk[8:10])
		nEntries := int(firstNameOff / disk.SizeDirent)
		if nEntries == 0 || nEntries*disk.SizeDirent > len(blk) {
			break
		}

		for i := 0; i < nEntries; i++ {
			off := i * disk.SizeDirent
			nid := binary.LittleEndian.Uint64(blk[off : off+8])
			nameOff := int(binary.LittleEndian.Uint16(blk[off+8 : off+10]))

			// A name ends where the next entry's name starts; the last
			// name runs to the end of the block.
			var nameEnd int
			if i < nEntries-1 {
				nameEnd = int(binary.LittleEndian.Uint16(blk[(i+1)*disk.SizeDirent+8 : (i+1)*disk.SizeDirent+10]))
			} else {
				nameEnd = len(blk)
			}
			if nameOff >= len(blk) || nameEnd > len(blk) || nameOff >= nameEnd {
				continue
			}

			// Extract name, trimming trailing NUL padding.
			nameBytes := blk[nameOff:nameEnd]
			for len(nameBytes) > 0 && nameBytes[len(nameBytes)-1] == 0 {
				nameBytes = nameBytes[:len(nameBytes)-1]
			}
			name := string(nameBytes)
			if name == "." || name == ".." || name == "" {
				continue
			}

			// Avoid a doubled slash for children of the root.
			childPath := parentPath + "/" + name
			if parentPath == "/" {
				childPath = "/" + name
			}
			*queue = append(*queue, imgQEntry{nid: nid, path: childPath})
		}

		pos = blockEnd
	}
}

// parseChunks extracts chunk index entries from an in-memory buffer.
//
// The low bits of chunkFmt (disk.LayoutChunkFormatBits) give the chunk size
// as a power-of-two multiple of the block size. Adjacent index entries on the
// same device with consecutive physical blocks are merged into one chunk as
// long as the merged Count stays within 65535. Entries whose low 32 address
// bits are all ones are skipped as holes. nil is returned when data is too
// short to hold every index entry.
//
// NOTE(review): fileSize == 0 makes fileSize-1 wrap around; the only call
// visible here appears to sit behind a size check that is cut off in this
// view — confirm callers never pass 0.
func (fsys *Writer) parseChunks(data []byte, chunkFmt uint16, fileSize uint64, blkBits uint8, deviceIDMask uint16) []builder.Chunk {
	chunkBits := blkBits + uint8(chunkFmt&disk.LayoutChunkFormatBits)
	nchunks := int((fileSize-1)>>chunkBits) + 1
	blocksPerChunk := 1 << (chunkBits - blkBits)

	// Align to 8 bytes for index entries.
	needed := nchunks * disk.SizeChunkIndex
	if len(data) < needed {
		return nil
	}

	chunks := make([]builder.Chunk, 0, nchunks)
	for i := range nchunks {
		// Index entry fields as read below: bytes 0-1 high address bits,
		// bytes 2-3 device ID, bytes 4-7 low address bits.
		off := i * disk.SizeChunkIndex
		startBlkLo := binary.LittleEndian.Uint32(data[off+4 : off+8])
		if ^startBlkLo == 0 {
			continue // null/hole
		}
		startBlkHi := binary.LittleEndian.Uint16(data[off : off+2])
		deviceID := binary.LittleEndian.Uint16(data[off+2:off+4]) & deviceIDMask
		physBlock := (uint64(startBlkHi) << 32) | uint64(startBlkLo)

		// Extend the previous chunk when this one continues it.
		if len(chunks) > 0 {
			prev := &chunks[len(chunks)-1]
			if prev.DeviceID == deviceID &&
				prev.PhysicalBlock+uint64(prev.Count) == physBlock &&
				int(prev.Count)+blocksPerChunk <= 65535 {
				prev.Count += uint16(blocksPerChunk)
				continue
			}
		}
		chunks = append(chunks, builder.Chunk{
			PhysicalBlock: physBlock,
			Count:         uint16(blocksPerChunk),
			DeviceID:      deviceID,
		})
	}
	return chunks
}

// parseXattrsFromBuf parses xattr entries from an in-memory buffer.
// at provides on-demand access to the shared xattr block at sharedOff.
// longPrefix resolves long xattr prefix indexes (NameIndex with high bit set).
+func parseXattrsFromBuf(buf []byte, at func(int64) []byte, sharedOff int64, longPrefix func(uint8) (string, error)) map[string]string { + if len(buf) < disk.SizeXattrBodyHeader { + return nil + } + + var xh disk.XattrHeader + if _, err := binary.Decode(buf[:disk.SizeXattrBodyHeader], binary.LittleEndian, &xh); err != nil { + return nil + } + pos := disk.SizeXattrBodyHeader + + xattrs := make(map[string]string) + + // Resolve shared xattr references. + for i := 0; i < int(xh.SharedCount) && pos+4 <= len(buf); i++ { + idx := binary.LittleEndian.Uint32(buf[pos : pos+4]) + pos += 4 + + if sharedOff == 0 { + continue + } + sharedBlock := at(sharedOff + int64(idx)*4) + if sharedBlock == nil || len(sharedBlock) < disk.SizeXattrEntry { + continue + } + var xe disk.XattrEntry + if _, err := binary.Decode(sharedBlock[:disk.SizeXattrEntry], binary.LittleEndian, &xe); err != nil { + continue + } + entryLen := int(xe.NameLen) + int(xe.ValueLen) + if disk.SizeXattrEntry+entryLen > len(sharedBlock) { + continue + } + sb := sharedBlock[disk.SizeXattrEntry:] + name := xattrName(xe, sb[:xe.NameLen], longPrefix) + value := string(sb[xe.NameLen : int(xe.NameLen)+int(xe.ValueLen)]) + xattrs[name] = value + } + + // Parse inline xattr entries. + for pos+disk.SizeXattrEntry <= len(buf) { + var xe disk.XattrEntry + if _, err := binary.Decode(buf[pos:pos+disk.SizeXattrEntry], binary.LittleEndian, &xe); err != nil { + break + } + pos += disk.SizeXattrEntry + + entryLen := int(xe.NameLen) + int(xe.ValueLen) + if pos+entryLen > len(buf) { + break + } + + name := xattrName(xe, buf[pos:pos+int(xe.NameLen)], longPrefix) + pos += int(xe.NameLen) + value := string(buf[pos : pos+int(xe.ValueLen)]) + pos += int(xe.ValueLen) + + xattrs[name] = value + + // Round up to 4-byte boundary. + if rem := pos % 4; rem != 0 { + pos += 4 - rem + } + } + if len(xattrs) == 0 { + return nil + } + return xattrs +} + +// xattrName builds the full xattr name from an entry and its raw name bytes. 
// longPrefix resolves long prefix indexes when the high bit of NameIndex is set.
// When a long prefix cannot be resolved (nil resolver or a lookup error), or
// when NameIndex is zero, the raw name is returned without a prefix.
func xattrName(xe disk.XattrEntry, rawName []byte, longPrefix func(uint8) (string, error)) string {
	var prefix string
	if xe.NameIndex&0x80 != 0 {
		// Long prefix: high bit set, low 7 bits index the prefix table.
		if longPrefix != nil {
			if p, err := longPrefix(xe.NameIndex & 0x7F); err == nil {
				prefix = p
			}
		}
	} else if xe.NameIndex != 0 {
		// Short prefix: the index is rendered via xattrIndex's String method.
		prefix = xattrIndex(xe.NameIndex).String()
	}
	return prefix + string(rawName)
}
diff --git a/vendor/github.com/erofs/go-erofs/mkfs_other.go b/vendor/github.com/erofs/go-erofs/mkfs_other.go
new file mode 100644
index 0000000..3ab0730
--- /dev/null
+++ b/vendor/github.com/erofs/go-erofs/mkfs_other.go
@@ -0,0 +1,16 @@
//go:build !linux && !darwin

package erofs

import (
	"io/fs"

	"github.com/erofs/go-erofs/internal/builder"
)

// entryFromSys returns info.Sys() when it is a *builder.Entry and nil
// otherwise. This build has no platform stat fallback.
func entryFromSys(info fs.FileInfo) *builder.Entry {
	if be, ok := info.Sys().(*builder.Entry); ok {
		return be
	}
	return nil
}
diff --git a/vendor/github.com/erofs/go-erofs/mkfs_unix.go b/vendor/github.com/erofs/go-erofs/mkfs_unix.go
new file mode 100644
index 0000000..b4a6a48
--- /dev/null
+++ b/vendor/github.com/erofs/go-erofs/mkfs_unix.go
@@ -0,0 +1,30 @@
//go:build linux

package erofs

import (
	"io/fs"
	"syscall"

	"github.com/erofs/go-erofs/internal/builder"
)

// entryFromSys extracts metadata from info.Sys(). Returns nil if the
// type is not recognized, allowing the caller to use a default.
+func entryFromSys(info fs.FileInfo) *builder.Entry { + switch sys := info.Sys().(type) { + case *builder.Entry: + return sys + case *syscall.Stat_t: + return &builder.Entry{ + UID: sys.Uid, + GID: sys.Gid, + Mtime: uint64(sys.Mtim.Sec), + MtimeNs: uint32(sys.Mtim.Nsec), + Nlink: uint32(sys.Nlink), + Rdev: uint32(sys.Rdev), + } + default: + return nil + } +} diff --git a/vendor/github.com/erofs/go-erofs/writer.go b/vendor/github.com/erofs/go-erofs/writer.go new file mode 100644 index 0000000..8bdaf9d --- /dev/null +++ b/vendor/github.com/erofs/go-erofs/writer.go @@ -0,0 +1,689 @@ +package erofs + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "math" + "os" + "sort" + + "github.com/erofs/go-erofs/internal/builder" + "github.com/erofs/go-erofs/internal/disk" +) + +// maxBlockSize is the largest block size we support. EROFS images with +// larger block sizes are unmountable on common platforms (aarch64 caps +// page size at 64 KiB) and the reader rejects BlkSizeBits > 16. +const maxBlockSize = 1 << 16 + +// onlyWriter wraps an io.Writer to hide io.ReaderFrom so that +// io.CopyBuffer uses the caller-provided buffer instead of +// the destination's ReadFrom (which allocates its own). +type onlyWriter struct{ io.Writer } + +// erofsWriter serializes EROFS metadata to an io.Writer. +type erofsWriter struct { + entries []*erofsEntry // all entries in NID order + rootNid uint64 + metaBlkAddr uint32 + totalInodes uint64 + buildTime uint64 + buildTimeNs uint32 + devices []uint64 // per-device block counts (one slot per entry) + blockSize int + chunkBits uint8 // log2(chunkSize / blockSize); chunkSize = blockSize << chunkBits + copyBuf []byte // reusable buffer for io.CopyBuffer + zeroBuf []byte // blockSize-length zero buffer for padding + inodeBuf [disk.SizeInodeExtended]byte // scratch buffer for writeInode +} + +// inodeSize returns the on-disk inode header size for e. 
+func inodeCoreSize(e *erofsEntry) int { + if e.compact { + return disk.SizeInodeCompact + } + return disk.SizeInodeExtended +} + +// entryChunkBits returns the chunk bits for a specific entry. +// Contiguous entries use a larger chunk size to minimize chunk indexes. +func (w *erofsWriter) entryChunkBits(e *erofsEntry) uint8 { + if e.chunkBits > 0 { + return e.chunkBits + } + return w.chunkBits +} + +// entryChunkSize returns the chunk size in bytes for a specific entry. +func (w *erofsWriter) entryChunkSize(e *erofsEntry) int { + return w.blockSize << w.entryChunkBits(e) +} + +// minChunkBits returns the minimum chunkBits such that file size fits in +// one chunk (chunkSize >= size). Capped at 31 (LayoutChunkFormatBits max). +func (w *erofsWriter) minChunkBits(size uint64) uint8 { + bits := w.chunkBits + for uint64(w.blockSize)< totalMetaBytes { + totalMetaBytes = end + } + } + metaBlocks := (totalMetaBytes + w.blockSize - 1) / w.blockSize + addr := uint32(w.sbAreaBlocks() + metaBlocks) + for _, e := range w.entries { + if ds := w.flatPlainDataSize(e); ds > 0 { + e.dataBlkAddr = addr + addr += uint32((ds + w.blockSize - 1) / w.blockSize) + } + } + } else { + // Data-first: data starts after superblock area. + addr := uint32(w.sbAreaBlocks()) + for _, e := range w.entries { + if ds := w.flatPlainDataSize(e); ds > 0 { + e.dataBlkAddr = addr + addr += uint32((ds + w.blockSize - 1) / w.blockSize) + } + } + w.metaBlkAddr = addr // metadata follows data + } +} + +// sbAreaSize returns the number of bytes needed for the superblock area +// (blocks before metadata): 1024-byte pad + superblock + device slots, +// rounded up to block boundary. +func (w *erofsWriter) sbAreaSize() int { + n := disk.SuperBlockOffset + disk.SizeSuperBlock + if len(w.devices) > 0 { + n += len(w.devices) * disk.SizeDeviceSlot + } + return ((n + w.blockSize - 1) / w.blockSize) * w.blockSize +} + +// sbAreaBlocks returns the number of blocks occupied by the superblock area. 
+func (w *erofsWriter) sbAreaBlocks() int { + return w.sbAreaSize() / w.blockSize +} + +// metadataBytes computes the total size of the metadata area, including +// any zero-padding inserted to reach each inode's expected offset (NID * 32) +// and rounding each entry up to a 32-byte boundary. +func (w *erofsWriter) metadataBytes() int { + curOff := 0 + for _, e := range w.entries { + expectedOff := int(e.nid) * 32 + if curOff < expectedOff { + curOff = expectedOff + } + sz := inodeCoreSize(e) + e.xattrSize + e.trailingSize + if rem := sz % 32; rem != 0 { + sz += 32 - rem + } + curOff += sz + } + return curOff +} + +func (w *erofsWriter) writeBlock0(buf io.Writer) error { + sbArea := make([]byte, w.sbAreaSize()) + + totalMetaBytes := w.metadataBytes() + metaBlocks := (totalMetaBytes + w.blockSize - 1) / w.blockSize + + // Count data blocks. + dataBlocks := 0 + for _, e := range w.entries { + if ds := w.flatPlainDataSize(e); ds > 0 { + dataBlocks += (ds + w.blockSize - 1) / w.blockSize + } + } + totalBlocks := w.sbAreaBlocks() + metaBlocks + dataBlocks + + var featureIncompat uint32 + var extraDevices uint16 + var devtSlotOff uint16 + + if len(w.devices) > 0 { + featureIncompat |= disk.FeatureIncompatDeviceTable + extraDevices = uint16(len(w.devices)) + devtSlotOff = uint16(disk.SizeSuperBlock / 16) + } + for _, e := range w.entries { + if len(e.chunks) > 0 { + featureIncompat |= disk.FeatureIncompatChunkedFile + break + } + } + + sb := disk.SuperBlock{ + MagicNumber: disk.MagicNumber, + BlkSizeBits: blkBits(w.blockSize), + RootNid: uint16(w.rootNid), + Inos: w.totalInodes, + BuildTime: w.buildTime, + BuildTimeNs: w.buildTimeNs, + Blocks: uint32(totalBlocks), + MetaBlkAddr: w.metaBlkAddr, + FeatureIncompat: featureIncompat, + ExtraDevices: extraDevices, + DevtSlotOff: devtSlotOff, + } + + sbBuf := &bytes.Buffer{} + if err := binary.Write(sbBuf, binary.LittleEndian, &sb); err != nil { + return fmt.Errorf("write superblock: %w", err) + } + 
copy(sbArea[disk.SuperBlockOffset:], sbBuf.Bytes()) + + // Write device slots right after superblock. + for i, blocks := range w.devices { + if blocks > math.MaxUint32 { + return fmt.Errorf("device %d block count %d exceeds 32-bit limit", i+1, blocks) + } + devSlot := disk.DeviceSlot{ + Blocks: uint32(blocks), + } + devBuf := &bytes.Buffer{} + if err := binary.Write(devBuf, binary.LittleEndian, &devSlot); err != nil { + return fmt.Errorf("write device slot: %w", err) + } + off := disk.SuperBlockOffset + disk.SizeSuperBlock + i*disk.SizeDeviceSlot + copy(sbArea[off:], devBuf.Bytes()) + } + + _, err := buf.Write(sbArea) + return err +} + +// writeMetadataInodes writes inode metadata. Data block addresses must +// already be assigned on each entry before calling this method. +func (w *erofsWriter) writeMetadataInodes(buf io.Writer) error { + metaStart := 0 + for _, e := range w.entries { + expectedOff := int(e.nid) * 32 + if expectedOff > metaStart { + if _, err := buf.Write(w.zeroBuf[:expectedOff-metaStart]); err != nil { + return err + } + metaStart = expectedOff + } + + if err := w.writeInode(buf, e); err != nil { + return fmt.Errorf("write inode for %s: %w", e.path, err) + } + if e.compact { + metaStart += disk.SizeInodeCompact + } else { + metaStart += disk.SizeInodeExtended + } + + // Write xattr area + if e.xattrSize > 0 { + if err := w.writeXattrs(buf, e); err != nil { + return fmt.Errorf("write xattrs for %s: %w", e.path, err) + } + metaStart += e.xattrSize + } + + // Write trailing data + switch e.mode & disk.StatTypeMask { + case disk.StatTypeReg: + if e.layout == disk.LayoutChunkBased && (e.size > 0 || len(e.chunks) > 0) { + if err := w.writeChunkIndexes(buf, e); err != nil { + return fmt.Errorf("write chunks for %s: %w", e.path, err) + } + metaStart += e.trailingSize + } else if e.layout == disk.LayoutFlatInline && e.size > 0 && e.data != nil { + n, err := io.CopyBuffer(onlyWriter{buf}, io.LimitReader(e.data, int64(e.size)), w.copyBuf) + if c, ok := 
e.data.(io.Closer); ok { + _ = c.Close() + } + if err != nil { + return fmt.Errorf("write inline data for %s: %w", e.path, err) + } + metaStart += int(n) + } + case disk.StatTypeDir: + if e.layout == disk.LayoutFlatInline { + n, err := w.writeDirents(buf, e) + if err != nil { + return fmt.Errorf("write dirents for %s: %w", e.path, err) + } + metaStart += n + } + case disk.StatTypeSymlink: + if e.layout == disk.LayoutFlatInline { + if _, err := io.WriteString(buf, e.symTarget); err != nil { + return fmt.Errorf("write symlink for %s: %w", e.path, err) + } + metaStart += len(e.symTarget) + } + } + + // Pad to 32-byte boundary + inodeSize := disk.SizeInodeExtended + if e.compact { + inodeSize = disk.SizeInodeCompact + } + totalWritten := inodeSize + e.xattrSize + e.trailingSize + if totalWritten%32 != 0 { + padSize := 32 - (totalWritten % 32) + if _, err := buf.Write(w.zeroBuf[:padSize]); err != nil { + return err + } + metaStart += padSize + } + } + + // Pad metadata to full block boundary + if metaStart%w.blockSize != 0 { + padSize := w.blockSize - (metaStart % w.blockSize) + if _, err := buf.Write(w.zeroBuf[:padSize]); err != nil { + return err + } + } + + return nil +} + +func (w *erofsWriter) writeInode(buf io.Writer, e *erofsEntry) error { + var inodeData uint32 + + switch e.mode & disk.StatTypeMask { + case disk.StatTypeReg: + if e.layout == disk.LayoutChunkBased { + inodeData = disk.LayoutChunkFormatIndexes | uint32(w.entryChunkBits(e)) + } else if e.layout == disk.LayoutFlatPlain && e.size > 0 { + inodeData = e.dataBlkAddr + } + case disk.StatTypeDir, disk.StatTypeSymlink: + if e.layout == disk.LayoutFlatPlain { + inodeData = e.dataBlkAddr + } + case disk.StatTypeChrdev, disk.StatTypeBlkdev, disk.StatTypeFifo, disk.StatTypeSock: + inodeData = e.rdev + } + + fileSize := e.size + switch e.mode & disk.StatTypeMask { + case disk.StatTypeDir: + fileSize = uint64(w.direntDataSize(e)) + case disk.StatTypeSymlink: + fileSize = uint64(len(e.symTarget)) + } + + b := 
&w.inodeBuf + clear(b[:]) + + if e.compact { + binary.LittleEndian.PutUint16(b[0:2], inodeFormat(e.layout, true)) + binary.LittleEndian.PutUint16(b[2:4], xattrCount(e.xattrSize)) + binary.LittleEndian.PutUint16(b[4:6], e.mode) + binary.LittleEndian.PutUint16(b[6:8], uint16(e.nlink)) + binary.LittleEndian.PutUint32(b[8:12], uint32(fileSize)) + binary.LittleEndian.PutUint32(b[16:20], inodeData) + binary.LittleEndian.PutUint16(b[24:26], uint16(e.uid)) + binary.LittleEndian.PutUint16(b[26:28], uint16(e.gid)) + _, err := buf.Write(b[:disk.SizeInodeCompact]) + return err + } + + binary.LittleEndian.PutUint16(b[0:2], inodeFormat(e.layout, false)) + binary.LittleEndian.PutUint16(b[2:4], xattrCount(e.xattrSize)) + binary.LittleEndian.PutUint16(b[4:6], e.mode) + binary.LittleEndian.PutUint64(b[8:16], fileSize) + binary.LittleEndian.PutUint32(b[16:20], inodeData) + binary.LittleEndian.PutUint32(b[24:28], e.uid) + binary.LittleEndian.PutUint32(b[28:32], e.gid) + binary.LittleEndian.PutUint64(b[32:40], e.mtime) + binary.LittleEndian.PutUint32(b[40:44], e.mtimeNs) + binary.LittleEndian.PutUint32(b[44:48], e.nlink) + _, err := buf.Write(b[:disk.SizeInodeExtended]) + return err +} + +func (w *erofsWriter) writeXattrs(buf io.Writer, e *erofsEntry) error { + // XattrHeader: 4-byte name filter + 1-byte shared count + 7 reserved = 12 bytes + var xhdr [12]byte + binary.LittleEndian.PutUint32(xhdr[0:4], 0xFFFFFFFF) // name filter unused + if _, err := buf.Write(xhdr[:]); err != nil { + return err + } + + for _, name := range sortedXattrKeys(e.xattrs) { + value := e.xattrs[name] + nameIndex, suffix := xattrSplit(name) + + var xent [disk.SizeXattrEntry]byte + xent[0] = uint8(len(suffix)) + xent[1] = nameIndex + binary.LittleEndian.PutUint16(xent[2:4], uint16(len(value))) + if _, err := buf.Write(xent[:]); err != nil { + return err + } + if _, err := io.WriteString(buf, suffix); err != nil { + return err + } + if _, err := io.WriteString(buf, value); err != nil { + return err + } + + // 
Pad to 4-byte boundary + entryLen := disk.SizeXattrEntry + len(suffix) + len(value) + if entryLen%4 != 0 { + if _, err := buf.Write(w.zeroBuf[:4-entryLen%4]); err != nil { + return err + } + } + } + return nil +} + +// writeChunkIndexes writes chunk index entries for a regular file. +// Each index entry covers one logical chunk (chunkSize bytes). +func (w *erofsWriter) writeChunkIndexes(buf io.Writer, e *erofsEntry) error { + cs := w.entryChunkSize(e) + blocksPerChunk := cs / w.blockSize + nchunks := (int(e.size) + cs - 1) / cs + + // Null chunk index (no mapping): StartBlkHi=0xFFFF, DeviceID=0, StartBlkLo=NullAddr. + var nullIdx [disk.SizeChunkIndex]byte + binary.LittleEndian.PutUint16(nullIdx[0:2], 0xFFFF) + binary.LittleEndian.PutUint32(nullIdx[4:8], nullAddr) + + if len(e.chunks) > 0 { + // Walk source chunks and emit one index per logical chunk. + // Source chunks use block-granularity counts; we step by blocksPerChunk. + // A chunk with PhysicalBlock == builder.NullPhysicalBlock is a hole: + // emit nullIdx entries for its block span. + var scratch [disk.SizeChunkIndex]byte + ci := 0 // index into source chunks + coff := 0 // block offset within current source chunk + for n := 0; n < nchunks; n++ { + if ci >= len(e.chunks) { + if _, err := buf.Write(nullIdx[:]); err != nil { + return err + } + continue + } + c := e.chunks[ci] + if c.PhysicalBlock == builder.NullPhysicalBlock { + // Hole chunk: emit a null index entry. 
+ if _, err := buf.Write(nullIdx[:]); err != nil { + return err + } + } else { + phys := c.PhysicalBlock + uint64(coff) + binary.LittleEndian.PutUint16(scratch[0:2], uint16(phys>>32)) + binary.LittleEndian.PutUint16(scratch[2:4], c.DeviceID) + binary.LittleEndian.PutUint32(scratch[4:8], uint32(phys)) + if _, err := buf.Write(scratch[:]); err != nil { + return err + } + } + coff += blocksPerChunk + for ci < len(e.chunks) && coff >= int(e.chunks[ci].Count) { + coff -= int(e.chunks[ci].Count) + ci++ + } + } + } else { + for n := 0; n < nchunks; n++ { + if _, err := buf.Write(nullIdx[:]); err != nil { + return err + } + } + } + + return nil +} + +// writeDirents writes EROFS directory entries packed into block-sized chunks. +func (w *erofsWriter) writeDirents(buf io.Writer, e *erofsEntry) (int, error) { + type direntInfo struct { + name string + nid uint64 + fileType uint8 + } + + // Build the full entry list including "." and ".." then sort + // alphabetically. EROFS requires dirents to be sorted within + // each block; "." and ".." are not guaranteed to be first. 
+ allEnts := make([]direntInfo, 0, len(e.children)+2) + allEnts = append(allEnts, direntInfo{".", e.nid, disk.FileTypeDir}) + allEnts = append(allEnts, direntInfo{"..", e.parentNid, disk.FileTypeDir}) + for _, c := range e.children { + nid := c.nid + if c.linkTo != nil { + nid = c.linkTo.nid // hardlink alias: point at the canonical inode + } + allEnts = append(allEnts, direntInfo{ + name: c.name, + nid: nid, + fileType: c.erofsFileType, + }) + } + sort.Slice(allEnts, func(i, j int) bool { + return allEnts[i].name < allEnts[j].name + }) + + totalWritten := 0 + i := 0 + for i < len(allEnts) { + // Determine how many entries fit in this block + start := i + blockUsed := 0 + nameSize := 0 + for j := i; j < len(allEnts); j++ { + headerSize := (j - start + 1) * disk.SizeDirent + nameSize += len(allEnts[j].name) + needed := headerSize + nameSize + if needed > w.blockSize { + break + } + blockUsed = needed + i = j + 1 + } + if i == start { + // Single entry too large for a block (shouldn't happen) + blockUsed = disk.SizeDirent + len(allEnts[i].name) + i++ + } + + blockEnts := allEnts[start:i] + blockHeaderSize := len(blockEnts) * disk.SizeDirent + + // Write dirent headers + var scratch [disk.SizeDirent]byte + nameOff := uint16(blockHeaderSize) + for j, de := range blockEnts { + if j > 0 { + nameOff += uint16(len(blockEnts[j-1].name)) + } + binary.LittleEndian.PutUint64(scratch[0:8], de.nid) + binary.LittleEndian.PutUint16(scratch[8:10], nameOff) + scratch[10] = de.fileType + scratch[11] = 0 + if _, err := buf.Write(scratch[:]); err != nil { + return totalWritten, err + } + totalWritten += disk.SizeDirent + } + + // Write names + for _, de := range blockEnts { + n, err := io.WriteString(buf, de.name) + if err != nil { + return totalWritten, err + } + totalWritten += n + } + + // Pad to block boundary if there are more entries + if i < len(allEnts) && blockUsed%w.blockSize != 0 { + padSize := w.blockSize - (blockUsed % w.blockSize) + if _, err := 
buf.Write(w.zeroBuf[:padSize]); err != nil { + return totalWritten, err + } + totalWritten += padSize + } + } + + return totalWritten, nil +} + +// writeDataBlocks writes data blocks for flat-plain entries directly to out. +func (w *erofsWriter) writeDataBlocks(out io.Writer) error { + for _, e := range w.entries { + ds := w.flatPlainDataSize(e) + if ds == 0 { + continue + } + + var n int + switch e.mode & disk.StatTypeMask { + case disk.StatTypeReg: + expected := int64(ds) + var written int64 + var err error + limited := io.LimitReader(e.data, expected) + // Use io.Copy for *os.File sources to enable copy_file_range. + if _, ok := e.data.(*os.File); ok { + written, err = io.Copy(out, limited) + } else { + written, err = io.CopyBuffer(onlyWriter{out}, limited, w.copyBuf) + } + if c, ok := e.data.(io.Closer); ok { + _ = c.Close() + } + if err != nil { + return fmt.Errorf("write data for %s: %w", e.path, err) + } + if written != expected { + return fmt.Errorf("write data for %s: short read: got %d bytes, expected %d", e.path, written, expected) + } + n = int(written) + case disk.StatTypeDir: + written, err := w.writeDirents(out, e) + if err != nil { + return fmt.Errorf("write dirents for %s: %w", e.path, err) + } + n = written + case disk.StatTypeSymlink: + written, err := io.WriteString(out, e.symTarget) + if err != nil { + return fmt.Errorf("write symlink data for %s: %w", e.path, err) + } + n = written + } + + if n%w.blockSize != 0 { + padSize := w.blockSize - (n % w.blockSize) + if _, err := out.Write(w.zeroBuf[:padSize]); err != nil { + return fmt.Errorf("write padding for %s: %w", e.path, err) + } + } + } + return nil +} + +// flatPlainDataSize returns the data size for a flat-plain entry, or 0. 
// xattrIndex is the EROFS xattr name index (EROFS_XATTR_INDEX_*). It selects
// a well-known name prefix so that only the remaining suffix has to be stored
// with each xattr entry.
type xattrIndex uint8

// String returns the name prefix selected by idx. Index 0 and any index that
// is not recognised yield "".
//
// Namespace indexes (user, trusted, lustre, security) end in a dot because a
// suffix always follows. The two POSIX ACL indexes stand for complete
// attribute names and are stored with an empty suffix, so they are returned
// without a trailing dot: prefix + suffix must give exactly
// "system.posix_acl_access" or "system.posix_acl_default".
func (idx xattrIndex) String() string {
	switch idx {
	case 1:
		return "user."
	case 2:
		return "system.posix_acl_access"
	case 3:
		return "system.posix_acl_default"
	case 4:
		return "trusted."
	case 5:
		return "lustre."
	case 6:
		return "security."
	default:
		return ""
	}
}
+func loadXattrs(b *file, stat *Stat) (err error) { + ino := b.info + addr := b.img.metaStartPos() + int64(ino.nid*disk.SizeInodeCompact) + int64(ino.icsize) + xsize := ino.xsize + + stat.Xattrs = map[string]string{} + + blk, err := b.img.loadAt(addr, int64(xsize)) + if err != nil { + return fmt.Errorf("failed to read xattr body for nid %d: %w", b.nid, err) + } + defer func() { + if blk != nil { + b.img.putBlock(blk) + } + }() + + xb := blk.bytes() + if len(xb) < disk.SizeXattrBodyHeader { + return fmt.Errorf("xattr body too small for nid %d: %w", b.nid, ErrInvalid) + } + var xh disk.XattrHeader + if _, err := binary.Decode(xb[:disk.SizeXattrBodyHeader], binary.LittleEndian, &xh); err != nil { + return err + } + xb = xb[disk.SizeXattrBodyHeader:] + + for i := 0; i < int(xh.SharedCount); i++ { + if len(xb) < 4 { + pos := disk.SizeXattrBodyHeader + int64(i)*4 + b.img.putBlock(blk) + blk, err = b.img.loadAt(addr+pos, int64(xsize)-pos) + if err != nil { + return fmt.Errorf("failed to read xattr body for nid %d: %w", b.nid, err) + } + xb = blk.bytes() + if len(xb) < 4 { + return fmt.Errorf("xattr shared block too small for nid %d: %w", b.nid, ErrInvalid) + } + } + var xattrAddr uint32 + if _, err := binary.Decode(xb[:4], binary.LittleEndian, &xattrAddr); err != nil { + return err + } + + // TODO: Cache shared xattr blocks + sblk, err := b.img.loadAt(int64(b.img.sb.XattrBlkAddr)< 0 { + copySize := len(xb) + if copySize == 0 { + if err := reload(); err != nil { + return err + } + copySize = len(xb) + if copySize == 0 { + return fmt.Errorf("empty xattr block while reading value: %w", ErrInvalid) + } + } + if remaining < copySize { + copySize = remaining + } + buf = append(buf, xb[:copySize]...) 
+ remaining -= copySize + pos += copySize + xb = xb[copySize:] + } + value = string(buf) + } else { + value = string(xb[:xattrEntry.ValueLen]) + pos += int(xattrEntry.ValueLen) + xb = xb[xattrEntry.ValueLen:] + } + stat.Xattrs[name] = value + + // Round up to next 4 byte boundary + if rem := pos % 4; rem != 0 { + pad := 4 - rem + pos += pad + if len(xb) < pad { + xb = nil + } else { + xb = xb[pad:] + } + } + } + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index c6943f5..fb8d820 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -8,6 +8,11 @@ github.com/Microsoft/go-winio/pkg/guid # github.com/containerd/log v0.1.0 ## explicit; go 1.20 github.com/containerd/log +# github.com/erofs/go-erofs v0.3.1-0.20260531080512-069dc32d83e6 +## explicit; go 1.23 +github.com/erofs/go-erofs +github.com/erofs/go-erofs/internal/builder +github.com/erofs/go-erofs/internal/disk # github.com/opencontainers/go-digest v1.0.0 ## explicit; go 1.13 github.com/opencontainers/go-digest