Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 54 additions & 23 deletions cmd/containerd-shim-runhcs-v1/task_hcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,11 @@ func newHcsTask(
ownsParent bool,
req *task.CreateTaskRequest,
s *specs.Spec) (_ shimTask, err error) {
log.G(ctx).WithFields(logrus.Fields{
"tid": req.ID,
"ownsParent": ownsParent,
ctx, entry := log.SetEntry(ctx, logrus.Fields{logfields.TaskID: req.ID})
entry.WithFields(logrus.Fields{
"hasHost": parent != nil,
"isVirtualPod": s.Annotations != nil && s.Annotations[annotations.VirtualPodID] != "",
"ownsHost": ownsParent,
}).Debug("newHcsTask")

owner := filepath.Base(os.Args[0])
Expand Down Expand Up @@ -495,22 +497,30 @@ func (ht *hcsTask) DeleteExec(ctx context.Context, eid string) (int, uint32, tim
}

if eid == "" {
entry := log.G(ctx).WithFields(logrus.Fields{
logfields.TaskID: ht.id,
logfields.ExecID: eid,
})

// We are killing the init task, so we expect the container to be
// stopped after this.
//
// The task process may have already exited, and the status set to
// shimExecStateExited, but resources may still be in the process
// of being cleaned up. Wait for ht.closed to be closed. This signals
// that waitInitExit() has finished destroying container resources,
// and layers were umounted.
// that [hcsTask.waitInitExit]/[hcsTask.waitForHostExit] has finished destroying
// container resources and that layers were unmounted.
// If the shim exits before resources are cleaned up, those resources
// will remain locked and untracked, which leads to lingering sandboxes
// and container resources like base vhdx.
const timeout = 30 * time.Second
entry.WithField(logfields.Timeout, timeout).Trace("waiting for task to be closed")
select {
case <-time.After(30 * time.Second):
log.G(ctx).Error("timed out waiting for resource cleanup")
case <-time.After(timeout):
entry.WithField(logfields.Timeout, timeout).Error("timed out waiting for task to close while deleting init exec")
return 0, 0, time.Time{}, errors.Wrap(hcs.ErrTimeout, "waiting for container resource cleanup")
case <-ht.closed:
entry.Trace("received task close signal")
}

// The init task has now exited. A ForceExit() has already been sent to
Expand Down Expand Up @@ -608,7 +618,10 @@ func (ht *hcsTask) Wait() *task.StateResponse {
func (ht *hcsTask) waitInitExit() {
ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitInitExit")
defer span.End()
span.AddAttributes(trace.StringAttribute("tid", ht.id))
span.AddAttributes(
trace.StringAttribute("tid", ht.id),
trace.BoolAttribute("host", ht.host != nil),
trace.BoolAttribute("ownsHost", ht.ownsHost))

// Wait for it to exit on its own
ht.init.Wait()
Expand All @@ -627,7 +640,10 @@ func (ht *hcsTask) waitInitExit() {
func (ht *hcsTask) waitForHostExit() {
ctx, span := oc.StartSpan(context.Background(), "hcsTask::waitForHostExit")
defer span.End()
span.AddAttributes(trace.StringAttribute("tid", ht.id))
span.AddAttributes(
trace.StringAttribute("tid", ht.id),
trace.BoolAttribute("host", ht.host != nil),
trace.BoolAttribute("ownsHost", ht.ownsHost))

err := ht.host.WaitCtx(ctx)
if err != nil {
Expand Down Expand Up @@ -655,13 +671,16 @@ func (ht *hcsTask) waitForHostExit() {
// == nil`.
func (ht *hcsTask) close(ctx context.Context) {
ht.closeOnce.Do(func() {
log.G(ctx).Debug("hcsTask::closeOnce")
entry := log.G(ctx)
entry.Debug("hcsTask::closeOnce")

// ht.c should never be nil for a real task but in testing we stub
// this to avoid a nil dereference. We really should introduce a
// method or interface for ht.c operations that we can stub for
// testing.
if ht.c != nil {
const tearDownTimeout = 30 * time.Second

// Do our best attempt to tear down the container.
// TODO: unify timeout select statements and use [ht.c.WaitCtx] and [context.WithTimeout]
var werr error
Expand All @@ -670,52 +689,63 @@ func (ht *hcsTask) close(ctx context.Context) {
werr = ht.c.Wait()
close(ch)
}()

err := ht.c.Shutdown(ctx)
if err != nil {
log.G(ctx).WithError(err).Error("failed to shutdown container")
entry.WithError(err).Error("failed to shutdown container")
} else {
t := time.NewTimer(time.Second * 30)
t := time.NewTimer(tearDownTimeout)
select {
case <-ch:
err = werr
t.Stop()
if err != nil {
log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
entry.WithError(err).Error("failed to wait for container shutdown")
}
case <-t.C:
err = hcs.ErrTimeout
log.G(ctx).WithError(err).Error("failed to wait for container shutdown")
entry.WithFields(logrus.Fields{
logfields.Timeout: tearDownTimeout,
logrus.ErrorKey: err,
}).Error("timed out while waiting for container shutdown")
}
}

if err != nil {
err = ht.c.Terminate(ctx)
if err != nil {
log.G(ctx).WithError(err).Error("failed to terminate container")
entry.WithError(err).Error("failed to terminate container")
} else {
t := time.NewTimer(time.Second * 30)
t := time.NewTimer(tearDownTimeout)
select {
case <-ch:
err = werr
t.Stop()
if err != nil {
log.G(ctx).WithError(err).Error("failed to wait for container terminate")
entry.WithError(err).Error("failed to wait for container terminate")
}
case <-t.C:
log.G(ctx).WithError(hcs.ErrTimeout).Error("failed to wait for container terminate")
entry.WithFields(logrus.Fields{
logfields.Timeout: tearDownTimeout,
logrus.ErrorKey: hcs.ErrTimeout,
}).Error("timed out while waiting for container terminate")
}
}
}

// Release any resources associated with the container.
entry.Trace("starting task resource cleanup")

if err := resources.ReleaseResources(ctx, ht.cr, ht.host, true); err != nil {
log.G(ctx).WithError(err).Error("failed to release container resources")
entry.WithError(err).Error("failed to release container resources")
}

// Close the container handle invalidating all future access.
if err := ht.c.Close(); err != nil {
log.G(ctx).WithError(err).Error("failed to close container")
entry.WithError(err).Error("failed to close container")
}

entry.Trace("task resource cleanup completed")
}
ht.closeHost(ctx)
})
Expand All @@ -730,11 +760,12 @@ func (ht *hcsTask) close(ctx context.Context) {
// This call is idempotent and safe to call multiple times.
func (ht *hcsTask) closeHost(ctx context.Context) {
ht.closeHostOnce.Do(func() {
log.G(ctx).Debug("hcsTask::closeHostOnce")
entry := log.G(ctx)
entry.Debug("hcsTask::closeHostOnce")

if ht.ownsHost && ht.host != nil {
if err := ht.host.Close(); err != nil {
log.G(ctx).WithError(err).Error("failed host vm shutdown")
entry.WithError(err).Error("failed host vm shutdown")
}
}
// Send the `init` exec exit notification always.
Expand All @@ -750,7 +781,7 @@ func (ht *hcsTask) closeHost(ctx context.Context) {
ExitStatus: exit.ExitStatus,
ExitedAt: exit.ExitedAt,
}); err != nil {
log.G(ctx).WithError(err).Error("failed to publish TaskExitEventTopic")
entry.WithError(err).Error("failed to publish TaskExitEventTopic")
}
close(ht.closed)
})
Expand Down
6 changes: 3 additions & 3 deletions internal/guest/network/netns.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ func NetNSConfig(ctx context.Context, ifStr string, nsPid int, adapter *guestres
addrsStr = append(addrsStr, fmt.Sprintf("%v", addr))
}

entry.WithField("addresses", addrsStr).Debugf("%v: %s[idx=%d,type=%s] is %v",
entry.WithField("addresses", strings.Join(addrsStr, "; ")).Debugf("%v: %s[idx=%d,type=%s] is %v",
curNS, attr.Name, attr.Index, link.Type(), attr.OperState)
}

Expand Down Expand Up @@ -209,7 +209,7 @@ func configureLink(ctx context.Context,
addr.IP = ip
log.G(ctx).WithFields(logrus.Fields{
"allocatedIP": ip,
"IP": addr,
"IP": addr.String(),
}).Debugf("parsed ip address %s/%d", ipConfig.IPAddress, ipConfig.PrefixLength)
ipAddr := &netlink.Addr{IPNet: addr, Label: ""}
if err := netlinkAddrAdd(link, ipAddr); err != nil {
Expand Down Expand Up @@ -237,7 +237,7 @@ func configureLink(ctx context.Context,
}

for _, r := range adapter.Routes {
log.G(ctx).WithField("route", r).Debugf("adding a route to interface %s", link.Attrs().Name)
log.G(ctx).WithField("route", log.Format(ctx, r)).Debugf("adding a route to interface %s", link.Attrs().Name)

if (r.DestinationPrefix == ipv4GwDestination || r.DestinationPrefix == ipv6GwDestination) &&
(r.NextHop == ipv4EmptyGw || r.NextHop == ipv6EmptyGw) {
Expand Down
86 changes: 56 additions & 30 deletions internal/guest/runtime/hcsv2/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -1368,7 +1368,14 @@ func modifyNetwork(ctx context.Context, rt guestrequest.RequestType, na *guestre
// container or not so it must always call `Sync`.
return ns.Sync(ctx)
case guestrequest.RequestTypeRemove:
ns := GetOrAddNetworkNamespace(na.ID)
ns, err := getNetworkNamespace(na.NamespaceID)
if err != nil {
log.G(ctx).WithFields(logrus.Fields{
logfields.NamespaceID: na.NamespaceID,
"adapterID": na.ID,
}).WithError(err).Warn("namespace not found for adapter removal, skipping")
return nil
}
if err := ns.RemoveAdapter(ctx, na.ID); err != nil {
return err
}
Expand Down Expand Up @@ -1540,56 +1547,75 @@ func (h *Host) RemoveContainerFromVirtualPod(containerID string) {
return // Container not in any virtual pod
}

ctx, entry := log.SetEntry(context.Background(), logrus.Fields{
"virtualSandboxID": virtualSandboxID,
logfields.ContainerID: containerID,
})

// Remove from virtual pod
if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists {
delete(vp.Containers, containerID)

// If this is the sandbox container, delete the network namespace
// If this is the sandbox container, clean up the network namespace adapters first,
// then remove the namespace.

// NOTE: Do NOT call [RemoveNetworkNamespace] here.
// When cleaning up the virtual pod's sandbox container on the host,
// [UtilityVM.TearDownNetworking] calls [UtilityVM.RemoveNetNS],
// which, via [UtilityVM.removeNIC], removes the NICs from the uVM and
// sends [guestresource.ResourceTypeNetwork] RPCs that ultimately call [modifyNetwork]
// to remove adapters from guest tracking.
// However, that happens AFTER this function runs.
// Calling [RemoveNetworkNamespace] here would
// fail with "contains adapters" since the host hasn't removed them yet.
//
// The host-driven path handles cleanup in the correct order.
if containerID == virtualSandboxID && vp.NetworkNamespace != "" {
if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil {
logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID).
Warn("Failed to remove virtual pod network namespace (sandbox container removal)")
}
entry.WithField(logfields.NamespaceID, vp.NetworkNamespace).Debug("skipping virtual pod network namespace removal")
}

// If this was the last container, cleanup the virtual pod
if len(vp.Containers) == 0 {
h.cleanupVirtualPod(virtualSandboxID)
h.cleanupVirtualPod(ctx, virtualSandboxID)
}
}

delete(h.containerToVirtualPod, containerID)

logrus.WithFields(logrus.Fields{
"containerID": containerID,
"virtualSandboxID": virtualSandboxID,
}).Info("Container removed from virtual pod")
entry.Info("Container removed from virtual pod")
}

// cleanupVirtualPod removes a virtual pod and its cgroup (should be called with mutex held)
func (h *Host) cleanupVirtualPod(virtualSandboxID string) {
if vp, exists := h.virtualPods[virtualSandboxID]; exists {
// Delete the cgroup
if err := vp.CgroupControl.Delete(); err != nil {
logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID).
Warn("Failed to delete virtual pod cgroup")
}
func (h *Host) cleanupVirtualPod(ctx context.Context, virtualSandboxID string) {
vp, exists := h.virtualPods[virtualSandboxID]
if !exists {
return // virtual pod does not exist
}

// Clean up network namespace if this is the last virtual pod using it
// Only remove if this virtual pod was managing the network namespace
if vp.NetworkNamespace != "" {
// For virtual pods, the network namespace is shared, so we only clean it up
// when the virtual pod itself is being destroyed
if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil {
logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID).
Warn("Failed to remove virtual pod network namespace")
}
}
entry := log.G(ctx)

delete(h.virtualPods, virtualSandboxID)
// Delete the cgroup
if err := vp.CgroupControl.Delete(); err != nil {
entry.Warn("failed to delete virtual pod cgroup")
}

logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Virtual pod cleaned up")
// NOTE: Do NOT call [RemoveNetworkNamespace] here.
// See comment in [Host.RemoveContainerFromVirtualPod] for more info.

// Clean up the virtual pod root directory which contains hostname, hosts,
// resolv.conf, and logs/. These are NOT cleaned by Container.Delete() because
// they live under /run/gcs/c/virtual-pods/<id>/ which is separate from the
// container's ociBundlePath (/run/gcs/c/<id>/).
if vpRootDir := specGuest.VirtualPodRootDir(virtualSandboxID); vpRootDir != "" {
if err := os.RemoveAll(vpRootDir); err != nil {
entry.WithField("path", vpRootDir).WithError(err).Warn("Failed to remove virtual pod root directory")
} else {
entry.WithField("path", vpRootDir).Debug("Removed virtual pod root directory")
}
}

delete(h.virtualPods, virtualSandboxID)
entry.Info("Virtual pod cleaned up")
}

// setupVirtualPodMountsPath creates mount directories for virtual pods
Expand Down
20 changes: 16 additions & 4 deletions internal/guest/storage/mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"go.opencensus.io/trace"
"golang.org/x/sys/unix"

"github.com/Microsoft/hcsshim/internal/log"
"github.com/Microsoft/hcsshim/internal/oc"
)

Expand Down Expand Up @@ -131,13 +132,24 @@ func UnmountPath(ctx context.Context, target string, removeTarget bool) (err err
return errors.Wrapf(err, "failed to determine if path '%s' exists", target)
}

if err := unixUnmount(target, 0); err != nil {
// If `Unmount` returns `EINVAL` it's not mounted. Just delete the
// folder.
if !gerrors.Is(err, unix.EINVAL) {
// Loop to handle stacked mounts (e.g. tmpfs + bind mount from MountRShared).
// Each Unmount call only peels one layer off the stack, so we must keep
// unmounting until EINVAL tells us nothing is mounted anymore.
const maxUnmountAttempts = 5
unmounted := false
for i := 0; i < maxUnmountAttempts; i++ {
if err := unixUnmount(target, 0); err != nil {
// EINVAL means it's not mounted — we're done unmounting.
if gerrors.Is(err, unix.EINVAL) {
unmounted = true
break
}
return errors.Wrapf(err, "failed to unmount path '%s'", target)
}
}
if !unmounted {
log.G(ctx).WithField("target", target).Warnf("path still mounted after %d unmount attempts", maxUnmountAttempts)
}
if removeTarget {
return osRemoveAll(target)
}
Expand Down
Loading
Loading