diff --git a/internal/controller/linuxcontainer/container.go b/internal/controller/linuxcontainer/container.go index 5fb7cc67f3..72cbb16df5 100644 --- a/internal/controller/linuxcontainer/container.go +++ b/internal/controller/linuxcontainer/container.go @@ -12,6 +12,7 @@ import ( "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats" "github.com/Microsoft/hcsshim/internal/controller/process" "github.com/Microsoft/hcsshim/internal/gcs" + "github.com/Microsoft/hcsshim/internal/hcs" "github.com/Microsoft/hcsshim/internal/hcs/schema1" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" @@ -90,6 +91,9 @@ type Controller struct { // ioRetryTimeout is the duration to retry IO relay operations before giving up. ioRetryTimeout time.Duration + + // isContainerStateDeleted tracks whether guest-side container state has been deleted. + isContainerStateDeleted bool } // New creates a ready-to-use Controller. @@ -158,9 +162,8 @@ func (c *Controller) Create(ctx context.Context, spec *specs.Spec, opts *task.Cr if releaseErr := c.releaseResources(ctx); releaseErr != nil { log.G(ctx).WithError(releaseErr).Error("failed to release resources during create") } - if closeErr := c.closeContainer(ctx); closeErr != nil { - log.G(ctx).WithError(closeErr).Error("failed to close container during create") - } + // Close the container handle. + c.closeContainer() } }() @@ -200,17 +203,8 @@ func (c *Controller) Create(ctx context.Context, spec *specs.Spec, opts *task.Cr // closeContainer performs container teardown. It is safe to retry on // failure. Needs to be called while holding c.mu lock. -func (c *Controller) closeContainer(ctx context.Context) error { +func (c *Controller) closeContainer() { if c.container != nil { - // Delete the guest-side container state if supported. If this - // fails, return early without nil'ing c.container so a retry - // re-issues the request. - if c.guest.Capabilities().IsDeleteContainerStateSupported() { - if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil { - return fmt.Errorf("delete container state: %w", err) - } - } - // Close the container handle. The calling code never returns error. _ = c.container.Close() c.container = nil @@ -224,7 +218,6 @@ func (c *Controller) closeContainer(ctx context.Context) error { default: close(c.terminatedCh) } - return nil } // releaseResources undoes each allocation in reverse order. @@ -296,6 +289,20 @@ func (c *Controller) releaseResources(ctx context.Context) error { } } + // After layer overlay has been removed, we can safely delete the + // bundle path inside the UVM for the container. Therefore, delete + // the guest-side container state if supported. + if !c.isContainerStateDeleted && c.guest.Capabilities().IsDeleteContainerStateSupported() { + // GCS bridge evicts the container from its host-state map even if the inner Delete fails, + // so retries will always return not-found. + if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil && !hcs.IsNotExist(err) { + return fmt.Errorf("delete container state: %w", err) + } + + // Set isContainerStateDeleted to true so that we do not retry this post successful delete. + c.isContainerStateDeleted = true + } + return nil } @@ -345,12 +352,7 @@ func (c *Controller) handleInitProcessExit(ctx context.Context, initProcess *pro c.mu.Lock() c.state = StateStopped - if err := c.closeContainer(ctx); err != nil { - // Leave state as StateStopped so DeleteProcess can retry the - // teardown. The exit event below still informs the caller that - // the init process is gone. - log.G(ctx).WithError(err).Error("failed to close container after init exit") - } + c.closeContainer() c.mu.Unlock() // Publish the exit event after teardown is complete. @@ -549,7 +551,7 @@ func (c *Controller) KillProcess(ctx context.Context, execID string, signal uint // When "all" is requested, deliver the signal to every additional exec // on a best-effort basis. Errors are logged but do not prevent the // target process from being signaled. - if all { + if all || execID == "" { for eid, proc := range c.processes { if eid == "" { // The init process is signaled as the explicit target below. @@ -616,12 +618,11 @@ func (c *Controller) DeleteProcess(ctx context.Context, execID string) (*task.St // For containers that were created but never started, handleInitProcessExit // was never launched, so closeContainer was never called. Perform full // teardown now. closeContainer is retriable. - if err = c.closeContainer(ctx); err != nil { - return nil, fmt.Errorf("close container %s: %w", c.containerID, err) - } if err = c.releaseResources(ctx); err != nil { return nil, fmt.Errorf("releasing resources for container %s: %w", c.containerID, err) } + // Close container handle after the resources are released. + c.closeContainer() } // Remove the process entry only after all fallible operations have diff --git a/internal/controller/linuxcontainer/container_test.go b/internal/controller/linuxcontainer/container_test.go index 51071c6f4f..6f9ed6ece7 100644 --- a/internal/controller/linuxcontainer/container_test.go +++ b/internal/controller/linuxcontainer/container_test.go @@ -5,6 +5,7 @@ package linuxcontainer import ( "context" "errors" + "fmt" "strings" "sync" "testing" @@ -13,6 +14,7 @@ import ( "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer/mocks" "github.com/Microsoft/hcsshim/internal/controller/process" "github.com/Microsoft/hcsshim/internal/gcs" + "github.com/Microsoft/hcsshim/internal/hcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" "github.com/Microsoft/hcsshim/internal/signals" @@ -517,6 +519,10 @@ func TestReleaseResources_AllResourceTypes(t *testing.T) { // Expect VPCI device removal. vpciCtrl.EXPECT().RemoveFromVM(gomock.Any(), deviceGUID).Return(nil) + // Capabilities is queried to decide whether to call DeleteContainerState. + // Default zero-value caps report no support, so DeleteContainerState is skipped. + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) + if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("releaseResources returned error: %v", err) } @@ -538,12 +544,13 @@ func TestReleaseResources_AllResourceTypes(t *testing.T) { // case where no layers were allocated (only additional resources). func TestReleaseResources_NoLayers(t *testing.T) { t.Parallel() - c, scsiCtrl, _, _, _ := newContainerTestController(t) + c, scsiCtrl, _, _, guestCtrl := newContainerTestController(t) scsiGUID, _ := guid.NewV4() c.scsiResources = []guid.GUID{scsiGUID} scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), scsiGUID).Return(nil) + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("releaseResources returned error: %v", err) @@ -554,7 +561,7 @@ func TestReleaseResources_NoLayers(t *testing.T) { // were not combined, RemoveCombinedLayers is not called. func TestReleaseResources_LayersNotCombined(t *testing.T) { t.Parallel() - c, scsiCtrl, _, _, _ := newContainerTestController(t) + c, scsiCtrl, _, _, guestCtrl := newContainerTestController(t) roGUID, _ := guid.NewV4() scratchGUID, _ := guid.NewV4() @@ -568,6 +575,7 @@ func TestReleaseResources_LayersNotCombined(t *testing.T) { // Only unmaps expected; no RemoveCombinedLayers call. scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), scratchGUID).Return(nil) scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), roGUID).Return(nil) + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("releaseResources returned error: %v", err) @@ -578,7 +586,7 @@ func TestReleaseResources_LayersNotCombined(t *testing.T) { // invoked multiple times without panicking. func TestReleaseResources_Idempotent(t *testing.T) { t.Parallel() - c, scsiCtrl, _, _, _ := newContainerTestController(t) + c, scsiCtrl, _, _, guestCtrl := newContainerTestController(t) scsiGUID, _ := guid.NewV4() c.scsiResources = []guid.GUID{scsiGUID} @@ -586,6 +594,9 @@ func TestReleaseResources_Idempotent(t *testing.T) { // The implementation does not clear the slice on success, so a second // call will retry the unmap. Both calls succeed. scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), scsiGUID).Return(nil).Times(2) + // Capabilities returns no support → DeleteContainerState is skipped, but + // the gating call itself happens once per releaseResources invocation. + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}).Times(2) if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("first releaseResources returned error: %v", err) @@ -658,6 +669,7 @@ func TestReleaseResources_MultipleROLayers(t *testing.T) { for _, g := range roGUIDs { scsiCtrl.EXPECT().UnmapFromGuest(gomock.Any(), g).Return(nil) } + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("releaseResources returned error: %v", err) @@ -762,9 +774,11 @@ func TestReleaseResources_ROLayerUnmapMidwayFails(t *testing.T) { // when no resources were allocated. func TestReleaseResources_NoResources(t *testing.T) { t.Parallel() - c, _, _, _, _ := newContainerTestController(t) + c, _, _, _, guestCtrl := newContainerTestController(t) + + // Capabilities is still queried at the end to gate DeleteContainerState. + guestCtrl.EXPECT().Capabilities().Return(&gcs.LCOWGuestDefinedCapabilities{}) - // No mock calls expected. if err := c.releaseResources(t.Context()); err != nil { t.Fatalf("releaseResources returned error: %v", err) } @@ -780,9 +794,7 @@ func TestCloseContainer_IdempotentViaFlags(t *testing.T) { c, _, _, _, _ := newContainerTestController(t) // No container, no layers — closeContainer should just close terminatedCh. - if err := c.closeContainer(t.Context()); err != nil { - t.Fatalf("closeContainer returned error: %v", err) - } + c.closeContainer() // Verify terminatedCh is closed. select { @@ -792,8 +804,88 @@ func TestCloseContainer_IdempotentViaFlags(t *testing.T) { } // Second call must be a no-op (no panic from double-close on terminatedCh). - if err := c.closeContainer(t.Context()); err != nil { - t.Fatalf("second closeContainer returned error: %v", err) + c.closeContainer() +} + +// TestReleaseResources_DeleteContainerStateCalled verifies that, when the +// guest reports DeleteContainerStateSupported, releaseResources issues the +// DeleteContainerState RPC after all other resources are released and sets +// isContainerStateDeleted so subsequent calls do not re-issue it. +func TestReleaseResources_DeleteContainerStateCalled(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + + caps := &gcs.LCOWGuestDefinedCapabilities{} + caps.DeleteContainerStateSupported = true + + gomock.InOrder( + guestCtrl.EXPECT().Capabilities().Return(caps), + guestCtrl.EXPECT().DeleteContainerState(gomock.Any(), c.gcsContainerID).Return(nil), + ) + + if err := c.releaseResources(t.Context()); err != nil { + t.Fatalf("releaseResources returned error: %v", err) + } + if !c.isContainerStateDeleted { + t.Fatal("isContainerStateDeleted should be true after successful DeleteContainerState") + } + + // A second call must not re-issue DeleteContainerState (the short-circuit + // on isContainerStateDeleted skips the Capabilities lookup entirely). + if err := c.releaseResources(t.Context()); err != nil { + t.Fatalf("second releaseResources returned error: %v", err) + } +} + +// TestReleaseResources_DeleteContainerStateFails verifies that when +// DeleteContainerState fails the error is surfaced and isContainerStateDeleted +// remains false so a subsequent releaseResources call retries the RPC. +func TestReleaseResources_DeleteContainerStateFails(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + + caps := &gcs.LCOWGuestDefinedCapabilities{} + caps.DeleteContainerStateSupported = true + + wantErr := errors.New("delete container state failed") + guestCtrl.EXPECT().Capabilities().Return(caps) + guestCtrl.EXPECT().DeleteContainerState(gomock.Any(), c.gcsContainerID).Return(wantErr) + + err := c.releaseResources(t.Context()) + if !errors.Is(err, wantErr) { + t.Fatalf("releaseResources error = %v, want %v", err, wantErr) + } + if c.isContainerStateDeleted { + t.Fatal("isContainerStateDeleted must remain false after failed DeleteContainerState") + } +} + +// TestReleaseResources_DeleteContainerStateNotFoundIsSuccess verifies that a +// "compute system does not exist" error from DeleteContainerState is treated +// as success. This matters because the GCS bridge removes the container from +// its host-state map even when its inner Delete() fails, so any retry of the +// RPC will always come back with "not found" — looping forever would otherwise +// be the consequence. +func TestReleaseResources_DeleteContainerStateNotFoundIsSuccess(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + + caps := &gcs.LCOWGuestDefinedCapabilities{} + caps.DeleteContainerStateSupported = true + + guestCtrl.EXPECT().Capabilities().Return(caps) + // The bridge's response Result code is wrapped through windows.Errno + // (= syscall.Errno) so errors.Is against hcs.ErrComputeSystemDoesNotExist + // matches. + guestCtrl.EXPECT(). + DeleteContainerState(gomock.Any(), c.gcsContainerID). + Return(fmt.Errorf("guest RPC failure: %w", hcs.ErrComputeSystemDoesNotExist)) + + if err := c.releaseResources(t.Context()); err != nil { + t.Fatalf("releaseResources returned error for not-found: %v", err) + } + if !c.isContainerStateDeleted { + t.Fatal("isContainerStateDeleted should be true even when the guest reports the container is gone") } } @@ -917,3 +1009,87 @@ func TestKillProcess_AllowedInPostCreatedStates(t *testing.T) { }) } } + +// TestKillProcess_EmptyExecIDFansOutToAllExecs verifies the new fan-out +// semantics: when KillProcess is invoked with an empty exec ID, the signal +// is delivered to every additional exec in the container in addition to +// init. This mirrors the "kill the whole container" behavior used by the +// delete path and is the change introduced by re-sequencing teardown. +func TestKillProcess_EmptyExecIDFansOutToAllExecs(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + all bool + }{ + // Pre-existing behavior: all=true fans out. + {name: "all=true", all: true}, + // New behavior introduced by this commit: execID=="" alone fans out. + {name: "all=false", all: false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + c.state = StateRunning + + c.processes[""] = process.New(testContainerID, "", nil, 0) + c.processes["exec-1"] = process.New(testContainerID, "exec-1", nil, 0) + c.processes["exec-2"] = process.New(testContainerID, "exec-2", nil, 0) + + // SIGTERM (15) with no signal support → nil signalOptions → + // terminate path inside process.Kill. + guestCtrl.EXPECT(). + Capabilities(). + Return(&gcs.LCOWGuestDefinedCapabilities{}) + + if err := c.KillProcess(t.Context(), "", 15, tc.all); err != nil { + t.Fatalf("KillProcess(\"\", 15, %v) returned error: %v", tc.all, err) + } + + // Init plus every exec must be terminated. Map iteration order is + // unspecified, but every entry is required to flip — so just walk + // the map and assert. + for execID, proc := range c.processes { + if got, want := proc.State(), process.StateTerminated; got != want { + t.Errorf("process %q state = %s, want %s", execID, got, want) + } + } + }) + } +} + +// TestKillProcess_NonEmptyExecIDDoesNotFanOut verifies that when KillProcess +// targets a specific exec ID, the fan-out path is NOT taken. This guards the +// boundary of the new `if all || execID == ""` condition: a targeted kill +// must affect only the named exec. +func TestKillProcess_NonEmptyExecIDDoesNotFanOut(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + c.state = StateRunning + + c.processes[""] = process.New(testContainerID, "", nil, 0) + c.processes["exec-1"] = process.New(testContainerID, "exec-1", nil, 0) + c.processes["exec-2"] = process.New(testContainerID, "exec-2", nil, 0) + + guestCtrl.EXPECT(). + Capabilities(). + Return(&gcs.LCOWGuestDefinedCapabilities{}) + + if err := c.KillProcess(t.Context(), "exec-1", 15, false); err != nil { + t.Fatalf("KillProcess(\"exec-1\", 15, false) returned error: %v", err) + } + + // Only exec-1 is expected to be terminated; init and exec-2 stay put. + wantStates := map[string]process.State{ + "": process.StateNotCreated, + "exec-1": process.StateTerminated, + "exec-2": process.StateNotCreated, + } + for execID, want := range wantStates { + if got := c.processes[execID].State(); got != want { + t.Errorf("process %q state = %s, want %s", execID, got, want) + } + } +} diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 36907e112b..13c0231801 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -7,10 +7,13 @@ import ( "context" "fmt" "os" + "path/filepath" + "strings" "sync" "sync/atomic" "syscall" + "github.com/Microsoft/hcsshim/internal/guestpath" v1 "github.com/containerd/cgroups/v3/cgroup1/stats" oci "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -272,6 +275,16 @@ func (c *Container) Delete(ctx context.Context) error { } } + // For V2 sandbox containers, remove the now-empty pod directory + // (parent of c.ociBundlePath, e.g. /run/gcs/pods/). + if c.isSandbox && strings.HasPrefix(c.ociBundlePath, guestpath.LCOWV2RootPrefixInVM) { + podDir := filepath.Dir(c.ociBundlePath) + if err := os.Remove(podDir); err != nil && !os.IsNotExist(err) { + entity.WithError(err).WithField("podDir", podDir). + Warn("failed to remove pod directory after sandbox delete") + } + } + return retErr }