feat: add agent exec pkg (#15577)

2025-03-14 10:09:57 +00:00 · 2024-11-25 17:22:12 +02:00
parent 7876dc5fb1
commit bbc549d2df
7 changed files with 603 additions and 0 deletions
--- a/agent/agentexec/cli_linux.go
+++ b/agent/agentexec/cli_linux.go
@ -0,0 +1,145 @@
+//go:build linux
+// +build linux
+
+package agentexec
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"os/exec"
+	"runtime"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"golang.org/x/xerrors"
+)
+
+// unset is set to an invalid value for nice and oom scores. It is used as the
+// default of both flags so CLI can tell that no explicit score was provided.
+const unset = -2000
+
+// CLI runs the agent-exec command. It should only be called by the cli package.
+//
+// It expects os.Args to have the form
+//
+//	coder agent-exec [--coder-nice=N] [--coder-oom=N] -- <command> [args...]
+//
+// When a score flag is omitted, a default derived from the current process is
+// used. Both scores are applied to the current process, which is then replaced
+// by <command> via syscall.Exec. Any failure along the way is returned as a
+// wrapped error.
+func CLI() error {
+	// We lock the OS thread here to avoid a race condition where the nice priority
+	// we get is on a different thread from the one we set it on.
+	runtime.LockOSThread()
+	// Nop on success but we do it anyway in case of an error.
+	defer runtime.UnlockOSThread()
+
+	// The flag names come from the package-level constants so that parsing here
+	// cannot drift from the arguments CommandContext generates.
+	var (
+		fs   = flag.NewFlagSet("agent-exec", flag.ExitOnError)
+		nice = fs.Int(niceFlag, unset, "")
+		oom  = fs.Int(oomFlag, unset, "")
+	)
+
+	if len(os.Args) < 3 {
+		return xerrors.Errorf("malformed command %+v", os.Args)
+	}
+
+	// Parse everything after "coder agent-exec".
+	err := fs.Parse(os.Args[2:])
+	if err != nil {
+		return xerrors.Errorf("parse flags: %w", err)
+	}
+
+	// Get everything after "coder agent-exec --"
+	args := execArgs(os.Args)
+	if len(args) == 0 {
+		return xerrors.Errorf("no exec command provided %+v", os.Args)
+	}
+
+	if *nice == unset {
+		// If an explicit nice score isn't set, we use the default.
+		*nice, err = defaultNiceScore()
+		if err != nil {
+			return xerrors.Errorf("get default nice score: %w", err)
+		}
+	}
+
+	if *oom == unset {
+		// If an explicit oom score isn't set, we use the default.
+		*oom, err = defaultOOMScore()
+		if err != nil {
+			return xerrors.Errorf("get default oom score: %w", err)
+		}
+	}
+
+	err = unix.Setpriority(unix.PRIO_PROCESS, 0, *nice)
+	if err != nil {
+		return xerrors.Errorf("set nice score: %w", err)
+	}
+
+	err = writeOOMScoreAdj(*oom)
+	if err != nil {
+		return xerrors.Errorf("set oom score: %w", err)
+	}
+
+	path, err := exec.LookPath(args[0])
+	if err != nil {
+		return xerrors.Errorf("look path: %w", err)
+	}
+
+	// Wrap the exec failure with the resolved path, matching the context every
+	// other error path in this function provides.
+	err = syscall.Exec(path, args, os.Environ())
+	if err != nil {
+		return xerrors.Errorf("exec %q: %w", path, err)
+	}
+	return nil
+}
+
+func defaultNiceScore() (int, error) {
+	score, err := unix.Getpriority(unix.PRIO_PROCESS, 0)
+	if err != nil {
+		return 0, xerrors.Errorf("get nice score: %w", err)
+	}
+	// See https://linux.die.net/man/2/setpriority#Notes
+	score = 20 - score
+
+	score += 5
+	if score > 19 {
+		return 19, nil
+	}
+	return score, nil
+}
+
+// defaultOOMScore returns the oom_score_adj to give the child when none was
+// requested explicitly, based on the current process's own oom_score_adj.
+func defaultOOMScore() (int, error) {
+	current, err := oomScoreAdj()
+	if err != nil {
+		return 0, xerrors.Errorf("get oom score: %w", err)
+	}
+
+	switch {
+	case current < 0:
+		// The agent has a negative oom_score_adj; use 0 for the child so it
+		// is treated like every other process.
+		return 0, nil
+	case current >= 998:
+		// The agent is already almost at the maximum, so go to the max.
+		return 1000, nil
+	default:
+		// Otherwise pick slightly less than the maximum. Users who want a
+		// different score set it directly.
+		return 998, nil
+	}
+}
+
+// oomScoreAdj reads /proc/self/oom_score_adj and parses its trimmed contents
+// as an integer.
+func oomScoreAdj() (int, error) {
+	raw, err := os.ReadFile("/proc/self/oom_score_adj")
+	if err != nil {
+		return 0, xerrors.Errorf("read oom_score_adj: %w", err)
+	}
+
+	trimmed := strings.TrimSpace(string(raw))
+	return strconv.Atoi(trimmed)
+}
+
+// writeOOMScoreAdj writes score, formatted as a decimal integer, to
+// /proc/self/oom_score_adj.
+func writeOOMScoreAdj(score int) error {
+	payload := []byte(fmt.Sprintf("%d", score))
+	return os.WriteFile("/proc/self/oom_score_adj", payload, 0o600)
+}
+
+// execArgs returns everything following the first "--" delimiter in args,
+// i.e. the arguments to pass to syscall.Exec. It returns nil when args
+// contains no "--".
+func execArgs(args []string) []string {
+	for idx := 0; idx < len(args); idx++ {
+		if args[idx] != "--" {
+			continue
+		}
+		return args[idx+1:]
+	}
+	return nil
+}
--- a/agent/agentexec/cli_linux_test.go
+++ b/agent/agentexec/cli_linux_test.go
@ -0,0 +1,178 @@
+//go:build linux
+// +build linux
+
+package agentexec_test
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sys/unix"
+
+	"github.com/coder/coder/v2/testutil"
+)
+
+func TestCLI(t *testing.T) {
+	t.Parallel()
+
+	t.Run("OK", func(t *testing.T) {
+		t.Parallel()
+
+		ctx := testutil.Context(t, testutil.WaitMedium)
+		cmd, path := cmd(ctx, t, 123, 12)
+		err := cmd.Start()
+		require.NoError(t, err)
+		go cmd.Wait()
+
+		waitForSentinel(ctx, t, cmd, path)
+		requireOOMScore(t, cmd.Process.Pid, 123)
+		requireNiceScore(t, cmd.Process.Pid, 12)
+	})
+
+	t.Run("Defaults", func(t *testing.T) {
+		t.Parallel()
+
+		ctx := testutil.Context(t, testutil.WaitMedium)
+		cmd, path := cmd(ctx, t, 0, 0)
+		err := cmd.Start()
+		require.NoError(t, err)
+		go cmd.Wait()
+
+		waitForSentinel(ctx, t, cmd, path)
+
+		expectedNice := expectedNiceScore(t)
+		expectedOOM := expectedOOMScore(t)
+		requireOOMScore(t, cmd.Process.Pid, expectedOOM)
+		requireNiceScore(t, cmd.Process.Pid, expectedNice)
+	})
+}
+
+// requireNiceScore fails the test unless the process identified by pid has
+// the given nice score.
+func requireNiceScore(t *testing.T, pid int, score int) {
+	t.Helper()
+
+	raw, err := unix.Getpriority(unix.PRIO_PROCESS, pid)
+	require.NoError(t, err)
+
+	// See https://linux.die.net/man/2/setpriority#Notes
+	actual := 20 - raw
+	require.Equal(t, score, actual)
+}
+
+// requireOOMScore fails the test unless /proc/<pid>/oom_score_adj holds the
+// expected value.
+func requireOOMScore(t *testing.T, pid int, expected int) {
+	t.Helper()
+
+	procFile := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
+	contents, err := os.ReadFile(procFile)
+	require.NoError(t, err)
+
+	want := strconv.Itoa(expected)
+	got := strings.TrimSpace(string(contents))
+	require.Equal(t, want, got)
+}
+
+// waitForSentinel polls until the file at path exists. It fails the test if
+// signalling the process returns an error or if ctx is done first.
+func waitForSentinel(ctx context.Context, t *testing.T, cmd *exec.Cmd, path string) {
+	t.Helper()
+
+	poll := time.NewTicker(testutil.IntervalFast)
+	defer poll.Stop()
+
+	// RequireEventually doesn't work well with require.NoError or similar require functions.
+	for {
+		// Fail immediately if the process can no longer be signalled.
+		require.NoError(t, cmd.Process.Signal(syscall.Signal(0)))
+
+		if _, statErr := os.Stat(path); statErr == nil {
+			return
+		}
+
+		select {
+		case <-poll.C:
+		case <-ctx.Done():
+			require.NoError(t, ctx.Err())
+		}
+	}
+}
+
+// cmd builds (but does not start) an invocation of the test binary that runs
+// a shell which touches a sentinel file and then sleeps. It returns the
+// command and the sentinel path. A cleanup is registered that logs the
+// command output on failure and signals the command's process group.
+func cmd(ctx context.Context, t *testing.T, oom, nice int) (*exec.Cmd, string) {
+	argv := execArgs(oom, nice)
+	sentinel := filepath.Join(t.TempDir(), "sentinel")
+	script := fmt.Sprintf("touch %s && sleep 10m", sentinel)
+	argv = append(argv, "sh", "-c", script)
+
+	//nolint:gosec
+	child := exec.CommandContext(ctx, TestBin, argv...)
+
+	// We set this so we can also easily kill the sleep process the shell spawns.
+	child.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
+	child.Env = os.Environ()
+
+	var output bytes.Buffer
+	child.Stdout = &output
+	child.Stderr = &output
+
+	t.Cleanup(func() {
+		// Print output of a command if the test fails.
+		if t.Failed() {
+			t.Logf("cmd %q output: %s", child.Args, output.String())
+		}
+		if child.Process == nil {
+			return
+		}
+		// We use -child.Process.Pid to kill the whole process group.
+		_ = syscall.Kill(-child.Process.Pid, syscall.SIGINT)
+	})
+
+	return child, sentinel
+}
+
+// expectedOOMScore computes the default OOM score expected for a child of the
+// test process, based on the test process's own oom_score_adj.
+func expectedOOMScore(t *testing.T) int {
+	t.Helper()
+
+	procFile := fmt.Sprintf("/proc/%d/oom_score_adj", os.Getpid())
+	raw, err := os.ReadFile(procFile)
+	require.NoError(t, err)
+
+	current, err := strconv.Atoi(strings.TrimSpace(string(raw)))
+	require.NoError(t, err)
+
+	switch {
+	case current < 0:
+		return 0
+	case current >= 998:
+		return 1000
+	default:
+		return 998
+	}
+}
+
+// expectedNiceScore computes the default nice score expected for a child of
+// the test process: the test process's score plus 5, capped at 19.
+func expectedNiceScore(t *testing.T) int {
+	t.Helper()
+
+	raw, err := unix.Getpriority(unix.PRIO_PROCESS, os.Getpid())
+	require.NoError(t, err)
+
+	// The raw value is converted with 20 - raw before it is adjusted.
+	// See https://linux.die.net/man/2/setpriority#Notes
+	adjusted := (20 - raw) + 5
+	if adjusted <= 19 {
+		return adjusted
+	}
+	return 19
+}
+
+// execArgs builds the argument list for the test binary: "agent-exec", then
+// the oom and nice flags (each omitted when its value is 0), then the "--"
+// delimiter.
+func execArgs(oom int, nice int) []string {
+	argv := []string{"agent-exec"}
+
+	if oom != 0 {
+		oomArg := fmt.Sprintf("--coder-oom=%d", oom)
+		argv = append(argv, oomArg)
+	}
+
+	if nice != 0 {
+		niceArg := fmt.Sprintf("--coder-nice=%d", nice)
+		argv = append(argv, niceArg)
+	}
+
+	return append(argv, "--")
+}
--- a/agent/agentexec/cli_other.go
+++ b/agent/agentexec/cli_other.go
@ -0,0 +1,10 @@
+//go:build !linux
+// +build !linux
+
+package agentexec
+
+import "golang.org/x/xerrors"
+
+// CLI is the implementation used on non-Linux platforms. It always returns an
+// error stating that agent-exec is only supported on Linux.
+func CLI() error {
+	err := xerrors.New("agent-exec is only supported on Linux")
+	return err
+}
--- a/agent/agentexec/cmdtest/main_linux.go
+++ b/agent/agentexec/cmdtest/main_linux.go
@ -0,0 +1,19 @@
+//go:build linux
+// +build linux
+
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/coder/coder/v2/agent/agentexec"
+)
+
+// main runs agentexec.CLI and, if it returns an error, prints the error to
+// stderr and exits with status 1.
+func main() {
+	if err := agentexec.CLI(); err != nil {
+		_, _ = fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
--- a/agent/agentexec/exec.go
+++ b/agent/agentexec/exec.go
@ -0,0 +1,86 @@
+package agentexec
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strconv"
+
+	"golang.org/x/xerrors"
+)
+
+const (
+	// EnvProcPrioMgmt is the environment variable that determines whether
+	// we attempt to manage process CPU and OOM Killer priority.
+	// CommandContext only checks whether it is set; its value is not read.
+	EnvProcPrioMgmt  = "CODER_PROC_PRIO_MGMT"
+	// EnvProcOOMScore is the environment variable holding an explicit OOM
+	// score. When it parses as an integer, CommandContext passes it along
+	// as the coder-oom flag.
+	EnvProcOOMScore  = "CODER_PROC_OOM_SCORE"
+	// EnvProcNiceScore is the environment variable holding an explicit nice
+	// score. When it parses as an integer, CommandContext passes it along
+	// as the coder-nice flag.
+	EnvProcNiceScore = "CODER_PROC_NICE_SCORE"
+)
+
+// CommandContext builds an exec.Cmd for cmd and args. When running on Linux
+// with CODER_PROC_PRIO_MGMT set, the returned command re-invokes the current
+// executable as "agent-exec [flags] -- cmd args..." so that priorities are
+// applied before the real command is exec'd; otherwise a plain exec.Cmd is
+// returned. All instances of exec.Cmd should flow through this function to
+// ensure proper resource constraints are applied to the child process.
+func CommandContext(ctx context.Context, cmd string, args ...string) (*exec.Cmd, error) {
+	if _, enabled := os.LookupEnv(EnvProcPrioMgmt); !enabled || runtime.GOOS != "linux" {
+		return exec.CommandContext(ctx, cmd, args...), nil
+	}
+
+	self, err := os.Executable()
+	if err != nil {
+		return nil, xerrors.Errorf("get executable: %w", err)
+	}
+
+	resolved, err := filepath.EvalSymlinks(self)
+	if err != nil {
+		return nil, xerrors.Errorf("eval symlinks: %w", err)
+	}
+
+	wrapped := []string{"agent-exec"}
+
+	// Scores are only forwarded when their variable is set and parses as an int.
+	if oom, ok := envValInt(EnvProcOOMScore); ok {
+		wrapped = append(wrapped, oomScoreArg(oom))
+	}
+	if niceness, ok := envValInt(EnvProcNiceScore); ok {
+		wrapped = append(wrapped, niceScoreArg(niceness))
+	}
+
+	wrapped = append(wrapped, "--", cmd)
+	wrapped = append(wrapped, args...)
+
+	return exec.CommandContext(ctx, resolved, wrapped...), nil
+}
+
+// envValInt looks up the environment variable named key and parses its value
+// as an int. It returns 0 and false if the variable is not set or its value
+// cannot be parsed.
+func envValInt(key string) (int, bool) {
+	raw, found := os.LookupEnv(key)
+	if found {
+		if parsed, err := strconv.Atoi(raw); err == nil {
+			return parsed, true
+		}
+	}
+	return 0, false
+}
+
+// The following are flags used by the agent-exec command. We use flags instead of
+// environment variables to avoid having to deal with a caller overriding the
+// environment variables. niceScoreArg and oomScoreArg render them as
+// "--<flag>=<score>" arguments.
+const (
+	niceFlag = "coder-nice"
+	oomFlag  = "coder-oom"
+)
+
+// niceScoreArg formats score as the "--coder-nice=<score>" command-line
+// argument.
+func niceScoreArg(score int) string {
+	arg := fmt.Sprintf("--%s=%d", niceFlag, score)
+	return arg
+}
+
+// oomScoreArg formats score as the "--coder-oom=<score>" command-line
+// argument.
+func oomScoreArg(score int) string {
+	arg := fmt.Sprintf("--%s=%d", oomFlag, score)
+	return arg
+}
--- a/agent/agentexec/exec_test.go
+++ b/agent/agentexec/exec_test.go
@ -0,0 +1,119 @@
+package agentexec_test
+
+import (
+	"context"
+	"os"
+	"os/exec"
+	"runtime"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/coder/coder/v2/agent/agentexec"
+)
+
+//nolint:paralleltest // we need to test environment variables
+func TestExec(t *testing.T) {
+	//nolint:paralleltest // we need to test environment variables
+	t.Run("NonLinux", func(t *testing.T) {
+		t.Setenv(agentexec.EnvProcPrioMgmt, "true")
+
+		if runtime.GOOS == "linux" {
+			t.Skip("skipping on linux")
+		}
+
+		cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+		require.NoError(t, err)
+
+		path, err := exec.LookPath("sh")
+		require.NoError(t, err)
+		require.Equal(t, path, cmd.Path)
+		require.Equal(t, []string{"sh", "-c", "sleep"}, cmd.Args)
+	})
+
+	//nolint:paralleltest // we need to test environment variables
+	t.Run("Linux", func(t *testing.T) {
+		//nolint:paralleltest // we need to test environment variables
+		t.Run("Disabled", func(t *testing.T) {
+			if runtime.GOOS != "linux" {
+				t.Skip("skipping on linux")
+			}
+
+			cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+			require.NoError(t, err)
+			path, err := exec.LookPath("sh")
+			require.NoError(t, err)
+			require.Equal(t, path, cmd.Path)
+			require.Equal(t, []string{"sh", "-c", "sleep"}, cmd.Args)
+		})
+
+		//nolint:paralleltest // we need to test environment variables
+		t.Run("Enabled", func(t *testing.T) {
+			t.Setenv(agentexec.EnvProcPrioMgmt, "hello")
+
+			if runtime.GOOS != "linux" {
+				t.Skip("skipping on linux")
+			}
+
+			executable, err := os.Executable()
+			require.NoError(t, err)
+
+			cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+			require.NoError(t, err)
+			require.Equal(t, executable, cmd.Path)
+			require.Equal(t, []string{executable, "agent-exec", "--", "sh", "-c", "sleep"}, cmd.Args)
+		})
+
+		t.Run("Nice", func(t *testing.T) {
+			t.Setenv(agentexec.EnvProcPrioMgmt, "hello")
+			t.Setenv(agentexec.EnvProcNiceScore, "10")
+
+			if runtime.GOOS != "linux" {
+				t.Skip("skipping on linux")
+			}
+
+			executable, err := os.Executable()
+			require.NoError(t, err)
+
+			cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+			require.NoError(t, err)
+			require.Equal(t, executable, cmd.Path)
+			require.Equal(t, []string{executable, "agent-exec", "--coder-nice=10", "--", "sh", "-c", "sleep"}, cmd.Args)
+		})
+
+		t.Run("OOM", func(t *testing.T) {
+			t.Setenv(agentexec.EnvProcPrioMgmt, "hello")
+			t.Setenv(agentexec.EnvProcOOMScore, "123")
+
+			if runtime.GOOS != "linux" {
+				t.Skip("skipping on linux")
+			}
+
+			executable, err := os.Executable()
+			require.NoError(t, err)
+
+			cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+			require.NoError(t, err)
+			require.Equal(t, executable, cmd.Path)
+			require.Equal(t, []string{executable, "agent-exec", "--coder-oom=123", "--", "sh", "-c", "sleep"}, cmd.Args)
+		})
+
+		t.Run("Both", func(t *testing.T) {
+			t.Setenv(agentexec.EnvProcPrioMgmt, "hello")
+			t.Setenv(agentexec.EnvProcOOMScore, "432")
+			t.Setenv(agentexec.EnvProcNiceScore, "14")
+
+			if runtime.GOOS != "linux" {
+				t.Skip("skipping on linux")
+			}
+
+			executable, err := os.Executable()
+			require.NoError(t, err)
+
+			cmd, err := agentexec.CommandContext(context.Background(), "sh", "-c", "sleep")
+			require.NoError(t, err)
+			require.Equal(t, executable, cmd.Path)
+			require.Equal(t, []string{executable, "agent-exec", "--coder-oom=432", "--coder-nice=14", "--", "sh", "-c", "sleep"}, cmd.Args)
+		})
+	})
+}
--- a/agent/agentexec/main_linux_test.go
+++ b/agent/agentexec/main_linux_test.go
@ -0,0 +1,46 @@
+//go:build linux
+// +build linux
+
+package agentexec_test
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"testing"
+)
+
+// TestBin is the path of the test binary built by TestMain before the tests run.
+var TestBin string
+
+// TestMain builds the test binary into a fresh temp directory, runs the
+// tests, and exits with their status code.
+func TestMain(m *testing.M) {
+	os.Exit(runWithTestBin(m))
+}
+
+// runWithTestBin is a separate function so that the deferred directory
+// removal runs before TestMain calls os.Exit.
+func runWithTestBin(m *testing.M) int {
+	// We generate a unique directory per test invocation to avoid collisions between two
+	// processes attempting to create the same temp file.
+	workDir := genDir()
+	defer os.RemoveAll(workDir)
+
+	TestBin = buildBinary(workDir)
+	return m.Run()
+}
+
+// buildBinary compiles ./cmdtest into dir as "agent-test" and returns the
+// path of the resulting binary. It panics with the build output on failure.
+func buildBinary(dir string) string {
+	bin := filepath.Join(dir, "agent-test")
+	build := exec.Command("go", "build", "-o", bin, "./cmdtest")
+
+	output, err := build.CombinedOutput()
+	mustf(err, "build binary: %s", output)
+
+	return bin
+}
+
+// mustf panics with the message formatted from msg and args when err is
+// non-nil, and does nothing otherwise.
+func mustf(err error, msg string, args ...any) {
+	if err == nil {
+		return
+	}
+	panic(fmt.Sprintf(msg, args...))
+}
+
+// genDir creates a new directory with the "agentexec" prefix under
+// os.TempDir() and returns its path, panicking if creation fails.
+func genDir() string {
+	tmpRoot := os.TempDir()
+
+	created, err := os.MkdirTemp(tmpRoot, "agentexec")
+	mustf(err, "create temp dir: %v", err)
+
+	return created
+}