coder/agent/agentcontainers/api.go
Commit 0f3a1e9849 by Mathias Fredriksson
fix(agent/agentcontainers): split Init into Init and Start for early API responses (#18640)
Previously in #18635 we delayed the containers API `Init` to avoid producing
errors from Docker and `@devcontainers/cli` not yet being installed by startup
scripts. This had an adverse effect on UX: UI responsiveness suffered because
devcontainer detection was greatly delayed.

This change splits `Init` into `Init` and `Start` so that, immediately after
`Init`, we can start serving known devcontainers (defined in Terraform),
improving the UX.

Related #18635
Related #18640
2025-06-27 19:01:50 +03:00
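
The split means the API can be constructed and `Init` called as soon as the
agent knows its Terraform-defined devcontainers, while `Start` (which kicks off
the watcher and updater loops) is deferred until dependencies are available. A
minimal lifecycle sketch, assuming a `slog.Logger` named `logger` and omitting
option plumbing:

	api := agentcontainers.NewAPI(logger)
	api.Init()  // Known devcontainers become servable via the HTTP routes.
	// ... startup scripts install Docker and @devcontainers/cli ...
	api.Start() // Begin the watcher and updater loops.
	defer api.Close()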


package agentcontainers
import (
"context"
"errors"
"fmt"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"runtime"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/fsnotify/fsnotify"
"github.com/go-chi/chi/v5"
"github.com/google/uuid"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/agent/agentcontainers/watcher"
"github.com/coder/coder/v2/agent/agentexec"
"github.com/coder/coder/v2/agent/usershell"
"github.com/coder/coder/v2/coderd/httpapi"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/provisioner"
"github.com/coder/quartz"
)
const (
defaultUpdateInterval = 10 * time.Second
defaultOperationTimeout = 15 * time.Second
// Destination path inside the container. We store the binary at a fixed
// location under /.coder-agent/coder to avoid conflicts and to avoid being
// shadowed by tmpfs or other mounts. This assumes the container root
// filesystem is read-write, which seems sensible for devcontainers.
coderPathInsideContainer = "/.coder-agent/coder"
maxAgentNameLength = 64
maxAttemptsToNameAgent = 5
)
// API is responsible for container-related operations in the agent.
// It provides methods to list and manage containers.
type API struct {
ctx context.Context
cancel context.CancelFunc
watcherDone chan struct{}
updaterDone chan struct{}
updateTrigger chan chan error // Channel to trigger manual refresh.
updateInterval time.Duration // Interval for periodic container updates.
logger slog.Logger
watcher watcher.Watcher
execer agentexec.Execer
commandEnv CommandEnv
ccli ContainerCLI
containerLabelIncludeFilter map[string]string // Labels to filter containers by.
dccli DevcontainerCLI
clock quartz.Clock
scriptLogger func(logSourceID uuid.UUID) ScriptLogger
subAgentClient atomic.Pointer[SubAgentClient]
subAgentURL string
subAgentEnv []string
ownerName string
workspaceName string
parentAgent string
mu sync.RWMutex // Protects the following fields.
initDone chan struct{} // Closed by Init.
closed bool
containers codersdk.WorkspaceAgentListContainersResponse // Output from the last list operation.
containersErr error // Error from the last list operation.
devcontainerNames map[string]bool // By devcontainer name.
knownDevcontainers map[string]codersdk.WorkspaceAgentDevcontainer // By workspace folder.
devcontainerLogSourceIDs map[string]uuid.UUID // By workspace folder.
configFileModifiedTimes map[string]time.Time // By config file path.
recreateSuccessTimes map[string]time.Time // By workspace folder.
recreateErrorTimes map[string]time.Time // By workspace folder.
injectedSubAgentProcs map[string]subAgentProcess // By workspace folder.
usingWorkspaceFolderName map[string]bool // By workspace folder.
ignoredDevcontainers map[string]bool // By workspace folder. Tracks three states (true, false and not checked).
asyncWg sync.WaitGroup
}
type subAgentProcess struct {
agent SubAgent
containerID string
ctx context.Context
stop context.CancelFunc
}
// Option is a functional option for API.
type Option func(*API)
// WithClock sets the quartz.Clock implementation to use.
// This is primarily used for testing to control time.
func WithClock(clock quartz.Clock) Option {
return func(api *API) {
api.clock = clock
}
}
// WithExecer sets the agentexec.Execer implementation to use.
func WithExecer(execer agentexec.Execer) Option {
return func(api *API) {
api.execer = execer
}
}
// WithCommandEnv sets the CommandEnv implementation to use.
func WithCommandEnv(ce CommandEnv) Option {
return func(api *API) {
api.commandEnv = func(ei usershell.EnvInfoer, preEnv []string) (string, string, []string, error) {
shell, dir, env, err := ce(ei, preEnv)
if err != nil {
return shell, dir, env, err
}
env = slices.DeleteFunc(env, func(s string) bool {
// Ensure we filter out environment variables that come
// from the parent agent and are incorrect or not
// relevant for the devcontainer.
return strings.HasPrefix(s, "CODER_WORKSPACE_AGENT_NAME=") ||
strings.HasPrefix(s, "CODER_WORKSPACE_AGENT_URL=") ||
strings.HasPrefix(s, "CODER_AGENT_TOKEN=") ||
strings.HasPrefix(s, "CODER_AGENT_AUTH=") ||
strings.HasPrefix(s, "CODER_AGENT_DEVCONTAINERS_ENABLE=")
})
return shell, dir, env, nil
}
}
}
// WithContainerCLI sets the agentcontainers.ContainerCLI implementation
// to use. The default implementation uses the Docker CLI.
func WithContainerCLI(ccli ContainerCLI) Option {
return func(api *API) {
api.ccli = ccli
}
}
// WithContainerLabelIncludeFilter sets a label filter for containers.
// This option can be given multiple times to filter by multiple labels.
// The behavior is such that only containers matching one or more of the
// provided labels will be included.
func WithContainerLabelIncludeFilter(label, value string) Option {
return func(api *API) {
api.containerLabelIncludeFilter[label] = value
}
}
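// Illustrative sketch: restrict the API to containers carrying a specific
// local-folder label (the value here is hypothetical):
//
//	api := NewAPI(logger,
//		WithContainerLabelIncludeFilter(DevcontainerLocalFolderLabel, "/home/coder/project"),
//	)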
// WithDevcontainerCLI sets the DevcontainerCLI implementation to use.
// This can be used in tests to modify @devcontainers/cli behavior.
func WithDevcontainerCLI(dccli DevcontainerCLI) Option {
return func(api *API) {
api.dccli = dccli
}
}
// WithSubAgentClient sets the SubAgentClient implementation to use.
// This is used to list, create, and delete devcontainer agents.
func WithSubAgentClient(client SubAgentClient) Option {
return func(api *API) {
api.subAgentClient.Store(&client)
}
}
// WithSubAgentURL sets the agent URL for the sub-agent for
// communicating with the control plane.
func WithSubAgentURL(url string) Option {
return func(api *API) {
api.subAgentURL = url
}
}
// WithSubAgentEnv sets the environment variables for the sub-agent.
func WithSubAgentEnv(env ...string) Option {
return func(api *API) {
api.subAgentEnv = env
}
}
// WithManifestInfo sets the owner name, workspace name and parent agent
// name for the sub-agent.
func WithManifestInfo(owner, workspace, parentAgent string) Option {
return func(api *API) {
api.ownerName = owner
api.workspaceName = workspace
api.parentAgent = parentAgent
}
}
// WithDevcontainers sets the known devcontainers for the API. This
// allows the API to be aware of devcontainers defined in the workspace
// agent manifest.
func WithDevcontainers(devcontainers []codersdk.WorkspaceAgentDevcontainer, scripts []codersdk.WorkspaceAgentScript) Option {
return func(api *API) {
if len(devcontainers) == 0 {
return
}
api.knownDevcontainers = make(map[string]codersdk.WorkspaceAgentDevcontainer, len(devcontainers))
api.devcontainerNames = make(map[string]bool, len(devcontainers))
api.devcontainerLogSourceIDs = make(map[string]uuid.UUID)
for _, dc := range devcontainers {
if dc.Status == "" {
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStarting
}
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.devcontainerNames[dc.Name] = true
for _, script := range scripts {
// The devcontainer scripts match the devcontainer ID for
// identification.
if script.ID == dc.ID {
api.devcontainerLogSourceIDs[dc.WorkspaceFolder] = script.LogSourceID
break
}
}
if api.devcontainerLogSourceIDs[dc.WorkspaceFolder] == uuid.Nil {
api.logger.Error(api.ctx, "devcontainer log source ID not found for devcontainer",
slog.F("devcontainer_id", dc.ID),
slog.F("devcontainer_name", dc.Name),
slog.F("workspace_folder", dc.WorkspaceFolder),
slog.F("config_path", dc.ConfigPath),
)
}
}
}
}
// WithWatcher sets the file watcher implementation to use. By default a
// noop watcher is used. This can be used in tests to modify the watcher
// behavior or to use an actual file watcher (e.g. fsnotify).
func WithWatcher(w watcher.Watcher) Option {
return func(api *API) {
api.watcher = w
}
}
// ScriptLogger is an interface for sending devcontainer logs to the
// control plane.
type ScriptLogger interface {
Send(ctx context.Context, log ...agentsdk.Log) error
Flush(ctx context.Context) error
}
// noopScriptLogger is a no-op implementation of the ScriptLogger
// interface.
type noopScriptLogger struct{}
func (noopScriptLogger) Send(context.Context, ...agentsdk.Log) error { return nil }
func (noopScriptLogger) Flush(context.Context) error { return nil }
// WithScriptLogger sets the script logger provider for devcontainer operations.
func WithScriptLogger(scriptLogger func(logSourceID uuid.UUID) ScriptLogger) Option {
return func(api *API) {
api.scriptLogger = scriptLogger
}
}
// NewAPI returns a new API with the given options applied.
func NewAPI(logger slog.Logger, options ...Option) *API {
ctx, cancel := context.WithCancel(context.Background())
api := &API{
ctx: ctx,
cancel: cancel,
initDone: make(chan struct{}),
updateTrigger: make(chan chan error),
updateInterval: defaultUpdateInterval,
logger: logger,
clock: quartz.NewReal(),
execer: agentexec.DefaultExecer,
containerLabelIncludeFilter: make(map[string]string),
devcontainerNames: make(map[string]bool),
knownDevcontainers: make(map[string]codersdk.WorkspaceAgentDevcontainer),
configFileModifiedTimes: make(map[string]time.Time),
ignoredDevcontainers: make(map[string]bool),
recreateSuccessTimes: make(map[string]time.Time),
recreateErrorTimes: make(map[string]time.Time),
scriptLogger: func(uuid.UUID) ScriptLogger { return noopScriptLogger{} },
injectedSubAgentProcs: make(map[string]subAgentProcess),
usingWorkspaceFolderName: make(map[string]bool),
}
// The ctx and logger must be set before applying options to avoid
// nil pointer dereference.
for _, opt := range options {
opt(api)
}
if api.commandEnv != nil {
api.execer = newCommandEnvExecer(
api.logger,
api.commandEnv,
api.execer,
)
}
if api.ccli == nil {
api.ccli = NewDockerCLI(api.execer)
}
if api.dccli == nil {
api.dccli = NewDevcontainerCLI(logger.Named("devcontainer-cli"), api.execer)
}
if api.watcher == nil {
var err error
api.watcher, err = watcher.NewFSNotify()
if err != nil {
logger.Error(ctx, "create file watcher service failed", slog.Error(err))
api.watcher = watcher.NewNoop()
}
}
if api.subAgentClient.Load() == nil {
var c SubAgentClient = noopSubAgentClient{}
api.subAgentClient.Store(&c)
}
return api
}
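// Construction sketch: tests in this package typically swap in fakes via
// options, e.g. a quartz mock clock and a no-op watcher (assumed usage):
//
//	mClock := quartz.NewMock(t)
//	api := NewAPI(logger,
//		WithClock(mClock),
//		WithWatcher(watcher.NewNoop()),
//	)
//	defer api.Close()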
// Init applies a final set of options to the API and then
// closes initDone. This method can only be called once.
func (api *API) Init(opts ...Option) {
api.mu.Lock()
defer api.mu.Unlock()
if api.closed {
return
}
select {
case <-api.initDone:
return
default:
}
defer close(api.initDone)
for _, opt := range opts {
opt(api)
}
}
// Start starts the API by initializing the watcher and updater loops.
// Start calls Init; if options need to be applied after the API has
// been created, call Init before Start. This method must only be
// called once.
func (api *API) Start() {
api.Init()
api.mu.Lock()
defer api.mu.Unlock()
if api.closed {
return
}
api.watcherDone = make(chan struct{})
api.updaterDone = make(chan struct{})
go api.watcherLoop()
go api.updaterLoop()
}
func (api *API) watcherLoop() {
defer close(api.watcherDone)
defer api.logger.Debug(api.ctx, "watcher loop stopped")
api.logger.Debug(api.ctx, "watcher loop started")
for {
event, err := api.watcher.Next(api.ctx)
if err != nil {
if errors.Is(err, watcher.ErrClosed) {
api.logger.Debug(api.ctx, "watcher closed")
return
}
if api.ctx.Err() != nil {
api.logger.Debug(api.ctx, "api context canceled")
return
}
api.logger.Error(api.ctx, "watcher error waiting for next event", slog.Error(err))
continue
}
if event == nil {
continue
}
now := api.clock.Now("agentcontainers", "watcherLoop")
switch {
case event.Has(fsnotify.Create | fsnotify.Write):
api.logger.Debug(api.ctx, "devcontainer config file changed", slog.F("file", event.Name))
api.markDevcontainerDirty(event.Name, now)
case event.Has(fsnotify.Remove):
api.logger.Debug(api.ctx, "devcontainer config file removed", slog.F("file", event.Name))
api.markDevcontainerDirty(event.Name, now)
case event.Has(fsnotify.Rename):
api.logger.Debug(api.ctx, "devcontainer config file renamed", slog.F("file", event.Name))
api.markDevcontainerDirty(event.Name, now)
default:
api.logger.Debug(api.ctx, "devcontainer config file event ignored", slog.F("file", event.Name), slog.F("event", event))
}
}
}
// updaterLoop is responsible for periodically updating the container
// list and handling manual refresh requests.
func (api *API) updaterLoop() {
defer close(api.updaterDone)
defer api.logger.Debug(api.ctx, "updater loop stopped")
api.logger.Debug(api.ctx, "updater loop started")
// Make sure we clean up any subagents not tracked by this process
// before starting the update loop and creating new ones.
api.logger.Debug(api.ctx, "cleaning up subagents")
if err := api.cleanupSubAgents(api.ctx); err != nil {
api.logger.Error(api.ctx, "cleanup subagents failed", slog.Error(err))
} else {
api.logger.Debug(api.ctx, "cleanup subagents complete")
}
// Perform an initial update to populate the container list; this
// gives us a guarantee that the API has loaded the initial state
// before returning any responses. This is useful both for tests
// and for anyone looking to interact with the API.
api.logger.Debug(api.ctx, "performing initial containers update")
if err := api.updateContainers(api.ctx); err != nil {
if errors.Is(err, context.Canceled) {
api.logger.Warn(api.ctx, "initial containers update canceled", slog.Error(err))
} else {
api.logger.Error(api.ctx, "initial containers update failed", slog.Error(err))
}
} else {
api.logger.Debug(api.ctx, "initial containers update complete")
}
// We utilize a TickerFunc here instead of a regular Ticker so that
// we can guarantee execution of the updateContainers method after
// advancing the clock.
ticker := api.clock.TickerFunc(api.ctx, api.updateInterval, func() error {
done := make(chan error, 1)
var sent bool
defer func() {
if !sent {
close(done)
}
}()
select {
case <-api.ctx.Done():
return api.ctx.Err()
case api.updateTrigger <- done:
sent = true
err := <-done
if err != nil {
if errors.Is(err, context.Canceled) {
api.logger.Warn(api.ctx, "updater loop ticker canceled", slog.Error(err))
} else {
api.logger.Error(api.ctx, "updater loop ticker failed", slog.Error(err))
}
}
default:
api.logger.Debug(api.ctx, "updater loop ticker skipped, update in progress")
}
return nil // Always nil to keep the ticker going.
}, "agentcontainers", "updaterLoop")
defer func() {
if err := ticker.Wait("agentcontainers", "updaterLoop"); err != nil && !errors.Is(err, context.Canceled) {
api.logger.Error(api.ctx, "updater loop ticker failed", slog.Error(err))
}
}()
for {
select {
case <-api.ctx.Done():
return
case done := <-api.updateTrigger:
// Note that although we pass api.ctx here, updateContainers
// has an internal timeout to prevent long blocking calls.
done <- api.updateContainers(api.ctx)
close(done)
}
}
}
// UpdateSubAgentClient updates the `SubAgentClient` for the API.
func (api *API) UpdateSubAgentClient(client SubAgentClient) {
api.subAgentClient.Store(&client)
}
// Routes returns the HTTP handler for container-related routes.
func (api *API) Routes() http.Handler {
r := chi.NewRouter()
ensureInitDoneMW := func(next http.Handler) http.Handler {
return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
select {
case <-api.ctx.Done():
httpapi.Write(r.Context(), rw, http.StatusServiceUnavailable, codersdk.Response{
Message: "API closed",
Detail: "The API is closed and cannot process requests.",
})
return
case <-r.Context().Done():
return
case <-api.initDone:
// API init is done, we can start processing requests.
}
next.ServeHTTP(rw, r)
})
}
// For now, all endpoints require the initial update to be done.
// If we want to allow some endpoints to be available before
// the initial update, we can enable this per-route.
r.Use(ensureInitDoneMW)
r.Get("/", api.handleList)
// TODO(mafredri): Simplify this route as the previous /devcontainers
// route was dropped. We can drop the /devcontainers prefix here too.
r.Route("/devcontainers/{devcontainer}", func(r chi.Router) {
r.Post("/recreate", api.handleDevcontainerRecreate)
})
return r
}
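// Mounting sketch (assumed integration; the real mount point lives in the
// agent's HTTP API, not in this file):
//
//	r := chi.NewRouter()
//	r.Mount("/api/v0/containers", api.Routes())
//
// With that prefix, GET /api/v0/containers serves the cached container and
// devcontainer state via handleList below.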
// handleList handles the HTTP request to list containers.
func (api *API) handleList(rw http.ResponseWriter, r *http.Request) {
ct, err := api.getContainers()
if err != nil {
httpapi.Write(r.Context(), rw, http.StatusInternalServerError, codersdk.Response{
Message: "Could not get containers",
Detail: err.Error(),
})
return
}
httpapi.Write(r.Context(), rw, http.StatusOK, ct)
}
// updateContainers fetches the latest container list, processes it, and
// updates the cache. It performs locking for updating shared API state.
func (api *API) updateContainers(ctx context.Context) error {
listCtx, listCancel := context.WithTimeout(ctx, defaultOperationTimeout)
defer listCancel()
updated, err := api.ccli.List(listCtx)
if err != nil {
// If the context was canceled, we hold off on clearing the
// containers cache. This is to avoid clearing the cache if
// the update was canceled due to a timeout. Hopefully this
// will clear up on the next update.
if !errors.Is(err, context.Canceled) {
api.mu.Lock()
api.containersErr = err
api.mu.Unlock()
}
return xerrors.Errorf("list containers failed: %w", err)
}
// Clone to avoid test flakes due to data manipulation.
updated.Containers = slices.Clone(updated.Containers)
api.mu.Lock()
defer api.mu.Unlock()
api.processUpdatedContainersLocked(ctx, updated)
api.logger.Debug(ctx, "containers updated successfully", slog.F("container_count", len(api.containers.Containers)), slog.F("warning_count", len(api.containers.Warnings)), slog.F("devcontainer_count", len(api.knownDevcontainers)))
return nil
}
// processUpdatedContainersLocked updates the devcontainer state based
// on the latest list of containers. This method assumes that api.mu is
// held.
func (api *API) processUpdatedContainersLocked(ctx context.Context, updated codersdk.WorkspaceAgentListContainersResponse) {
dcFields := func(dc codersdk.WorkspaceAgentDevcontainer) []slog.Field {
f := []slog.Field{
slog.F("devcontainer_id", dc.ID),
slog.F("devcontainer_name", dc.Name),
slog.F("workspace_folder", dc.WorkspaceFolder),
slog.F("config_path", dc.ConfigPath),
}
if dc.Container != nil {
f = append(f, slog.F("container_id", dc.Container.ID))
f = append(f, slog.F("container_name", dc.Container.FriendlyName))
}
return f
}
// Reset the container links in known devcontainers to detect if
// they still exist.
for _, dc := range api.knownDevcontainers {
dc.Container = nil
api.knownDevcontainers[dc.WorkspaceFolder] = dc
}
// Check if the container is running and update the known devcontainers.
for i := range updated.Containers {
container := &updated.Containers[i] // Grab a reference to the container to allow mutating it.
workspaceFolder := container.Labels[DevcontainerLocalFolderLabel]
configFile := container.Labels[DevcontainerConfigFileLabel]
if workspaceFolder == "" {
continue
}
logger := api.logger.With(
slog.F("container_id", updated.Containers[i].ID),
slog.F("container_name", updated.Containers[i].FriendlyName),
slog.F("workspace_folder", workspaceFolder),
slog.F("config_file", configFile),
)
// Filter out devcontainer tests, unless explicitly set in include filters.
if len(api.containerLabelIncludeFilter) > 0 || container.Labels[DevcontainerIsTestRunLabel] == "true" {
var ok bool
for label, value := range api.containerLabelIncludeFilter {
if v, found := container.Labels[label]; found && v == value {
ok = true
}
}
// Verbose debug logging is fine here since typically filters
// are only used in development or testing environments.
if !ok {
logger.Debug(ctx, "container does not match include filter, ignoring devcontainer", slog.F("container_labels", container.Labels), slog.F("include_filter", api.containerLabelIncludeFilter))
continue
}
logger.Debug(ctx, "container matches include filter, processing devcontainer", slog.F("container_labels", container.Labels), slog.F("include_filter", api.containerLabelIncludeFilter))
}
if dc, ok := api.knownDevcontainers[workspaceFolder]; ok {
// If no config path is set, this devcontainer was defined
// in Terraform without the optional config file. Assume the
// first container with the workspace folder label is the
// one we want to use.
if dc.ConfigPath == "" && configFile != "" {
dc.ConfigPath = configFile
if err := api.watcher.Add(configFile); err != nil {
logger.With(dcFields(dc)...).Error(ctx, "watch devcontainer config file failed", slog.Error(err))
}
}
dc.Container = container
api.knownDevcontainers[dc.WorkspaceFolder] = dc
continue
}
dc := codersdk.WorkspaceAgentDevcontainer{
ID: uuid.New(),
Name: "", // Updated later based on container state.
WorkspaceFolder: workspaceFolder,
ConfigPath: configFile,
Status: "", // Updated later based on container state.
Dirty: false, // Updated later based on config file changes.
Container: container,
}
if configFile != "" {
if err := api.watcher.Add(configFile); err != nil {
logger.With(dcFields(dc)...).Error(ctx, "watch devcontainer config file failed", slog.Error(err))
}
}
api.knownDevcontainers[workspaceFolder] = dc
}
// Iterate through all known devcontainers and update their status
// based on the current state of the containers.
for _, dc := range api.knownDevcontainers {
logger := api.logger.With(dcFields(dc)...)
if dc.Container != nil {
if !api.devcontainerNames[dc.Name] {
// If the devcontainer name wasn't set via Terraform, we will
// attempt to create an agent name based on the workspace folder's
// name. If it is not possible to generate a valid agent name from
// the folder name (i.e. no valid characters), we will instead fall
// back to using the container's friendly name.
dc.Name, api.usingWorkspaceFolderName[dc.WorkspaceFolder] = api.makeAgentName(dc.WorkspaceFolder, dc.Container.FriendlyName)
}
}
switch {
case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusStarting:
continue // This state is handled by the recreation routine.
case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusError && (dc.Container == nil || dc.Container.CreatedAt.Before(api.recreateErrorTimes[dc.WorkspaceFolder])):
continue // The devcontainer needs to be recreated.
case dc.Container != nil:
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped
if dc.Container.Running {
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
}
dc.Dirty = false
if lastModified, hasModTime := api.configFileModifiedTimes[dc.ConfigPath]; hasModTime && dc.Container.CreatedAt.Before(lastModified) {
dc.Dirty = true
}
if dc.Status == codersdk.WorkspaceAgentDevcontainerStatusRunning {
err := api.maybeInjectSubAgentIntoContainerLocked(ctx, dc)
if err != nil {
logger.Error(ctx, "inject subagent into container failed", slog.Error(err))
}
}
case dc.Container == nil:
if !api.devcontainerNames[dc.Name] {
dc.Name = ""
}
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped
dc.Dirty = false
}
delete(api.recreateErrorTimes, dc.WorkspaceFolder)
api.knownDevcontainers[dc.WorkspaceFolder] = dc
}
api.containers = updated
api.containersErr = nil
}
var consecutiveHyphenRegex = regexp.MustCompile("-+")
// `safeAgentName` returns a safe agent name derived from a folder name,
// falling back to the container's friendly name if needed. The second
// return value will be `true` if it succeeded and `false` if it had
// to fall back to the friendly name.
func safeAgentName(name string, friendlyName string) (string, bool) {
// Keep only ASCII letters and digits, replacing everything
// else with a hyphen.
var sb strings.Builder
for _, r := range strings.ToLower(name) {
if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') {
_, _ = sb.WriteRune(r)
} else {
_, _ = sb.WriteRune('-')
}
}
// Remove any consecutive hyphens, and then trim any leading
// and trailing hyphens.
name = consecutiveHyphenRegex.ReplaceAllString(sb.String(), "-")
name = strings.Trim(name, "-")
// Ensure the name of the agent doesn't exceed the maximum agent
// name length.
name = name[:min(len(name), maxAgentNameLength)]
if provisioner.AgentNameRegex.Match([]byte(name)) {
return name, true
}
return safeFriendlyName(friendlyName), false
}
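// For illustration, given the rules above (hand-computed, not from tests):
//
//	safeAgentName("My Project (v2)", "vibrant_turing") // => "my-project-v2", true
//	safeAgentName("!!!", "vibrant_turing")             // => "vibrant-turing", false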
// safeFriendlyName returns an API-safe version of the container's
// friendly name.
//
// See provisioner/regexes.go for the regex used to validate
// the friendly name on the API side.
func safeFriendlyName(name string) string {
name = strings.ToLower(name)
name = strings.ReplaceAll(name, "_", "-")
return name
}
// expandedAgentName creates an agent name by including parent directories
// from the workspace folder path to avoid name collisions. Like `safeAgentName`,
// the second returned value will be true if using the workspace folder name,
// and false if it fell back to the friendly name.
func expandedAgentName(workspaceFolder string, friendlyName string, depth int) (string, bool) {
var parts []string
for part := range strings.SplitSeq(filepath.ToSlash(workspaceFolder), "/") {
if part = strings.TrimSpace(part); part != "" {
parts = append(parts, part)
}
}
if len(parts) == 0 {
return safeFriendlyName(friendlyName), false
}
components := parts[max(0, len(parts)-depth-1):]
expanded := strings.Join(components, "-")
return safeAgentName(expanded, friendlyName)
}
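// For illustration (hand-computed from the logic above):
//
//	expandedAgentName("/home/coder/projects/api", "fn", 0) // => "api", true
//	expandedAgentName("/home/coder/projects/api", "fn", 1) // => "projects-api", true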
// makeAgentName attempts to create an agent name. It will first attempt to
// create an agent name based on the workspace folder, and will eventually
// fall back to the friendly name. Like `safeAgentName`, the second returned
// value will be true if the agent name utilizes the workspace folder, and
// false if it falls back to the friendly name.
func (api *API) makeAgentName(workspaceFolder string, friendlyName string) (string, bool) {
for attempt := 0; attempt <= maxAttemptsToNameAgent; attempt++ {
agentName, usingWorkspaceFolder := expandedAgentName(workspaceFolder, friendlyName, attempt)
if !usingWorkspaceFolder {
return agentName, false
}
if !api.devcontainerNames[agentName] {
return agentName, true
}
}
return safeFriendlyName(friendlyName), false
}
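// For illustration: if "api" collides with an existing devcontainer name,
// each attempt widens the folder path until a free name is found:
//
//	attempt 0: "api"          // taken
//	attempt 1: "projects-api" // free => ("projects-api", true)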
// RefreshContainers triggers an immediate update of the container list
// and waits for it to complete.
func (api *API) RefreshContainers(ctx context.Context) (err error) {
defer func() {
if err != nil {
err = xerrors.Errorf("refresh containers failed: %w", err)
}
}()
done := make(chan error, 1)
var sent bool
defer func() {
if !sent {
close(done)
}
}()
select {
case <-api.ctx.Done():
return xerrors.Errorf("API closed: %w", api.ctx.Err())
case <-ctx.Done():
return ctx.Err()
case api.updateTrigger <- done:
sent = true
select {
case <-api.ctx.Done():
return xerrors.Errorf("API closed: %w", api.ctx.Err())
case <-ctx.Done():
return ctx.Err()
case err := <-done:
return err
}
}
}
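// Usage sketch: the caller-supplied context bounds the wait, while the list
// operation itself is bounded internally by defaultOperationTimeout:
//
//	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
//	defer cancel()
//	if err := api.RefreshContainers(ctx); err != nil {
//		logger.Error(ctx, "refresh containers", slog.Error(err))
//	}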
func (api *API) getContainers() (codersdk.WorkspaceAgentListContainersResponse, error) {
api.mu.RLock()
defer api.mu.RUnlock()
if api.containersErr != nil {
return codersdk.WorkspaceAgentListContainersResponse{}, api.containersErr
}
var devcontainers []codersdk.WorkspaceAgentDevcontainer
if len(api.knownDevcontainers) > 0 {
devcontainers = make([]codersdk.WorkspaceAgentDevcontainer, 0, len(api.knownDevcontainers))
for _, dc := range api.knownDevcontainers {
if api.ignoredDevcontainers[dc.WorkspaceFolder] {
continue
}
// Include the agent if it's running (we're iterating over
// copies, so mutating is fine).
if proc := api.injectedSubAgentProcs[dc.WorkspaceFolder]; proc.agent.ID != uuid.Nil {
dc.Agent = &codersdk.WorkspaceAgentDevcontainerAgent{
ID: proc.agent.ID,
Name: proc.agent.Name,
Directory: proc.agent.Directory,
}
}
devcontainers = append(devcontainers, dc)
}
slices.SortFunc(devcontainers, func(a, b codersdk.WorkspaceAgentDevcontainer) int {
return strings.Compare(a.Name, b.Name)
})
}
return codersdk.WorkspaceAgentListContainersResponse{
Devcontainers: devcontainers,
Containers: slices.Clone(api.containers.Containers),
Warnings: slices.Clone(api.containers.Warnings),
}, nil
}
// handleDevcontainerRecreate handles the HTTP request to recreate a
// devcontainer identified by its ID.
func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Request) {
ctx := r.Context()
devcontainerID := chi.URLParam(r, "devcontainer")
if devcontainerID == "" {
httpapi.Write(ctx, w, http.StatusBadRequest, codersdk.Response{
Message: "Missing devcontainer ID",
Detail: "Devcontainer ID is required to recreate a devcontainer.",
})
return
}
api.mu.Lock()
var dc codersdk.WorkspaceAgentDevcontainer
for _, knownDC := range api.knownDevcontainers {
if knownDC.ID.String() == devcontainerID {
dc = knownDC
break
}
}
if dc.ID == uuid.Nil {
api.mu.Unlock()
httpapi.Write(ctx, w, http.StatusNotFound, codersdk.Response{
Message: "Devcontainer not found.",
Detail: fmt.Sprintf("Could not find devcontainer with ID: %q", devcontainerID),
})
return
}
if dc.Status == codersdk.WorkspaceAgentDevcontainerStatusStarting {
api.mu.Unlock()
httpapi.Write(ctx, w, http.StatusConflict, codersdk.Response{
Message: "Devcontainer recreation already in progress",
Detail: fmt.Sprintf("Recreation for devcontainer %q is already underway.", dc.Name),
})
return
}
// Update the status so that we don't try to recreate the
// devcontainer multiple times in parallel.
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStarting
dc.Container = nil
api.knownDevcontainers[dc.WorkspaceFolder] = dc
go func() {
_ = api.CreateDevcontainer(dc.WorkspaceFolder, dc.ConfigPath, WithRemoveExistingContainer())
}()
api.mu.Unlock()
httpapi.Write(ctx, w, http.StatusAccepted, codersdk.Response{
Message: "Devcontainer recreation initiated",
Detail: fmt.Sprintf("Recreation process for devcontainer %q has started.", dc.Name),
})
}
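// Request sketch (the mount prefix is assumed, see Routes above):
//
//	POST /api/v0/containers/devcontainers/{devcontainer}/recreate
//	=> 202 Accepted, recreation proceeds asynchronously via CreateDevcontainer.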
// CreateDevcontainer should run in its own goroutine and is responsible for
// recreating a devcontainer based on the provided devcontainer configuration.
// It updates the devcontainer status and logs the process. The configPath is
// passed as a parameter for the odd chance that the container being recreated
// has a different config file than the one stored in the devcontainer state.
// The devcontainer status must be set to starting before calling this
// function; the asyncWg bookkeeping is handled internally.
func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...DevcontainerCLIUpOptions) error {
api.mu.Lock()
if api.closed {
api.mu.Unlock()
return nil
}
dc, found := api.knownDevcontainers[workspaceFolder]
if !found {
api.mu.Unlock()
return xerrors.Errorf("devcontainer not found")
}
var (
ctx = api.ctx
logger = api.logger.With(
slog.F("devcontainer_id", dc.ID),
slog.F("devcontainer_name", dc.Name),
slog.F("workspace_folder", dc.WorkspaceFolder),
slog.F("config_path", dc.ConfigPath),
)
)
// Send logs via agent logging facilities.
logSourceID := api.devcontainerLogSourceIDs[dc.WorkspaceFolder]
if logSourceID == uuid.Nil {
api.logger.Debug(api.ctx, "devcontainer log source ID not found, falling back to external log source ID")
logSourceID = agentsdk.ExternalLogSourceID
}
api.asyncWg.Add(1)
defer api.asyncWg.Done()
api.mu.Unlock()
if dc.ConfigPath != configPath {
logger.Warn(ctx, "devcontainer config path mismatch",
slog.F("config_path_param", configPath),
)
}
scriptLogger := api.scriptLogger(logSourceID)
defer func() {
flushCtx, cancel := context.WithTimeout(api.ctx, 5*time.Second)
defer cancel()
if err := scriptLogger.Flush(flushCtx); err != nil {
logger.Error(flushCtx, "flush devcontainer logs failed during recreation", slog.Error(err))
}
}()
infoW := agentsdk.LogsWriter(ctx, scriptLogger.Send, logSourceID, codersdk.LogLevelInfo)
defer infoW.Close()
errW := agentsdk.LogsWriter(ctx, scriptLogger.Send, logSourceID, codersdk.LogLevelError)
defer errW.Close()
logger.Debug(ctx, "starting devcontainer recreation")
upOptions := []DevcontainerCLIUpOptions{WithUpOutput(infoW, errW)}
upOptions = append(upOptions, opts...)
_, err := api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
if err != nil {
// No need to log if the API is closing (context canceled), as this
// is expected behavior when the API is shutting down.
if !errors.Is(err, context.Canceled) {
logger.Error(ctx, "devcontainer creation failed", slog.Error(err))
}
api.mu.Lock()
dc = api.knownDevcontainers[dc.WorkspaceFolder]
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
api.mu.Unlock()
return xerrors.Errorf("start devcontainer: %w", err)
}
logger.Info(ctx, "devcontainer created successfully")
api.mu.Lock()
dc = api.knownDevcontainers[dc.WorkspaceFolder]
// Update the devcontainer status to Running or Stopped based on the
// current state of the container. Moving the status out of starting
// allows the update routine to take over, but to minimize the window
// of API inconsistency we guess the status based on the container
// state.
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped
if dc.Container != nil {
if dc.Container.Running {
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
}
}
dc.Dirty = false
api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "successTimes")
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.mu.Unlock()
// Ensure an immediate refresh to accurately reflect the
// devcontainer state after recreation.
if err := api.RefreshContainers(ctx); err != nil {
logger.Error(ctx, "failed to trigger immediate refresh after devcontainer creation", slog.Error(err))
return xerrors.Errorf("refresh containers: %w", err)
}
return nil
}
// markDevcontainerDirty finds the devcontainer with the given config file path
// and marks it as dirty. It acquires the lock before modifying the state.
func (api *API) markDevcontainerDirty(configPath string, modifiedAt time.Time) {
api.mu.Lock()
defer api.mu.Unlock()
// Record the timestamp of when this configuration file was modified.
api.configFileModifiedTimes[configPath] = modifiedAt
for _, dc := range api.knownDevcontainers {
if dc.ConfigPath != configPath {
continue
}
logger := api.logger.With(
slog.F("devcontainer_id", dc.ID),
slog.F("devcontainer_name", dc.Name),
slog.F("workspace_folder", dc.WorkspaceFolder),
slog.F("file", configPath),
slog.F("modified_at", modifiedAt),
)
// TODO(mafredri): Simplistic mark for now, we should check if the
// container is running and if the config file was modified after
// the container was created.
if !dc.Dirty {
logger.Info(api.ctx, "marking devcontainer as dirty")
dc.Dirty = true
}
if _, ok := api.ignoredDevcontainers[dc.WorkspaceFolder]; ok {
logger.Debug(api.ctx, "clearing devcontainer ignored state")
delete(api.ignoredDevcontainers, dc.WorkspaceFolder) // Allow re-reading config.
}
api.knownDevcontainers[dc.WorkspaceFolder] = dc
}
}
// cleanupSubAgents removes subagents that are no longer managed by
// this agent. This is usually only run at startup to ensure a clean
// slate. This method has an internal timeout to prevent blocking
// indefinitely if something goes wrong with the subagent deletion.
func (api *API) cleanupSubAgents(ctx context.Context) error {
client := *api.subAgentClient.Load()
agents, err := client.List(ctx)
if err != nil {
return xerrors.Errorf("list agents: %w", err)
}
if len(agents) == 0 {
return nil
}
api.mu.Lock()
defer api.mu.Unlock()
injected := make(map[uuid.UUID]bool, len(api.injectedSubAgentProcs))
for _, proc := range api.injectedSubAgentProcs {
injected[proc.agent.ID] = true
}
ctx, cancel := context.WithTimeout(ctx, defaultOperationTimeout)
defer cancel()
for _, agent := range agents {
if injected[agent.ID] {
continue
}
client := *api.subAgentClient.Load()
err := client.Delete(ctx, agent.ID)
if err != nil {
api.logger.Error(ctx, "failed to delete agent",
slog.Error(err),
slog.F("agent_id", agent.ID),
slog.F("agent_name", agent.Name),
)
}
}
return nil
}
// maybeInjectSubAgentIntoContainerLocked injects a subagent into a dev
// container and starts the subagent process. This method assumes that
// api.mu is held. This method is idempotent and will not re-inject the
// subagent if it is already/still running in the container.
//
// This method uses an internal timeout to prevent blocking indefinitely
// if something goes wrong with the injection.
func (api *API) maybeInjectSubAgentIntoContainerLocked(ctx context.Context, dc codersdk.WorkspaceAgentDevcontainer) (err error) {
if api.ignoredDevcontainers[dc.WorkspaceFolder] {
return nil
}
ctx, cancel := context.WithTimeout(ctx, defaultOperationTimeout)
defer cancel()
container := dc.Container
if container == nil {
return xerrors.New("container is nil, cannot inject subagent")
}
logger := api.logger.With(
slog.F("devcontainer_id", dc.ID),
slog.F("devcontainer_name", dc.Name),
slog.F("workspace_folder", dc.WorkspaceFolder),
slog.F("config_path", dc.ConfigPath),
slog.F("container_id", container.ID),
slog.F("container_name", container.FriendlyName),
)
// Check if subagent already exists for this devcontainer.
maybeRecreateSubAgent := false
proc, injected := api.injectedSubAgentProcs[dc.WorkspaceFolder]
if injected {
if _, ignoreChecked := api.ignoredDevcontainers[dc.WorkspaceFolder]; !ignoreChecked {
// If ignore status has not yet been checked, or cleared by
// modifications to the devcontainer.json, we must read it
// to determine the current status. This can happen while
// the devcontainer subagent is already running or before
// we've had a chance to inject it.
//
// Note, for simplicity, we do not try to optimize to reduce
// ReadConfig calls here.
config, err := api.dccli.ReadConfig(ctx, dc.WorkspaceFolder, dc.ConfigPath, nil)
if err != nil {
return xerrors.Errorf("read devcontainer config: %w", err)
}
dcIgnored := config.Configuration.Customizations.Coder.Ignore
if dcIgnored {
proc.stop()
if proc.agent.ID != uuid.Nil {
// Unlock while doing the delete operation.
api.mu.Unlock()
client := *api.subAgentClient.Load()
if err := client.Delete(ctx, proc.agent.ID); err != nil {
api.mu.Lock()
return xerrors.Errorf("delete subagent: %w", err)
}
api.mu.Lock()
}
// Reset agent and containerID to force config re-reading if ignore is toggled.
proc.agent = SubAgent{}
proc.containerID = ""
api.injectedSubAgentProcs[dc.WorkspaceFolder] = proc
api.ignoredDevcontainers[dc.WorkspaceFolder] = dcIgnored
return nil
}
}
if proc.containerID == container.ID && proc.ctx.Err() == nil {
// Same container and running, no need to reinject.
return nil
}
if proc.containerID != container.ID {
// For now, always recreate the subagent if the container ID
// changed; in the future we can inspect e.g. whether coder_apps
// remain the same and avoid unnecessary recreation.
logger.Debug(ctx, "container ID changed, injecting subagent into new container",
slog.F("old_container_id", proc.containerID),
)
maybeRecreateSubAgent = proc.agent.ID != uuid.Nil
}
// Container ID changed or the subagent process is not running,
// stop the existing subagent context to replace it.
proc.stop()
}
if proc.agent.OperatingSystem == "" {
// Set SubAgent defaults.
proc.agent.OperatingSystem = "linux" // Assuming Linux for devcontainers.
}
// Prepare the subAgentProcess to be used when running the subagent.
// We use api.ctx here to ensure that the process keeps running
// after this method returns.
proc.ctx, proc.stop = context.WithCancel(api.ctx)
api.injectedSubAgentProcs[dc.WorkspaceFolder] = proc
// This is used to track the goroutine that will run the subagent
// process inside the container. It will be decremented when the
// subagent process completes or if an error occurs before we can
// start the subagent.
api.asyncWg.Add(1)
ranSubAgent := false
// Clean up if injection fails.
var dcIgnored, setDCIgnored bool
defer func() {
if setDCIgnored {
api.ignoredDevcontainers[dc.WorkspaceFolder] = dcIgnored
}
if !ranSubAgent {
proc.stop()
if !api.closed {
// Ensure state modifications are reflected.
api.injectedSubAgentProcs[dc.WorkspaceFolder] = proc
}
api.asyncWg.Done()
}
}()
// Unlock the mutex to allow other operations while we
// inject the subagent into the container.
api.mu.Unlock()
defer api.mu.Lock() // Re-lock.
arch, err := api.ccli.DetectArchitecture(ctx, container.ID)
if err != nil {
return xerrors.Errorf("detect architecture: %w", err)
}
logger.Info(ctx, "detected container architecture", slog.F("architecture", arch))
// For now, only support injecting if the architecture matches the host.
hostArch := runtime.GOARCH
// TODO(mafredri): Add support for downloading agents for supported architectures.
if arch != hostArch {
logger.Warn(ctx, "skipping subagent injection for unsupported architecture",
slog.F("container_arch", arch),
slog.F("host_arch", hostArch),
)
return nil
}
if proc.agent.ID == uuid.Nil {
proc.agent.Architecture = arch
}
subAgentConfig := proc.agent.CloneConfig(dc)
if proc.agent.ID == uuid.Nil || maybeRecreateSubAgent {
subAgentConfig.Architecture = arch
displayAppsMap := map[codersdk.DisplayApp]bool{
// NOTE(DanielleMaywood):
// We use the same defaults here as set in terraform-provider-coder.
// https://github.com/coder/terraform-provider-coder/blob/c1c33f6d556532e75662c0ca373ed8fdea220eb5/provider/agent.go#L38-L51
codersdk.DisplayAppVSCodeDesktop: true,
codersdk.DisplayAppVSCodeInsiders: false,
codersdk.DisplayAppWebTerminal: true,
codersdk.DisplayAppSSH: true,
codersdk.DisplayAppPortForward: true,
}
var (
featureOptionsAsEnvs []string
appsWithPossibleDuplicates []SubAgentApp
workspaceFolder = DevcontainerDefaultContainerWorkspaceFolder
)
if err := func() error {
var (
config DevcontainerConfig
configOutdated bool
)
readConfig := func() (DevcontainerConfig, error) {
return api.dccli.ReadConfig(ctx, dc.WorkspaceFolder, dc.ConfigPath,
append(featureOptionsAsEnvs, []string{
fmt.Sprintf("CODER_WORKSPACE_AGENT_NAME=%s", subAgentConfig.Name),
fmt.Sprintf("CODER_WORKSPACE_OWNER_NAME=%s", api.ownerName),
fmt.Sprintf("CODER_WORKSPACE_NAME=%s", api.workspaceName),
fmt.Sprintf("CODER_WORKSPACE_PARENT_AGENT_NAME=%s", api.parentAgent),
fmt.Sprintf("CODER_URL=%s", api.subAgentURL),
fmt.Sprintf("CONTAINER_ID=%s", container.ID),
}...),
)
}
if config, err = readConfig(); err != nil {
return err
}
// We only allow ignore to be set in the root customization layer to
// prevent weird interactions with devcontainer features.
dcIgnored, setDCIgnored = config.Configuration.Customizations.Coder.Ignore, true
if dcIgnored {
return nil
}
workspaceFolder = config.Workspace.WorkspaceFolder
featureOptionsAsEnvs = config.MergedConfiguration.Features.OptionsAsEnvs()
if len(featureOptionsAsEnvs) > 0 {
configOutdated = true
}
// NOTE(DanielleMaywood):
// We only want to take an agent name specified in the root customization layer.
// This restricts the ability for a feature to specify the agent name. We may revisit
// this in the future, but for now we want to restrict this behavior.
if name := config.Configuration.Customizations.Coder.Name; name != "" {
// We only want to pick this name if it is a valid name.
if provisioner.AgentNameRegex.Match([]byte(name)) {
subAgentConfig.Name = name
configOutdated = true
delete(api.usingWorkspaceFolderName, dc.WorkspaceFolder)
} else {
logger.Warn(ctx, "invalid name in devcontainer customization, ignoring",
slog.F("name", name),
slog.F("regex", provisioner.AgentNameRegex.String()),
)
}
}
if configOutdated {
if config, err = readConfig(); err != nil {
return err
}
}
coderCustomization := config.MergedConfiguration.Customizations.Coder
for _, customization := range coderCustomization {
for app, enabled := range customization.DisplayApps {
if _, ok := displayAppsMap[app]; !ok {
logger.Warn(ctx, "unknown display app in devcontainer customization, ignoring",
slog.F("app", app),
slog.F("enabled", enabled),
)
continue
}
displayAppsMap[app] = enabled
}
appsWithPossibleDuplicates = append(appsWithPossibleDuplicates, customization.Apps...)
}
return nil
}(); err != nil {
api.logger.Error(ctx, "unable to read devcontainer config", slog.Error(err))
}
if dcIgnored {
proc.stop()
if proc.agent.ID != uuid.Nil {
// If we stop the subagent, we also need to delete it.
client := *api.subAgentClient.Load()
if err := client.Delete(ctx, proc.agent.ID); err != nil {
return xerrors.Errorf("delete subagent: %w", err)
}
}
// Reset agent and containerID to force config re-reading if
// ignore is toggled.
proc.agent = SubAgent{}
proc.containerID = ""
return nil
}
displayApps := make([]codersdk.DisplayApp, 0, len(displayAppsMap))
for app, enabled := range displayAppsMap {
if enabled {
displayApps = append(displayApps, app)
}
}
slices.Sort(displayApps)
appSlugs := make(map[string]struct{})
apps := make([]SubAgentApp, 0, len(appsWithPossibleDuplicates))
// We want to deduplicate the apps based on their slugs here.
// As we want to prioritize later apps, we will walk through this
// backwards.
for _, app := range slices.Backward(appsWithPossibleDuplicates) {
if _, slugAlreadyExists := appSlugs[app.Slug]; slugAlreadyExists {
continue
}
appSlugs[app.Slug] = struct{}{}
apps = append(apps, app)
}
// Apps is currently in reverse order here, so by reversing it we restore
// it to the original order.
slices.Reverse(apps)
subAgentConfig.DisplayApps = displayApps
subAgentConfig.Apps = apps
subAgentConfig.Directory = workspaceFolder
}
agentBinaryPath, err := os.Executable()
if err != nil {
return xerrors.Errorf("get agent binary path: %w", err)
}
agentBinaryPath, err = filepath.EvalSymlinks(agentBinaryPath)
if err != nil {
return xerrors.Errorf("resolve agent binary path: %w", err)
}
// If we scripted this as a `/bin/sh` script, we could reduce these
// steps to one instruction, speeding up the injection process.
//
// Note: We use `path` instead of `filepath` here because we are
// working with Unix-style paths inside the container.
if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "mkdir", "-p", path.Dir(coderPathInsideContainer)); err != nil {
return xerrors.Errorf("create agent directory in container: %w", err)
}
if err := api.ccli.Copy(ctx, container.ID, agentBinaryPath, coderPathInsideContainer); err != nil {
return xerrors.Errorf("copy agent binary: %w", err)
}
logger.Info(ctx, "copied agent binary to container")
// Make sure the agent binary is executable so we can run it (the
// user doesn't matter since we're making it executable for all).
if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "chmod", "0755", path.Dir(coderPathInsideContainer), coderPathInsideContainer); err != nil {
return xerrors.Errorf("set agent binary executable: %w", err)
}
// Make sure the agent binary is owned by a valid user so we can run it.
if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "/bin/sh", "-c", fmt.Sprintf("chown $(id -u):$(id -g) %s", coderPathInsideContainer)); err != nil {
return xerrors.Errorf("set agent binary ownership: %w", err)
}
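// A hypothetical consolidation of the two post-copy exec steps above into a
// single /bin/sh invocation (the mkdir must still precede the copy):
//
//	script := fmt.Sprintf("chmod 0755 %[1]s %[2]s && chown $(id -u):$(id -g) %[2]s",
//		path.Dir(coderPathInsideContainer), coderPathInsideContainer)
//	_, err := api.ccli.ExecAs(ctx, container.ID, "root", "/bin/sh", "-c", script)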
// Attempt to add CAP_NET_ADMIN to the binary to improve network
// performance (optional, allow to fail). See `bootstrap_linux.sh`.
// TODO(mafredri): Disable for now until we can figure out why this
// causes the following error on some images:
//
// Image: mcr.microsoft.com/devcontainers/base:ubuntu
// Error: /.coder-agent/coder: Operation not permitted
//
// if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "setcap", "cap_net_admin+ep", coderPathInsideContainer); err != nil {
// logger.Warn(ctx, "set CAP_NET_ADMIN on agent binary failed", slog.Error(err))
// }
deleteSubAgent := proc.agent.ID != uuid.Nil && maybeRecreateSubAgent && !proc.agent.EqualConfig(subAgentConfig)
if deleteSubAgent {
logger.Debug(ctx, "deleting existing subagent for recreation", slog.F("agent_id", proc.agent.ID))
client := *api.subAgentClient.Load()
err = client.Delete(ctx, proc.agent.ID)
if err != nil {
return xerrors.Errorf("delete existing subagent failed: %w", err)
}
proc.agent = SubAgent{} // Clear agent to signal that we need to create a new one.
}
if proc.agent.ID == uuid.Nil {
logger.Debug(ctx, "creating new subagent",
slog.F("directory", subAgentConfig.Directory),
slog.F("display_apps", subAgentConfig.DisplayApps),
)
// Create new subagent record in the database to receive the auth token.
// If we get a unique constraint violation, try with expanded names that
// include parent directories to avoid collisions.
client := *api.subAgentClient.Load()
originalName := subAgentConfig.Name
for attempt := 1; attempt <= maxAttemptsToNameAgent; attempt++ {
agent, err := client.Create(ctx, subAgentConfig)
if err == nil {
proc.agent = agent // Only reassign on success.
if api.usingWorkspaceFolderName[dc.WorkspaceFolder] {
api.devcontainerNames[dc.Name] = true
delete(api.usingWorkspaceFolderName, dc.WorkspaceFolder)
}
break
}
// NOTE(DanielleMaywood):
// Ordinarily we'd use `errors.As` here, but it didn't appear to work. Not
// sure if this is because of the communication protocol? Instead I've opted
// for a slightly more janky string contains approach.
//
// We only care if sub agent creation has failed due to a unique constraint
// violation on the agent name, as we can _possibly_ rectify this.
if !strings.Contains(err.Error(), "workspace agent name") {
return xerrors.Errorf("create subagent failed: %w", err)
}
// If there has been a unique constraint violation but the user is *not*
// using an auto-generated name, then we should error. This is because
// we do not want to surprise the user with a name they did not ask for.
if usingFolderName := api.usingWorkspaceFolderName[dc.WorkspaceFolder]; !usingFolderName {
return xerrors.Errorf("create subagent failed: %w", err)
}
if attempt == maxAttemptsToNameAgent {
return xerrors.Errorf("create subagent failed after %d attempts: %w", attempt, err)
}
// We increase how much of the workspace folder is used for generating
// the agent name. With each iteration there is greater chance of this
// being successful.
subAgentConfig.Name, api.usingWorkspaceFolderName[dc.WorkspaceFolder] = expandedAgentName(dc.WorkspaceFolder, dc.Container.FriendlyName, attempt)
logger.Debug(ctx, "retrying subagent creation with expanded name",
slog.F("original_name", originalName),
slog.F("expanded_name", subAgentConfig.Name),
slog.F("attempt", attempt+1),
)
}
logger.Info(ctx, "created new subagent", slog.F("agent_id", proc.agent.ID))
} else {
logger.Debug(ctx, "subagent already exists, skipping recreation",
slog.F("agent_id", proc.agent.ID),
)
}
api.mu.Lock() // Re-lock to update the agent.
defer api.mu.Unlock()
if api.closed {
deleteCtx, deleteCancel := context.WithTimeout(context.Background(), defaultOperationTimeout)
defer deleteCancel()
client := *api.subAgentClient.Load()
err := client.Delete(deleteCtx, proc.agent.ID)
if err != nil {
return xerrors.Errorf("delete existing subagent failed after API closed: %w", err)
}
return nil
}
// If we got this far, we should update the container ID to make
// sure we don't retry. If we update it too soon we may end up
// using an old subagent if e.g. delete failed previously.
proc.containerID = container.ID
api.injectedSubAgentProcs[dc.WorkspaceFolder] = proc
// Start the subagent in the container in a new goroutine to avoid
// blocking. Note that we pass the api.ctx to the subagent process
// so that it isn't affected by the timeout.
go api.runSubAgentInContainer(api.ctx, logger, dc, proc, coderPathInsideContainer)
ranSubAgent = true
return nil
}
// runSubAgentInContainer runs the subagent process inside a dev
// container. The api.asyncWg must be incremented before calling this
// function, and it will be decremented when the subagent process
// completes or if an error occurs.
func (api *API) runSubAgentInContainer(ctx context.Context, logger slog.Logger, dc codersdk.WorkspaceAgentDevcontainer, proc subAgentProcess, agentPath string) {
container := dc.Container // Must not be nil.
logger = logger.With(
slog.F("agent_id", proc.agent.ID),
)
defer func() {
proc.stop()
logger.Debug(ctx, "agent process cleanup complete")
api.asyncWg.Done()
}()
logger.Info(ctx, "starting subagent in devcontainer")
env := []string{
"CODER_AGENT_URL=" + api.subAgentURL,
"CODER_AGENT_TOKEN=" + proc.agent.AuthToken.String(),
}
env = append(env, api.subAgentEnv...)
err := api.dccli.Exec(proc.ctx, dc.WorkspaceFolder, dc.ConfigPath, agentPath, []string{"agent"},
WithExecContainerID(container.ID),
WithRemoteEnv(env...),
)
if err != nil && !errors.Is(err, context.Canceled) {
logger.Error(ctx, "subagent process failed", slog.Error(err))
} else {
logger.Info(ctx, "subagent process finished")
}
}
func (api *API) Close() error {
api.mu.Lock()
if api.closed {
api.mu.Unlock()
return nil
}
api.logger.Debug(api.ctx, "closing API")
api.closed = true
// Stop all running subagent processes and clean up.
subAgentIDs := make([]uuid.UUID, 0, len(api.injectedSubAgentProcs))
for workspaceFolder, proc := range api.injectedSubAgentProcs {
api.logger.Debug(api.ctx, "canceling subagent process",
slog.F("agent_name", proc.agent.Name),
slog.F("agent_id", proc.agent.ID),
slog.F("container_id", proc.containerID),
slog.F("workspace_folder", workspaceFolder),
)
proc.stop()
if proc.agent.ID != uuid.Nil {
subAgentIDs = append(subAgentIDs, proc.agent.ID)
}
}
api.injectedSubAgentProcs = make(map[string]subAgentProcess)
api.cancel() // Interrupt all routines.
api.mu.Unlock() // Release lock before waiting for goroutines.
// Note: We can't use api.ctx here because it's canceled.
deleteCtx, deleteCancel := context.WithTimeout(context.Background(), defaultOperationTimeout)
defer deleteCancel()
client := *api.subAgentClient.Load()
for _, id := range subAgentIDs {
err := client.Delete(deleteCtx, id)
if err != nil {
api.logger.Error(api.ctx, "delete subagent record during shutdown failed",
slog.Error(err),
slog.F("agent_id", id),
)
}
}
// Close the watcher to ensure its loop finishes.
err := api.watcher.Close()
// Wait for loops to finish.
if api.watcherDone != nil {
<-api.watcherDone
}
if api.updaterDone != nil {
<-api.updaterDone
}
// Wait for all async tasks to complete.
api.asyncWg.Wait()
api.logger.Debug(api.ctx, "closed API")
return err
}