mirror of
https://github.com/coder/coder.git
synced 2025-07-21 01:28:49 +00:00
refactor: replace startup script logs EOF with starting/ready time (#8082)
This commit reverts some of the changes in #8029 and implements an alternative method of keeping track of when the startup script has ended and there will be no more logs. This is achieved by adding new agent fields for tracking when the agent enters the "starting" and "ready"/"start_error" lifecycle states. The timestamps simplify logic since we don't need understand if the current state is before or after the state we're interested in. They can also be used to show data like how long the startup script took to execute. This also allowed us to remove the EOF field from the logs as the implementation was problematic when we returned the EOF log entry in the response since requesting _after_ that ID would give no logs and the API would thus lose track of EOF.
This commit is contained in:
committed by
GitHub
parent
b1d1b63113
commit
8dac0356ed
131
agent/agent.go
131
agent/agent.go
@ -14,6 +14,7 @@ import (
|
||||
"net/http"
|
||||
"net/netip"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
@ -126,6 +127,7 @@ func New(options Options) Agent {
|
||||
tempDir: options.TempDir,
|
||||
lifecycleUpdate: make(chan struct{}, 1),
|
||||
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
|
||||
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
|
||||
ignorePorts: options.IgnorePorts,
|
||||
connStatsChan: make(chan *agentsdk.Stats, 1),
|
||||
sshMaxTimeout: options.SSHMaxTimeout,
|
||||
@ -170,7 +172,7 @@ type agent struct {
|
||||
lifecycleUpdate chan struct{}
|
||||
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
|
||||
lifecycleMu sync.RWMutex // Protects following.
|
||||
lifecycleState codersdk.WorkspaceAgentLifecycle
|
||||
lifecycleStates []agentsdk.PostLifecycleRequest
|
||||
|
||||
network *tailnet.Conn
|
||||
connStatsChan chan *agentsdk.Stats
|
||||
@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
|
||||
// reportLifecycleLoop reports the current lifecycle state once.
|
||||
// Only the latest state is reported, intermediate states may be
|
||||
// lost if the agent can't communicate with the API.
|
||||
// reportLifecycleLoop reports the current lifecycle state once. All state
|
||||
// changes are reported in order.
|
||||
func (a *agent) reportLifecycleLoop(ctx context.Context) {
|
||||
var lastReported codersdk.WorkspaceAgentLifecycle
|
||||
lastReportedIndex := 0 // Start off with the created state without reporting it.
|
||||
for {
|
||||
select {
|
||||
case <-a.lifecycleUpdate:
|
||||
@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
|
||||
|
||||
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
|
||||
a.lifecycleMu.RLock()
|
||||
state := a.lifecycleState
|
||||
lastIndex := len(a.lifecycleStates) - 1
|
||||
report := a.lifecycleStates[lastReportedIndex]
|
||||
if len(a.lifecycleStates) > lastReportedIndex+1 {
|
||||
report = a.lifecycleStates[lastReportedIndex+1]
|
||||
}
|
||||
a.lifecycleMu.RUnlock()
|
||||
|
||||
if state == lastReported {
|
||||
if lastIndex == lastReportedIndex {
|
||||
break
|
||||
}
|
||||
|
||||
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state))
|
||||
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))
|
||||
|
||||
err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
|
||||
State: state,
|
||||
})
|
||||
err := a.client.PostLifecycle(ctx, report)
|
||||
if err == nil {
|
||||
lastReported = state
|
||||
lastReportedIndex++
|
||||
select {
|
||||
case a.lifecycleReported <- state:
|
||||
case a.lifecycleReported <- report.State:
|
||||
case <-a.lifecycleReported:
|
||||
a.lifecycleReported <- state
|
||||
a.lifecycleReported <- report.State
|
||||
}
|
||||
if lastReportedIndex < lastIndex {
|
||||
// Keep reporting until we've sent all messages, we can't
|
||||
// rely on the channel triggering us before the backlog is
|
||||
// consumed.
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
|
||||
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
|
||||
// The state is only updated if it's a valid state transition.
|
||||
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
|
||||
report := agentsdk.PostLifecycleRequest{
|
||||
State: state,
|
||||
ChangedAt: database.Now(),
|
||||
}
|
||||
|
||||
a.lifecycleMu.Lock()
|
||||
lastState := a.lifecycleState
|
||||
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
|
||||
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
|
||||
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
|
||||
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
|
||||
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
|
||||
a.lifecycleMu.Unlock()
|
||||
return
|
||||
}
|
||||
a.lifecycleState = state
|
||||
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
|
||||
a.lifecycleStates = append(a.lifecycleStates, report)
|
||||
a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
|
||||
a.lifecycleMu.Unlock()
|
||||
|
||||
select {
|
||||
@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
|
||||
|
||||
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
|
||||
scriptDone := make(chan error, 1)
|
||||
scriptStart := time.Now()
|
||||
err = a.trackConnGoroutine(func() {
|
||||
defer close(scriptDone)
|
||||
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
|
||||
@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
|
||||
select {
|
||||
case err = <-scriptDone:
|
||||
case <-timeout:
|
||||
a.logger.Warn(ctx, "startup script timed out")
|
||||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
|
||||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
|
||||
err = <-scriptDone // The script can still complete after a timeout.
|
||||
}
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return
|
||||
}
|
||||
// Only log if there was a startup script.
|
||||
if manifest.StartupScript != "" {
|
||||
execTime := time.Since(scriptStart)
|
||||
if err != nil {
|
||||
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
|
||||
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
|
||||
} else {
|
||||
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
|
||||
if err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return
|
||||
}
|
||||
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
|
||||
}
|
||||
a.setLifecycle(ctx, lifecycleState)
|
||||
}()
|
||||
@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
|
||||
return a.runScript(ctx, "shutdown", script)
|
||||
}
|
||||
|
||||
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
|
||||
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
|
||||
if script == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
|
||||
logger := a.logger.With(slog.F("lifecycle", lifecycle))
|
||||
|
||||
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
|
||||
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
|
||||
}
|
||||
defer func() {
|
||||
_ = fileWriter.Close()
|
||||
err := fileWriter.Close()
|
||||
if err != nil {
|
||||
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
|
||||
}
|
||||
}()
|
||||
|
||||
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
|
||||
}
|
||||
cmd := cmdPty.AsExec()
|
||||
|
||||
var writer io.Writer = fileWriter
|
||||
if lifecycle == "startup" {
|
||||
// Create pipes for startup logs reader and writer
|
||||
@ -854,7 +871,7 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
|
||||
writer = io.MultiWriter(fileWriter, logsWriter)
|
||||
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("track script logs: %w", err)
|
||||
return xerrors.Errorf("track %s script logs: %w", lifecycle, err)
|
||||
}
|
||||
defer func() {
|
||||
_ = logsWriter.Close()
|
||||
@ -862,13 +879,26 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
|
||||
}()
|
||||
}
|
||||
|
||||
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
|
||||
if err != nil {
|
||||
return xerrors.Errorf("create command: %w", err)
|
||||
}
|
||||
cmd := cmdPty.AsExec()
|
||||
cmd.Stdout = writer
|
||||
cmd.Stderr = writer
|
||||
|
||||
start := time.Now()
|
||||
defer func() {
|
||||
end := time.Now()
|
||||
execTime := end.Sub(start)
|
||||
exitCode := 0
|
||||
if err != nil {
|
||||
exitCode = 255 // Unknown status.
|
||||
var exitError *exec.ExitError
|
||||
if xerrors.As(err, &exitError) {
|
||||
exitCode = exitError.ExitCode()
|
||||
}
|
||||
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
|
||||
} else {
|
||||
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
|
||||
}
|
||||
}()
|
||||
|
||||
err = cmd.Run()
|
||||
if err != nil {
|
||||
// cmd.Run does not return a context canceled error, it returns "signal: killed".
|
||||
@ -876,7 +906,7 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
return xerrors.Errorf("run: %w", err)
|
||||
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
|
||||
if errors.As(err, &sdkErr) {
|
||||
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
|
||||
a.logger.Warn(ctx, "startup logs too large, dropping logs")
|
||||
// Always send the EOF even if logs overflow.
|
||||
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF {
|
||||
logsToSend = logsToSend[len(logsToSend)-1:]
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
|
||||
if err := scanner.Err(); err != nil {
|
||||
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
|
||||
}
|
||||
queueLog(agentsdk.StartupLog{
|
||||
CreatedAt: database.Now(),
|
||||
Output: "",
|
||||
EOF: true,
|
||||
})
|
||||
defer close(logsFinished)
|
||||
logsFlushed.L.Lock()
|
||||
for {
|
||||
@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
|
||||
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
|
||||
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
|
||||
scriptDone := make(chan error, 1)
|
||||
scriptStart := time.Now()
|
||||
go func() {
|
||||
defer close(scriptDone)
|
||||
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
|
||||
@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
|
||||
select {
|
||||
case err = <-scriptDone:
|
||||
case <-timeout:
|
||||
a.logger.Warn(ctx, "shutdown script timed out")
|
||||
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
|
||||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
|
||||
err = <-scriptDone // The script can still complete after a timeout.
|
||||
}
|
||||
execTime := time.Since(scriptStart)
|
||||
if err != nil {
|
||||
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
|
||||
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
|
||||
} else {
|
||||
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user