refactor: replace startup script logs EOF with starting/ready time (#8082)

This commit reverts some of the changes in #8029 and implements an
alternative way of tracking when the startup script has ended and no
more logs will follow.

This is achieved by adding new agent fields that track when the agent
enters the "starting" and "ready"/"start_error" lifecycle states. The
timestamps simplify the logic since we no longer need to work out
whether the current state is before or after the state we're interested
in, and they can also be used to show data like how long the startup
script took to execute. This also allowed us to remove the EOF field
from the logs: that implementation was problematic because, once the
EOF log entry was returned in a response, requesting logs _after_ that
ID would give no logs and the API would thus lose track of EOF.
Mathias Fredriksson
2023-06-20 14:41:55 +03:00
committed by GitHub
parent b1d1b63113
commit 8dac0356ed
29 changed files with 462 additions and 540 deletions
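
For context, a minimal sketch of how the new timestamps might be consumed
on the client side; the field names below are illustrative stand-ins, not
the exact fields added by this commit:

	package main

	import (
		"fmt"
		"time"
	)

	// agentLifecycleTimes stands in for the new per-agent timestamps: one
	// recorded when the lifecycle enters "starting", one when it reaches
	// "ready" or "start_error".
	type agentLifecycleTimes struct {
		StartedAt *time.Time
		ReadyAt   *time.Time
	}

	// startupDuration derives how long the startup script ran. With the EOF
	// log entry gone, a non-nil ReadyAt is the signal that no more startup
	// logs will arrive.
	func startupDuration(t agentLifecycleTimes) (time.Duration, bool) {
		if t.StartedAt == nil || t.ReadyAt == nil {
			return 0, false
		}
		return t.ReadyAt.Sub(*t.StartedAt), true
	}

	func main() {
		started := time.Now().Add(-42 * time.Second)
		ready := time.Now()
		if d, ok := startupDuration(agentLifecycleTimes{StartedAt: &started, ReadyAt: &ready}); ok {
			fmt.Printf("startup script took %s\n", d.Round(time.Second))
		}
	}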

@@ -14,6 +14,7 @@ import (
"net/http"
"net/netip"
"os"
"os/exec"
"os/user"
"path/filepath"
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
tempDir: options.TempDir,
lifecycleUpdate: make(chan struct{}, 1),
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
ignorePorts: options.IgnorePorts,
connStatsChan: make(chan *agentsdk.Stats, 1),
sshMaxTimeout: options.SSHMaxTimeout,
@@ -170,7 +172,7 @@ type agent struct {
lifecycleUpdate chan struct{}
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
lifecycleMu sync.RWMutex // Protects following.
lifecycleState codersdk.WorkspaceAgentLifecycle
lifecycleStates []agentsdk.PostLifecycleRequest
network *tailnet.Conn
connStatsChan chan *agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
}
}
// reportLifecycleLoop reports the current lifecycle state once.
// Only the latest state is reported, intermediate states may be
// lost if the agent can't communicate with the API.
// reportLifecycleLoop reports the current lifecycle state once. All state
// changes are reported in order.
func (a *agent) reportLifecycleLoop(ctx context.Context) {
var lastReported codersdk.WorkspaceAgentLifecycle
lastReportedIndex := 0 // Start off with the created state without reporting it.
for {
select {
case <-a.lifecycleUpdate:
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
a.lifecycleMu.RLock()
state := a.lifecycleState
lastIndex := len(a.lifecycleStates) - 1
report := a.lifecycleStates[lastReportedIndex]
if len(a.lifecycleStates) > lastReportedIndex+1 {
report = a.lifecycleStates[lastReportedIndex+1]
}
a.lifecycleMu.RUnlock()
if state == lastReported {
if lastIndex == lastReportedIndex {
break
}
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state))
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))
err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
State: state,
})
err := a.client.PostLifecycle(ctx, report)
if err == nil {
lastReported = state
lastReportedIndex++
select {
case a.lifecycleReported <- state:
case a.lifecycleReported <- report.State:
case <-a.lifecycleReported:
a.lifecycleReported <- state
a.lifecycleReported <- report.State
}
if lastReportedIndex < lastIndex {
// Keep reporting until we've sent all messages, we can't
// rely on the channel triggering us before the backlog is
// consumed.
continue
}
break
}
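
A standalone sketch of the ordered-reporting pattern the loop above
implements: every state change is appended to a slice and an index
advances only after a successful report, so intermediate states are no
longer dropped. The types are simplified stand-ins for
agentsdk.PostLifecycleRequest:

	package main

	import "fmt"

	// reporter keeps the full history of lifecycle states; lastReportedIndex
	// only moves forward once a report has been accepted.
	type reporter struct {
		states            []string
		lastReportedIndex int
	}

	// reportPending sends every not-yet-reported state in order, stopping at
	// the first failure so the same state is retried on the next attempt.
	func (r *reporter) reportPending(post func(state string) error) {
		for r.lastReportedIndex < len(r.states)-1 {
			next := r.states[r.lastReportedIndex+1]
			if err := post(next); err != nil {
				return // retry later; the index was not advanced
			}
			r.lastReportedIndex++
		}
	}

	func main() {
		r := &reporter{states: []string{"created", "starting", "ready"}}
		r.reportPending(func(s string) error {
			fmt.Println("reported:", s)
			return nil
		})
	}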
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
// The state is only updated if it's a valid state transition.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
report := agentsdk.PostLifecycleRequest{
State: state,
ChangedAt: database.Now(),
}
a.lifecycleMu.Lock()
lastState := a.lifecycleState
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
a.lifecycleMu.Unlock()
return
}
a.lifecycleState = state
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
a.lifecycleStates = append(a.lifecycleStates, report)
a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
a.lifecycleMu.Unlock()
select {
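
The guard above compares positions in a canonical ordering of lifecycle
states so the lifecycle can never move backwards. A minimal sketch of that
check; the order slice below is illustrative, the real list lives in
codersdk.WorkspaceAgentLifecycleOrder:

	package main

	import (
		"fmt"
		"slices"
	)

	// Illustrative ordering only; not the authoritative list from codersdk.
	var lifecycleOrder = []string{"created", "starting", "start_timeout", "start_error", "ready"}

	// validTransition mirrors the >= comparison above: a new state is only
	// accepted if it comes strictly after the last recorded state.
	func validTransition(last, next string) bool {
		return slices.Index(lifecycleOrder, last) < slices.Index(lifecycleOrder, next)
	}

	func main() {
		fmt.Println(validTransition("starting", "ready")) // true
		fmt.Println(validTransition("ready", "starting")) // false
	}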
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
scriptDone := make(chan error, 1)
scriptStart := time.Now()
err = a.trackConnGoroutine(func() {
defer close(scriptDone)
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "startup script timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
if errors.Is(err, context.Canceled) {
return
}
// Only log if there was a startup script.
if manifest.StartupScript != "" {
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
} else {
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
if err != nil {
if errors.Is(err, context.Canceled) {
return
}
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
}
a.setLifecycle(ctx, lifecycleState)
}()
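
One detail worth calling out in the timeout handling above: even after the
timeout fires, the agent keeps waiting on scriptDone so the final lifecycle
state reflects the script's real outcome. A standalone sketch of that
pattern with illustrative names:

	package main

	import (
		"fmt"
		"time"
	)

	// runWithTimeout reports a timeout via onTimeout when the deadline passes,
	// but still waits for the script so its eventual result is observed.
	func runWithTimeout(run func() error, d time.Duration, onTimeout func()) error {
		scriptDone := make(chan error, 1)
		go func() { scriptDone <- run() }()

		select {
		case err := <-scriptDone:
			return err
		case <-time.After(d):
			onTimeout()         // e.g. set the start_timeout lifecycle state
			return <-scriptDone // the script can still complete after a timeout
		}
	}

	func main() {
		err := runWithTimeout(func() error {
			time.Sleep(200 * time.Millisecond)
			return nil
		}, 50*time.Millisecond, func() { fmt.Println("timed out, still waiting") })
		fmt.Println("script result:", err)
	}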
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
return a.runScript(ctx, "shutdown", script)
}
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
if script == "" {
return nil
}
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
logger := a.logger.With(slog.F("lifecycle", lifecycle))
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
}
defer func() {
_ = fileWriter.Close()
err := fileWriter.Close()
if err != nil {
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
}
}()
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
}
cmd := cmdPty.AsExec()
var writer io.Writer = fileWriter
if lifecycle == "startup" {
// Create pipes for startup logs reader and writer
@@ -854,7 +871,7 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
writer = io.MultiWriter(fileWriter, logsWriter)
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
if err != nil {
return xerrors.Errorf("track script logs: %w", err)
return xerrors.Errorf("track %s script logs: %w", lifecycle, err)
}
defer func() {
_ = logsWriter.Close()
@@ -862,13 +879,26 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
}()
}
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
if err != nil {
return xerrors.Errorf("create command: %w", err)
}
cmd := cmdPty.AsExec()
cmd.Stdout = writer
cmd.Stderr = writer
start := time.Now()
defer func() {
end := time.Now()
execTime := end.Sub(start)
exitCode := 0
if err != nil {
exitCode = 255 // Unknown status.
var exitError *exec.ExitError
if xerrors.As(err, &exitError) {
exitCode = exitError.ExitCode()
}
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
} else {
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
}
}()
err = cmd.Run()
if err != nil {
// cmd.Run does not return a context canceled error, it returns "signal: killed".
@@ -876,7 +906,7 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
return ctx.Err()
}
return xerrors.Errorf("run: %w", err)
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
}
return nil
}
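
The deferred block above logs an exit code for the script. A self-contained
sketch of that extraction, using the standard library's errors.As in place
of xerrors.As (they behave identically here):

	package main

	import (
		"errors"
		"fmt"
		"os/exec"
	)

	// exitCodeOf mirrors the logic above: 0 on success, the process exit code
	// when one is available, and 255 as an "unknown status" fallback.
	func exitCodeOf(err error) int {
		if err == nil {
			return 0
		}
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			return exitErr.ExitCode()
		}
		return 255
	}

	func main() {
		err := exec.Command("sh", "-c", "exit 3").Run()
		fmt.Println(exitCodeOf(err)) // 3
	}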
@@ -929,11 +959,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if errors.As(err, &sdkErr) {
if sdkErr.StatusCode() == http.StatusRequestEntityTooLarge {
a.logger.Warn(ctx, "startup logs too large, dropping logs")
// Always send the EOF even if logs overflow.
if len(logsToSend) > 1 && logsToSend[len(logsToSend)-1].EOF {
logsToSend = logsToSend[len(logsToSend)-1:]
continue
}
break
}
}
@@ -986,11 +1011,6 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
if err := scanner.Err(); err != nil {
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
}
queueLog(agentsdk.StartupLog{
CreatedAt: database.Now(),
Output: "",
EOF: true,
})
defer close(logsFinished)
logsFlushed.L.Lock()
for {
@@ -1392,7 +1412,6 @@ func (a *agent) Close() error {
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
scriptDone := make(chan error, 1)
scriptStart := time.Now()
go func() {
defer close(scriptDone)
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
@@ -1411,16 +1430,12 @@ func (a *agent) Close() error {
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "shutdown script timed out")
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
} else {
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
}
}