mirror of
https://github.com/coder/coder.git
synced 2025-03-15 19:19:58 +00:00
Code that was in `/clock` has been moved to github.com/coder/quartz. This PR refactors our use of the clock library to point to the external Quartz repo.
192 lines
6.0 KiB
Go
192 lines
6.0 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"net/http"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
"github.com/coder/coder/v2/codersdk/agentsdk"
|
|
"github.com/coder/quartz"
|
|
)
|
|
|
|
// PostWorkspaceAgentAppHealth updates the workspace app health. The reporter built in this
// file calls it with the current health of every tracked app whenever a status changes, and
// logs (rather than propagates) any error it returns.
|
|
type PostWorkspaceAgentAppHealth func(context.Context, agentsdk.PostAppHealthsRequest) error
|
|
|
|
// WorkspaceAppHealthReporter is a function that checks and reports the health of the
// workspace apps until the passed context is canceled. Reporters built in this file return
// immediately when there are no apps or every app has health checks disabled.
|
|
type WorkspaceAppHealthReporter func(ctx context.Context)
|
|
|
|
// NewWorkspaceAppHealthReporter creates a WorkspaceAppHealthReporter that reports app health to coderd.
|
|
func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.WorkspaceApp, postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth) WorkspaceAppHealthReporter {
|
|
return NewAppHealthReporterWithClock(logger, apps, postWorkspaceAgentAppHealth, quartz.NewReal())
|
|
}
|
|
|
|
// NewAppHealthReporterWithClock is only called directly by test code. Product code should call
|
|
// NewAppHealthReporter.
|
|
func NewAppHealthReporterWithClock(
|
|
logger slog.Logger,
|
|
apps []codersdk.WorkspaceApp,
|
|
postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth,
|
|
clk quartz.Clock,
|
|
) WorkspaceAppHealthReporter {
|
|
logger = logger.Named("apphealth")
|
|
|
|
return func(ctx context.Context) {
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
// no need to run this loop if no apps for this workspace.
|
|
if len(apps) == 0 {
|
|
return
|
|
}
|
|
|
|
hasHealthchecksEnabled := false
|
|
health := make(map[uuid.UUID]codersdk.WorkspaceAppHealth, 0)
|
|
for _, app := range apps {
|
|
if app.Health == codersdk.WorkspaceAppHealthDisabled {
|
|
continue
|
|
}
|
|
health[app.ID] = app.Health
|
|
hasHealthchecksEnabled = true
|
|
}
|
|
|
|
// no need to run this loop if no health checks are configured.
|
|
if !hasHealthchecksEnabled {
|
|
return
|
|
}
|
|
|
|
// run a ticker for each app health check.
|
|
var mu sync.RWMutex
|
|
failures := make(map[uuid.UUID]int, 0)
|
|
for _, nextApp := range apps {
|
|
if !shouldStartTicker(nextApp) {
|
|
continue
|
|
}
|
|
app := nextApp
|
|
go func() {
|
|
_ = clk.TickerFunc(ctx, time.Duration(app.Healthcheck.Interval)*time.Second, func() error {
|
|
// We time out at the healthcheck interval to prevent getting too backed up, but
|
|
// set it 1ms early so that it's not simultaneous with the next tick in testing,
|
|
// which makes the test easier to understand.
|
|
//
|
|
// It would be idiomatic to use the http.Client.Timeout or a context.WithTimeout,
|
|
// but we are passing this off to the native http library, which is not aware
|
|
// of the clock library we are using. That means in testing, with a mock clock
|
|
// it will compare mocked times with real times, and we will get strange results.
|
|
// So, we just implement the timeout as a context we cancel with an AfterFunc
|
|
reqCtx, reqCancel := context.WithCancel(ctx)
|
|
timeout := clk.AfterFunc(
|
|
time.Duration(app.Healthcheck.Interval)*time.Second-time.Millisecond,
|
|
reqCancel,
|
|
"timeout", app.Slug)
|
|
defer timeout.Stop()
|
|
|
|
err := func() error {
|
|
req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, app.Healthcheck.URL, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
res, err := http.DefaultClient.Do(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// successful healthcheck is a non-5XX status code
|
|
_ = res.Body.Close()
|
|
if res.StatusCode >= http.StatusInternalServerError {
|
|
return xerrors.Errorf("error status code: %d", res.StatusCode)
|
|
}
|
|
|
|
return nil
|
|
}()
|
|
if err != nil {
|
|
nowUnhealthy := false
|
|
mu.Lock()
|
|
if failures[app.ID] < int(app.Healthcheck.Threshold) {
|
|
// increment the failure count and keep status the same.
|
|
// we will change it when we hit the threshold.
|
|
failures[app.ID]++
|
|
} else {
|
|
// set to unhealthy if we hit the failure threshold.
|
|
// we stop incrementing at the threshold to prevent the failure value from increasing forever.
|
|
health[app.ID] = codersdk.WorkspaceAppHealthUnhealthy
|
|
nowUnhealthy = true
|
|
}
|
|
mu.Unlock()
|
|
logger.Debug(ctx, "error checking app health",
|
|
slog.F("id", app.ID.String()),
|
|
slog.F("slug", app.Slug),
|
|
slog.F("now_unhealthy", nowUnhealthy), slog.Error(err),
|
|
)
|
|
} else {
|
|
mu.Lock()
|
|
// we only need one successful health check to be considered healthy.
|
|
health[app.ID] = codersdk.WorkspaceAppHealthHealthy
|
|
failures[app.ID] = 0
|
|
mu.Unlock()
|
|
logger.Debug(ctx, "workspace app healthy", slog.F("id", app.ID.String()), slog.F("slug", app.Slug))
|
|
}
|
|
return nil
|
|
}, "healthcheck", app.Slug)
|
|
}()
|
|
}
|
|
|
|
mu.Lock()
|
|
lastHealth := copyHealth(health)
|
|
mu.Unlock()
|
|
reportTicker := clk.TickerFunc(ctx, time.Second, func() error {
|
|
mu.RLock()
|
|
changed := healthChanged(lastHealth, health)
|
|
mu.RUnlock()
|
|
if !changed {
|
|
return nil
|
|
}
|
|
|
|
mu.Lock()
|
|
lastHealth = copyHealth(health)
|
|
mu.Unlock()
|
|
err := postWorkspaceAgentAppHealth(ctx, agentsdk.PostAppHealthsRequest{
|
|
Healths: lastHealth,
|
|
})
|
|
if err != nil {
|
|
logger.Error(ctx, "failed to report workspace app health", slog.Error(err))
|
|
} else {
|
|
logger.Debug(ctx, "sent workspace app health", slog.F("health", lastHealth))
|
|
}
|
|
return nil
|
|
}, "report")
|
|
_ = reportTicker.Wait() // only possible error is context done
|
|
}
|
|
}
|
|
|
|
func shouldStartTicker(app codersdk.WorkspaceApp) bool {
|
|
return app.Healthcheck.URL != "" && app.Healthcheck.Interval > 0 && app.Healthcheck.Threshold > 0
|
|
}
|
|
|
|
func healthChanged(old map[uuid.UUID]codersdk.WorkspaceAppHealth, new map[uuid.UUID]codersdk.WorkspaceAppHealth) bool {
|
|
for name, newValue := range new {
|
|
oldValue, found := old[name]
|
|
if !found {
|
|
return true
|
|
}
|
|
if newValue != oldValue {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func copyHealth(h1 map[uuid.UUID]codersdk.WorkspaceAppHealth) map[uuid.UUID]codersdk.WorkspaceAppHealth {
|
|
h2 := make(map[uuid.UUID]codersdk.WorkspaceAppHealth, 0)
|
|
for k, v := range h1 {
|
|
h2[k] = v
|
|
}
|
|
|
|
return h2
|
|
}
|