mirror of
https://github.com/coder/coder.git
synced 2025-07-08 11:39:50 +00:00
feat(coderd/healthcheck): allow configuring database hc threshold (#10623)
* feat(coderd/healthcheck): allow configuring database hc threshold
* feat(coderd): add database hc latency, plumb through
* feat(coderd): allow configuring healthcheck refresh interval
This commit is contained in:
17
coderd/apidoc/docs.go
generated
17
coderd/apidoc/docs.go
generated
@ -8380,6 +8380,9 @@ const docTemplate = `{
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"healthcheck": {
|
||||
"$ref": "#/definitions/codersdk.HealthcheckConfig"
|
||||
},
|
||||
"http_address": {
|
||||
"description": "HTTPAddress is a string because it may be set to zero to disable.",
|
||||
"type": "string"
|
||||
@ -8859,6 +8862,17 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"codersdk.HealthcheckConfig": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"refresh": {
|
||||
"type": "integer"
|
||||
},
|
||||
"threshold_database": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"codersdk.InsightsReportInterval": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
@ -12177,6 +12191,9 @@ const docTemplate = `{
|
||||
},
|
||||
"reachable": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"threshold_ms": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
17
coderd/apidoc/swagger.json
generated
17
coderd/apidoc/swagger.json
generated
@ -7492,6 +7492,9 @@
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"healthcheck": {
|
||||
"$ref": "#/definitions/codersdk.HealthcheckConfig"
|
||||
},
|
||||
"http_address": {
|
||||
"description": "HTTPAddress is a string because it may be set to zero to disable.",
|
||||
"type": "string"
|
||||
@ -7961,6 +7964,17 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"codersdk.HealthcheckConfig": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"refresh": {
|
||||
"type": "integer"
|
||||
},
|
||||
"threshold_database": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
"codersdk.InsightsReportInterval": {
|
||||
"type": "string",
|
||||
"enum": ["day", "week"],
|
||||
@ -11102,6 +11116,9 @@
|
||||
},
|
||||
"reachable": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"threshold_ms": {
|
||||
"type": "integer"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -38,6 +38,7 @@ import (
|
||||
// Used for swagger docs.
|
||||
_ "github.com/coder/coder/v2/coderd/apidoc"
|
||||
"github.com/coder/coder/v2/coderd/externalauth"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
|
||||
|
||||
"cdr.dev/slog"
|
||||
"github.com/coder/coder/v2/buildinfo"
|
||||
@ -398,10 +399,20 @@ func New(options *Options) *API {
|
||||
if options.HealthcheckFunc == nil {
|
||||
options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report {
|
||||
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
|
||||
DB: options.Database,
|
||||
AccessURL: options.AccessURL,
|
||||
DERPMap: api.DERPMap(),
|
||||
APIKey: apiKey,
|
||||
Database: healthcheck.DatabaseReportOptions{
|
||||
DB: options.Database,
|
||||
Threshold: options.DeploymentValues.Healthcheck.ThresholdDatabase.Value(),
|
||||
},
|
||||
Websocket: healthcheck.WebsocketReportOptions{
|
||||
AccessURL: options.AccessURL,
|
||||
APIKey: apiKey,
|
||||
},
|
||||
AccessURL: healthcheck.AccessURLReportOptions{
|
||||
AccessURL: options.AccessURL,
|
||||
},
|
||||
DerpHealth: derphealth.ReportOptions{
|
||||
DERPMap: api.DERPMap(),
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -409,7 +420,7 @@ func New(options *Options) *API {
|
||||
options.HealthcheckTimeout = 30 * time.Second
|
||||
}
|
||||
if options.HealthcheckRefresh == 0 {
|
||||
options.HealthcheckRefresh = 10 * time.Minute
|
||||
options.HealthcheckRefresh = options.DeploymentValues.Healthcheck.Refresh.Value()
|
||||
}
|
||||
|
||||
var oidcAuthURLParams map[string]string
|
||||
|
@ -32,12 +32,12 @@ func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
|
||||
// @Router /debug/health [get]
|
||||
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
apiKey := httpmw.APITokenFromRequest(r)
|
||||
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
|
||||
ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
|
||||
defer cancel()
|
||||
|
||||
// Get cached report if it exists.
|
||||
if report := api.healthCheckCache.Load(); report != nil {
|
||||
if time.Since(report.Time) < api.HealthcheckRefresh {
|
||||
if time.Since(report.Time) < api.Options.HealthcheckRefresh {
|
||||
formatHealthcheck(ctx, rw, r, report)
|
||||
return
|
||||
}
|
||||
@ -45,7 +45,7 @@ func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
|
||||
|
||||
resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
|
||||
// Create a new context not tied to the request.
|
||||
ctx, cancel := context.WithTimeout(context.Background(), api.HealthcheckTimeout)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
|
||||
defer cancel()
|
||||
|
||||
report := api.HealthcheckFunc(ctx, apiKey)
|
||||
|
@ -72,6 +72,51 @@ func TestDebugHealth(t *testing.T) {
|
||||
require.Equal(t, http.StatusNotFound, res.StatusCode)
|
||||
})
|
||||
|
||||
t.Run("Refresh", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
calls = make(chan struct{})
|
||||
callsDone = make(chan struct{})
|
||||
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
|
||||
client = coderdtest.New(t, &coderdtest.Options{
|
||||
HealthcheckRefresh: time.Microsecond,
|
||||
HealthcheckFunc: func(context.Context, string) *healthcheck.Report {
|
||||
calls <- struct{}{}
|
||||
return &healthcheck.Report{}
|
||||
},
|
||||
})
|
||||
_ = coderdtest.CreateFirstUser(t, client)
|
||||
)
|
||||
|
||||
defer cancel()
|
||||
|
||||
go func() {
|
||||
defer close(callsDone)
|
||||
<-calls
|
||||
<-time.After(testutil.IntervalFast)
|
||||
<-calls
|
||||
}()
|
||||
|
||||
res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
|
||||
require.NoError(t, err)
|
||||
defer res.Body.Close()
|
||||
_, _ = io.ReadAll(res.Body)
|
||||
require.Equal(t, http.StatusOK, res.StatusCode)
|
||||
|
||||
res, err = client.Request(ctx, "GET", "/api/v2/debug/health", nil)
|
||||
require.NoError(t, err)
|
||||
defer res.Body.Close()
|
||||
_, _ = io.ReadAll(res.Body)
|
||||
require.Equal(t, http.StatusOK, res.StatusCode)
|
||||
|
||||
select {
|
||||
case <-callsDone:
|
||||
case <-ctx.Done():
|
||||
t.Fatal("timed out waiting for calls to finish")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("Deduplicated", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
@ -10,20 +10,30 @@ import (
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
)
|
||||
|
||||
const (
|
||||
DatabaseDefaultThreshold = 15 * time.Millisecond
|
||||
)
|
||||
|
||||
// @typescript-generate DatabaseReport
|
||||
type DatabaseReport struct {
|
||||
Healthy bool `json:"healthy"`
|
||||
Reachable bool `json:"reachable"`
|
||||
Latency string `json:"latency"`
|
||||
LatencyMs int `json:"latency_ms"`
|
||||
Error *string `json:"error"`
|
||||
Healthy bool `json:"healthy"`
|
||||
Reachable bool `json:"reachable"`
|
||||
Latency string `json:"latency"`
|
||||
LatencyMS int64 `json:"latency_ms"`
|
||||
ThresholdMS int64 `json:"threshold_ms"`
|
||||
Error *string `json:"error"`
|
||||
}
|
||||
|
||||
type DatabaseReportOptions struct {
|
||||
DB database.Store
|
||||
DB database.Store
|
||||
Threshold time.Duration
|
||||
}
|
||||
|
||||
func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
|
||||
r.ThresholdMS = opts.Threshold.Milliseconds()
|
||||
if r.ThresholdMS == 0 {
|
||||
r.ThresholdMS = DatabaseDefaultThreshold.Milliseconds()
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
@ -43,10 +53,8 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
|
||||
// Take the median ping.
|
||||
latency := pings[pingCount/2]
|
||||
r.Latency = latency.String()
|
||||
r.LatencyMs = int(latency.Milliseconds())
|
||||
// Somewhat arbitrary, but if the latency is over 15ms, we consider it
|
||||
// unhealthy.
|
||||
if latency < 15*time.Millisecond {
|
||||
r.LatencyMS = latency.Milliseconds()
|
||||
if r.LatencyMS < r.ThresholdMS {
|
||||
r.Healthy = true
|
||||
}
|
||||
r.Reachable = true
|
||||
|
@ -36,7 +36,8 @@ func TestDatabase(t *testing.T) {
|
||||
assert.True(t, report.Healthy)
|
||||
assert.True(t, report.Reachable)
|
||||
assert.Equal(t, ping.String(), report.Latency)
|
||||
assert.Equal(t, int(ping.Milliseconds()), report.LatencyMs)
|
||||
assert.Equal(t, ping.Milliseconds(), report.LatencyMS)
|
||||
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
|
||||
assert.Nil(t, report.Error)
|
||||
})
|
||||
|
||||
@ -59,6 +60,7 @@ func TestDatabase(t *testing.T) {
|
||||
assert.False(t, report.Reachable)
|
||||
assert.Zero(t, report.Latency)
|
||||
require.NotNil(t, report.Error)
|
||||
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
|
||||
assert.Contains(t, *report.Error, err.Error())
|
||||
})
|
||||
|
||||
@ -83,7 +85,34 @@ func TestDatabase(t *testing.T) {
|
||||
assert.True(t, report.Healthy)
|
||||
assert.True(t, report.Reachable)
|
||||
assert.Equal(t, time.Millisecond.String(), report.Latency)
|
||||
assert.Equal(t, 1, report.LatencyMs)
|
||||
assert.EqualValues(t, 1, report.LatencyMS)
|
||||
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
|
||||
assert.Nil(t, report.Error)
|
||||
})
|
||||
|
||||
t.Run("Threshold", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
|
||||
report = healthcheck.DatabaseReport{}
|
||||
db = dbmock.NewMockStore(gomock.NewController(t))
|
||||
)
|
||||
defer cancel()
|
||||
|
||||
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
|
||||
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
|
||||
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
|
||||
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
|
||||
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
|
||||
|
||||
report.Run(ctx, &healthcheck.DatabaseReportOptions{DB: db, Threshold: time.Second})
|
||||
|
||||
assert.False(t, report.Healthy)
|
||||
assert.True(t, report.Reachable)
|
||||
assert.Equal(t, time.Second.String(), report.Latency)
|
||||
assert.EqualValues(t, 1000, report.LatencyMS)
|
||||
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
|
||||
assert.Nil(t, report.Error)
|
||||
})
|
||||
}
|
||||
|
@ -3,15 +3,10 @@ package healthcheck
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"tailscale.com/tailcfg"
|
||||
|
||||
"github.com/coder/coder/v2/buildinfo"
|
||||
"github.com/coder/coder/v2/coderd/database"
|
||||
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
|
||||
"github.com/coder/coder/v2/coderd/util/ptr"
|
||||
)
|
||||
@ -49,12 +44,10 @@ type Report struct {
|
||||
}
|
||||
|
||||
type ReportOptions struct {
|
||||
DB database.Store
|
||||
// TODO: support getting this over HTTP?
|
||||
DERPMap *tailcfg.DERPMap
|
||||
AccessURL *url.URL
|
||||
Client *http.Client
|
||||
APIKey string
|
||||
AccessURL AccessURLReportOptions
|
||||
Database DatabaseReportOptions
|
||||
DerpHealth derphealth.ReportOptions
|
||||
Websocket WebsocketReportOptions
|
||||
|
||||
Checker Checker
|
||||
}
|
||||
@ -100,9 +93,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
||||
}
|
||||
}()
|
||||
|
||||
report.DERP = opts.Checker.DERP(ctx, &derphealth.ReportOptions{
|
||||
DERPMap: opts.DERPMap,
|
||||
})
|
||||
report.DERP = opts.Checker.DERP(ctx, &opts.DerpHealth)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
@ -114,10 +105,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
||||
}
|
||||
}()
|
||||
|
||||
report.AccessURL = opts.Checker.AccessURL(ctx, &AccessURLReportOptions{
|
||||
AccessURL: opts.AccessURL,
|
||||
Client: opts.Client,
|
||||
})
|
||||
report.AccessURL = opts.Checker.AccessURL(ctx, &opts.AccessURL)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
@ -129,10 +117,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
||||
}
|
||||
}()
|
||||
|
||||
report.Websocket = opts.Checker.Websocket(ctx, &WebsocketReportOptions{
|
||||
APIKey: opts.APIKey,
|
||||
AccessURL: opts.AccessURL,
|
||||
})
|
||||
report.Websocket = opts.Checker.Websocket(ctx, &opts.Websocket)
|
||||
}()
|
||||
|
||||
wg.Add(1)
|
||||
@ -144,9 +129,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
|
||||
}
|
||||
}()
|
||||
|
||||
report.Database = opts.Checker.Database(ctx, &DatabaseReportOptions{
|
||||
DB: opts.DB,
|
||||
})
|
||||
report.Database = opts.Checker.Database(ctx, &opts.Database)
|
||||
}()
|
||||
|
||||
report.CoderVersion = buildinfo.Version()
|
||||
|
Reference in New Issue
Block a user