feat(coderd/healthcheck): allow configuring database hc threshold (#10623)

* feat(coderd/healthcheck): allow configuring database hc threshold
* feat(coderd): add database hc latency, plumb through
* feat(coderd): allow configuring healthcheck refresh interval
This commit is contained in:
Cian Johnston
2023-11-13 14:14:43 +00:00
committed by GitHub
parent e4211ccb40
commit b69c237b8a
17 changed files with 288 additions and 55 deletions

17
coderd/apidoc/docs.go generated
View File

@ -8380,6 +8380,9 @@ const docTemplate = `{
"type": "string"
}
},
"healthcheck": {
"$ref": "#/definitions/codersdk.HealthcheckConfig"
},
"http_address": {
"description": "HTTPAddress is a string because it may be set to zero to disable.",
"type": "string"
@ -8859,6 +8862,17 @@ const docTemplate = `{
}
}
},
"codersdk.HealthcheckConfig": {
"type": "object",
"properties": {
"refresh": {
"type": "integer"
},
"threshold_database": {
"type": "integer"
}
}
},
"codersdk.InsightsReportInterval": {
"type": "string",
"enum": [
@ -12177,6 +12191,9 @@ const docTemplate = `{
},
"reachable": {
"type": "boolean"
},
"threshold_ms": {
"type": "integer"
}
}
},

View File

@ -7492,6 +7492,9 @@
"type": "string"
}
},
"healthcheck": {
"$ref": "#/definitions/codersdk.HealthcheckConfig"
},
"http_address": {
"description": "HTTPAddress is a string because it may be set to zero to disable.",
"type": "string"
@ -7961,6 +7964,17 @@
}
}
},
"codersdk.HealthcheckConfig": {
"type": "object",
"properties": {
"refresh": {
"type": "integer"
},
"threshold_database": {
"type": "integer"
}
}
},
"codersdk.InsightsReportInterval": {
"type": "string",
"enum": ["day", "week"],
@ -11102,6 +11116,9 @@
},
"reachable": {
"type": "boolean"
},
"threshold_ms": {
"type": "integer"
}
}
},

View File

@ -38,6 +38,7 @@ import (
// Used for swagger docs.
_ "github.com/coder/coder/v2/coderd/apidoc"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
"cdr.dev/slog"
"github.com/coder/coder/v2/buildinfo"
@ -398,10 +399,20 @@ func New(options *Options) *API {
if options.HealthcheckFunc == nil {
options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report {
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
DB: options.Database,
AccessURL: options.AccessURL,
DERPMap: api.DERPMap(),
APIKey: apiKey,
Database: healthcheck.DatabaseReportOptions{
DB: options.Database,
Threshold: options.DeploymentValues.Healthcheck.ThresholdDatabase.Value(),
},
Websocket: healthcheck.WebsocketReportOptions{
AccessURL: options.AccessURL,
APIKey: apiKey,
},
AccessURL: healthcheck.AccessURLReportOptions{
AccessURL: options.AccessURL,
},
DerpHealth: derphealth.ReportOptions{
DERPMap: api.DERPMap(),
},
})
}
}
@ -409,7 +420,7 @@ func New(options *Options) *API {
options.HealthcheckTimeout = 30 * time.Second
}
if options.HealthcheckRefresh == 0 {
options.HealthcheckRefresh = 10 * time.Minute
options.HealthcheckRefresh = options.DeploymentValues.Healthcheck.Refresh.Value()
}
var oidcAuthURLParams map[string]string

View File

@ -32,12 +32,12 @@ func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
// @Router /debug/health [get]
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
apiKey := httpmw.APITokenFromRequest(r)
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
defer cancel()
// Get cached report if it exists.
if report := api.healthCheckCache.Load(); report != nil {
if time.Since(report.Time) < api.HealthcheckRefresh {
if time.Since(report.Time) < api.Options.HealthcheckRefresh {
formatHealthcheck(ctx, rw, r, report)
return
}
@ -45,7 +45,7 @@ func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
// Create a new context not tied to the request.
ctx, cancel := context.WithTimeout(context.Background(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
defer cancel()
report := api.HealthcheckFunc(ctx, apiKey)

View File

@ -72,6 +72,51 @@ func TestDebugHealth(t *testing.T) {
require.Equal(t, http.StatusNotFound, res.StatusCode)
})
// Refresh checks that two consecutive requests to /api/v2/debug/health each
// invoke HealthcheckFunc when HealthcheckRefresh is set to a tiny interval.
// debugDeploymentHealth only serves the cached report while it is younger
// than HealthcheckRefresh, so a one-microsecond refresh is expected to force
// a fresh run on the second request.
t.Run("Refresh", func(t *testing.T) {
t.Parallel()
var (
// calls is unbuffered: every HealthcheckFunc invocation blocks until
// the goroutine below receives from it.
calls = make(chan struct{})
// callsDone is closed once two invocations have been observed.
callsDone = make(chan struct{})
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
client = coderdtest.New(t, &coderdtest.Options{
HealthcheckRefresh: time.Microsecond,
HealthcheckFunc: func(context.Context, string) *healthcheck.Report {
calls <- struct{}{}
return &healthcheck.Report{}
},
})
_ = coderdtest.CreateFirstUser(t, client)
)
defer cancel()
// Observer: accept the first invocation, pause for IntervalFast, then
// accept the second one before signalling completion.
go func() {
defer close(callsDone)
<-calls
<-time.After(testutil.IntervalFast)
<-calls
}()
// First request.
res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
// Read the body to completion; only the status code is asserted, so the
// read result is discarded.
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)
// Second request; the observer goroutine expects this one to reach
// HealthcheckFunc as well.
res, err = client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)
// Fail if both invocations were not observed before ctx expires.
select {
case <-callsDone:
case <-ctx.Done():
t.Fatal("timed out waiting for calls to finish")
}
})
t.Run("Deduplicated", func(t *testing.T) {
t.Parallel()

View File

@ -10,20 +10,30 @@ import (
"github.com/coder/coder/v2/coderd/database"
)
const (
// DatabaseDefaultThreshold is the latency threshold that
// DatabaseReport.Run falls back to when the configured threshold is zero
// once converted to whole milliseconds.
DatabaseDefaultThreshold = 15 * time.Millisecond
)
// @typescript-generate DatabaseReport
type DatabaseReport struct {
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMs int `json:"latency_ms"`
Error *string `json:"error"`
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMS int64 `json:"latency_ms"`
ThresholdMS int64 `json:"threshold_ms"`
Error *string `json:"error"`
}
type DatabaseReportOptions struct {
DB database.Store
DB database.Store
Threshold time.Duration
}
func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
r.ThresholdMS = opts.Threshold.Milliseconds()
if r.ThresholdMS == 0 {
r.ThresholdMS = DatabaseDefaultThreshold.Milliseconds()
}
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
@ -43,10 +53,8 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
// Take the median ping.
latency := pings[pingCount/2]
r.Latency = latency.String()
r.LatencyMs = int(latency.Milliseconds())
// Somewhat arbitrary, but if the latency is over 15ms, we consider it
// unhealthy.
if latency < 15*time.Millisecond {
r.LatencyMS = latency.Milliseconds()
if r.LatencyMS < r.ThresholdMS {
r.Healthy = true
}
r.Reachable = true

View File

@ -36,7 +36,8 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, ping.String(), report.Latency)
assert.Equal(t, int(ping.Milliseconds()), report.LatencyMs)
assert.Equal(t, ping.Milliseconds(), report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
@ -59,6 +60,7 @@ func TestDatabase(t *testing.T) {
assert.False(t, report.Reachable)
assert.Zero(t, report.Latency)
require.NotNil(t, report.Error)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Contains(t, *report.Error, err.Error())
})
@ -83,7 +85,34 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Millisecond.String(), report.Latency)
assert.Equal(t, 1, report.LatencyMs)
assert.EqualValues(t, 1, report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
// Threshold checks that a caller-supplied Threshold is reported back in
// ThresholdMS and that a latency equal to the threshold is reported as
// unhealthy (Run requires LatencyMS to be strictly below ThresholdMS).
t.Run("Threshold", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
report = healthcheck.DatabaseReport{}
db = dbmock.NewMockStore(gomock.NewController(t))
)
defer cancel()
// Five pings: three at 1s and two at 1ms, so the median is 1s.
// NOTE(review): five expectations presumably match the number of pings
// Run performs; confirm against pingCount in DatabaseReport.Run.
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
report.Run(ctx, &healthcheck.DatabaseReportOptions{DB: db, Threshold: time.Second})
// 1000ms latency is not below the 1000ms threshold: reachable but not
// healthy, and no error is recorded.
assert.False(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Second.String(), report.Latency)
assert.EqualValues(t, 1000, report.LatencyMS)
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
}

View File

@ -3,15 +3,10 @@ package healthcheck
import (
"context"
"fmt"
"net/http"
"net/url"
"sync"
"time"
"tailscale.com/tailcfg"
"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
"github.com/coder/coder/v2/coderd/util/ptr"
)
@ -49,12 +44,10 @@ type Report struct {
}
type ReportOptions struct {
DB database.Store
// TODO: support getting this over HTTP?
DERPMap *tailcfg.DERPMap
AccessURL *url.URL
Client *http.Client
APIKey string
AccessURL AccessURLReportOptions
Database DatabaseReportOptions
DerpHealth derphealth.ReportOptions
Websocket WebsocketReportOptions
Checker Checker
}
@ -100,9 +93,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.DERP = opts.Checker.DERP(ctx, &derphealth.ReportOptions{
DERPMap: opts.DERPMap,
})
report.DERP = opts.Checker.DERP(ctx, &opts.DerpHealth)
}()
wg.Add(1)
@ -114,10 +105,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.AccessURL = opts.Checker.AccessURL(ctx, &AccessURLReportOptions{
AccessURL: opts.AccessURL,
Client: opts.Client,
})
report.AccessURL = opts.Checker.AccessURL(ctx, &opts.AccessURL)
}()
wg.Add(1)
@ -129,10 +117,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.Websocket = opts.Checker.Websocket(ctx, &WebsocketReportOptions{
APIKey: opts.APIKey,
AccessURL: opts.AccessURL,
})
report.Websocket = opts.Checker.Websocket(ctx, &opts.Websocket)
}()
wg.Add(1)
@ -144,9 +129,7 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
}
}()
report.Database = opts.Checker.Database(ctx, &DatabaseReportOptions{
DB: opts.DB,
})
report.Database = opts.Checker.Database(ctx, &opts.Database)
}()
report.CoderVersion = buildinfo.Version()