feat(coderd): add DERP healthcheck (#6936)

This commit is contained in:
Colin Adler
2023-04-03 01:28:42 -05:00
committed by GitHub
parent f4d16a1ae5
commit 7738274b3e
14 changed files with 1925 additions and 5 deletions

222
coderd/apidoc/docs.go generated
View File

@ -359,6 +359,31 @@ const docTemplate = `{
}
}
},
"/debug/health": {
"get": {
"security": [
{
"CoderSessionToken": []
}
],
"produces": [
"application/json"
],
"tags": [
"Debug"
],
"summary": "Debug Info Deployment Health",
"operationId": "debug-info-deployment-health",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/healthcheck.Report"
}
}
}
}
},
"/deployment/config": {
"get": {
"security": [
@ -9487,6 +9512,203 @@ const docTemplate = `{
"ParameterSourceSchemeData"
]
},
"healthcheck.DERPNodeReport": {
"type": "object",
"properties": {
"can_exchange_messages": {
"type": "boolean"
},
"client_errs": {
"type": "array",
"items": {
"type": "array",
"items": {}
}
},
"client_logs": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string"
}
}
},
"healthy": {
"type": "boolean"
},
"node": {
"$ref": "#/definitions/tailcfg.DERPNode"
},
"round_trip_ping": {
"type": "integer"
},
"stun": {
"$ref": "#/definitions/healthcheck.DERPStunReport"
},
"uses_websocket": {
"type": "boolean"
}
}
},
"healthcheck.DERPRegionReport": {
"type": "object",
"properties": {
"healthy": {
"type": "boolean"
},
"node_reports": {
"type": "array",
"items": {
"$ref": "#/definitions/healthcheck.DERPNodeReport"
}
},
"region": {
"$ref": "#/definitions/tailcfg.DERPRegion"
}
}
},
"healthcheck.DERPReport": {
"type": "object",
"properties": {
"healthy": {
"type": "boolean"
},
"netcheck": {
"$ref": "#/definitions/netcheck.Report"
},
"netcheck_logs": {
"type": "array",
"items": {
"type": "string"
}
},
"regions": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/healthcheck.DERPRegionReport"
}
}
}
},
"healthcheck.DERPStunReport": {
"type": "object",
"properties": {
"canSTUN": {
"type": "boolean"
},
"enabled": {
"type": "boolean"
},
"error": {}
}
},
"healthcheck.Report": {
"type": "object",
"properties": {
"derp": {
"$ref": "#/definitions/healthcheck.DERPReport"
},
"pass": {
"description": "Healthy is true if the report returns no errors.",
"type": "boolean"
},
"time": {
"description": "Time is the time the report was generated at.",
"type": "string"
}
}
},
"netcheck.Report": {
"type": "object",
"properties": {
"captivePortal": {
"description": "CaptivePortal is set when we think there's a captive portal that is\nintercepting HTTP traffic.",
"type": "string"
},
"globalV4": {
"description": "ip:port of global IPv4",
"type": "string"
},
"globalV6": {
"description": "[ip]:port of global IPv6",
"type": "string"
},
"hairPinning": {
"description": "HairPinning is whether the router supports communicating\nbetween two local devices through the NATted public IP address\n(on IPv4).",
"type": "string"
},
"icmpv4": {
"description": "an ICMPv4 round trip completed",
"type": "boolean"
},
"ipv4": {
"description": "an IPv4 STUN round trip completed",
"type": "boolean"
},
"ipv4CanSend": {
"description": "an IPv4 packet was able to be sent",
"type": "boolean"
},
"ipv6": {
"description": "an IPv6 STUN round trip completed",
"type": "boolean"
},
"ipv6CanSend": {
"description": "an IPv6 packet was able to be sent",
"type": "boolean"
},
"mappingVariesByDestIP": {
"description": "MappingVariesByDestIP is whether STUN results depend which\nSTUN server you're talking to (on IPv4).",
"type": "string"
},
"oshasIPv6": {
"description": "could bind a socket to ::1",
"type": "boolean"
},
"pcp": {
"description": "PCP is whether PCP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
},
"pmp": {
"description": "PMP is whether NAT-PMP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
},
"preferredDERP": {
"description": "or 0 for unknown",
"type": "integer"
},
"regionLatency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"regionV4Latency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"regionV6Latency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"udp": {
"description": "a UDP STUN round trip completed",
"type": "boolean"
},
"upnP": {
"description": "UPnP is whether UPnP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
}
}
},
"parameter.ComputedValue": {
"type": "object",
"properties": {

View File

@ -305,6 +305,27 @@
}
}
},
"/debug/health": {
"get": {
"security": [
{
"CoderSessionToken": []
}
],
"produces": ["application/json"],
"tags": ["Debug"],
"summary": "Debug Info Deployment Health",
"operationId": "debug-info-deployment-health",
"responses": {
"200": {
"description": "OK",
"schema": {
"$ref": "#/definitions/healthcheck.Report"
}
}
}
}
},
"/deployment/config": {
"get": {
"security": [
@ -8564,6 +8585,203 @@
"ParameterSourceSchemeData"
]
},
"healthcheck.DERPNodeReport": {
"type": "object",
"properties": {
"can_exchange_messages": {
"type": "boolean"
},
"client_errs": {
"type": "array",
"items": {
"type": "array",
"items": {}
}
},
"client_logs": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string"
}
}
},
"healthy": {
"type": "boolean"
},
"node": {
"$ref": "#/definitions/tailcfg.DERPNode"
},
"round_trip_ping": {
"type": "integer"
},
"stun": {
"$ref": "#/definitions/healthcheck.DERPStunReport"
},
"uses_websocket": {
"type": "boolean"
}
}
},
"healthcheck.DERPRegionReport": {
"type": "object",
"properties": {
"healthy": {
"type": "boolean"
},
"node_reports": {
"type": "array",
"items": {
"$ref": "#/definitions/healthcheck.DERPNodeReport"
}
},
"region": {
"$ref": "#/definitions/tailcfg.DERPRegion"
}
}
},
"healthcheck.DERPReport": {
"type": "object",
"properties": {
"healthy": {
"type": "boolean"
},
"netcheck": {
"$ref": "#/definitions/netcheck.Report"
},
"netcheck_logs": {
"type": "array",
"items": {
"type": "string"
}
},
"regions": {
"type": "object",
"additionalProperties": {
"$ref": "#/definitions/healthcheck.DERPRegionReport"
}
}
}
},
"healthcheck.DERPStunReport": {
"type": "object",
"properties": {
"canSTUN": {
"type": "boolean"
},
"enabled": {
"type": "boolean"
},
"error": {}
}
},
"healthcheck.Report": {
"type": "object",
"properties": {
"derp": {
"$ref": "#/definitions/healthcheck.DERPReport"
},
"pass": {
"description": "Healthy is true if the report returns no errors.",
"type": "boolean"
},
"time": {
"description": "Time is the time the report was generated at.",
"type": "string"
}
}
},
"netcheck.Report": {
"type": "object",
"properties": {
"captivePortal": {
"description": "CaptivePortal is set when we think there's a captive portal that is\nintercepting HTTP traffic.",
"type": "string"
},
"globalV4": {
"description": "ip:port of global IPv4",
"type": "string"
},
"globalV6": {
"description": "[ip]:port of global IPv6",
"type": "string"
},
"hairPinning": {
"description": "HairPinning is whether the router supports communicating\nbetween two local devices through the NATted public IP address\n(on IPv4).",
"type": "string"
},
"icmpv4": {
"description": "an ICMPv4 round trip completed",
"type": "boolean"
},
"ipv4": {
"description": "an IPv4 STUN round trip completed",
"type": "boolean"
},
"ipv4CanSend": {
"description": "an IPv4 packet was able to be sent",
"type": "boolean"
},
"ipv6": {
"description": "an IPv6 STUN round trip completed",
"type": "boolean"
},
"ipv6CanSend": {
"description": "an IPv6 packet was able to be sent",
"type": "boolean"
},
"mappingVariesByDestIP": {
"description": "MappingVariesByDestIP is whether STUN results depend which\nSTUN server you're talking to (on IPv4).",
"type": "string"
},
"oshasIPv6": {
"description": "could bind a socket to ::1",
"type": "boolean"
},
"pcp": {
"description": "PCP is whether PCP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
},
"pmp": {
"description": "PMP is whether NAT-PMP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
},
"preferredDERP": {
"description": "or 0 for unknown",
"type": "integer"
},
"regionLatency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"regionV4Latency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"regionV6Latency": {
"description": "keyed by DERP Region ID",
"type": "object",
"additionalProperties": {
"type": "integer"
}
},
"udp": {
"description": "a UDP STUN round trip completed",
"type": "boolean"
},
"upnP": {
"description": "UPnP is whether UPnP appears present on the LAN.\nEmpty means not checked.",
"type": "string"
}
}
},
"parameter.ComputedValue": {
"type": "object",
"properties": {

View File

@ -33,6 +33,7 @@ import (
"tailscale.com/derp/derphttp"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/util/singleflight"
"cdr.dev/slog"
"github.com/coder/coder/buildinfo"
@ -46,6 +47,7 @@ import (
"github.com/coder/coder/coderd/database/dbtype"
"github.com/coder/coder/coderd/gitauth"
"github.com/coder/coder/coderd/gitsshkey"
"github.com/coder/coder/coderd/healthcheck"
"github.com/coder/coder/coderd/httpapi"
"github.com/coder/coder/coderd/httpmw"
"github.com/coder/coder/coderd/metricscache"
@ -123,7 +125,10 @@ type Options struct {
TemplateScheduleStore schedule.TemplateScheduleStore
// AppSigningKey denotes the symmetric key to use for signing app tickets.
// The key must be 64 bytes long.
AppSigningKey []byte
AppSigningKey []byte
HealthcheckFunc func(ctx context.Context) (*healthcheck.Report, error)
HealthcheckTimeout time.Duration
HealthcheckRefresh time.Duration
// APIRateLimit is the minutely throughput rate limit per user or ip.
// Setting a rate limit <0 will disable the rate limiter across the entire
@ -235,6 +240,19 @@ func New(options *Options) *API {
if len(options.AppSigningKey) != 64 {
panic("coderd: AppSigningKey must be 64 bytes long")
}
if options.HealthcheckFunc == nil {
options.HealthcheckFunc = func(ctx context.Context) (*healthcheck.Report, error) {
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
DERPMap: options.DERPMap.Clone(),
})
}
}
if options.HealthcheckTimeout == 0 {
options.HealthcheckTimeout = 30 * time.Second
}
if options.HealthcheckRefresh == 0 {
options.HealthcheckRefresh = 10 * time.Minute
}
siteCacheDir := options.CacheDir
if siteCacheDir != "" {
@ -293,6 +311,7 @@ func New(options *Options) *API {
Auditor: atomic.Pointer[audit.Auditor]{},
TemplateScheduleStore: atomic.Pointer[schedule.TemplateScheduleStore]{},
Experiments: experiments,
healthCheckGroup: &singleflight.Group[string, *healthcheck.Report]{},
}
if options.UpdateCheckOptions != nil {
api.updateChecker = updatecheck.New(
@ -718,6 +737,7 @@ func New(options *Options) *API {
)
r.Get("/coordinator", api.debugCoordinator)
r.Get("/health", api.debugDeploymentHealth)
})
})
@ -773,6 +793,8 @@ type API struct {
// Experiments contains the list of experiments currently enabled.
// This is used to gate features that are not yet ready for production.
Experiments codersdk.Experiments
healthCheckGroup *singleflight.Group[string, *healthcheck.Report]
}
// Close waits for all WebSocket connections to drain before returning.

View File

@ -60,6 +60,7 @@ import (
"github.com/coder/coder/coderd/database/dbtestutil"
"github.com/coder/coder/coderd/gitauth"
"github.com/coder/coder/coderd/gitsshkey"
"github.com/coder/coder/coderd/healthcheck"
"github.com/coder/coder/coderd/httpapi"
"github.com/coder/coder/coderd/httpmw"
"github.com/coder/coder/coderd/rbac"
@ -105,6 +106,10 @@ type Options struct {
TrialGenerator func(context.Context, string) error
TemplateScheduleStore schedule.TemplateScheduleStore
HealthcheckFunc func(ctx context.Context) (*healthcheck.Report, error)
HealthcheckTimeout time.Duration
HealthcheckRefresh time.Duration
// All rate limits default to -1 (unlimited) in tests if not set.
APIRateLimit int
LoginRateLimit int
@ -335,6 +340,9 @@ func NewOptions(t *testing.T, options *Options) (func(http.Handler), context.Can
SwaggerEndpoint: options.SwaggerEndpoint,
AppSigningKey: AppSigningKey,
SSHConfig: options.ConfigSSH,
HealthcheckFunc: options.HealthcheckFunc,
HealthcheckTimeout: options.HealthcheckTimeout,
HealthcheckRefresh: options.HealthcheckRefresh,
}
}

View File

@ -1,6 +1,14 @@
package coderd
import "net/http"
import (
"context"
"net/http"
"time"
"github.com/coder/coder/coderd/healthcheck"
"github.com/coder/coder/coderd/httpapi"
"github.com/coder/coder/codersdk"
)
// @Summary Debug Info Wireguard Coordinator
// @ID debug-info-wireguard-coordinator
@ -12,3 +20,36 @@ import "net/http"
func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
(*api.TailnetCoordinator.Load()).ServeHTTPDebug(rw, r)
}
// @Summary Debug Info Deployment Health
// @ID debug-info-deployment-health
// @Security CoderSessionToken
// @Produce json
// @Tags Debug
// @Success 200 {object} healthcheck.Report
// @Router /debug/health [get]
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
defer cancel()
resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
return api.HealthcheckFunc(ctx)
})
select {
case <-ctx.Done():
httpapi.Write(ctx, rw, http.StatusNotFound, codersdk.Response{
Message: "Healthcheck is in progress and did not complete in time. Try again in a few seconds.",
})
return
case res := <-resChan:
if time.Since(res.Val.Time) > api.HealthcheckRefresh {
api.healthCheckGroup.Forget("")
api.debugDeploymentHealth(rw, r)
return
}
httpapi.Write(ctx, rw, http.StatusOK, res.Val)
return
}
}

69
coderd/debug_test.go Normal file
View File

@ -0,0 +1,69 @@
package coderd_test
import (
"context"
"io"
"net/http"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/coder/coder/coderd/coderdtest"
"github.com/coder/coder/coderd/healthcheck"
"github.com/coder/coder/testutil"
)
func TestDebug(t *testing.T) {
t.Parallel()
t.Run("Health/OK", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
client = coderdtest.New(t, &coderdtest.Options{
HealthcheckFunc: func(context.Context) (*healthcheck.Report, error) {
return &healthcheck.Report{}, nil
},
})
_ = coderdtest.CreateFirstUser(t, client)
)
defer cancel()
res, err := client.Request(ctx, "GET", "/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)
})
t.Run("Health/Timeout", func(t *testing.T) {
t.Parallel()
var (
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
client = coderdtest.New(t, &coderdtest.Options{
HealthcheckTimeout: time.Microsecond,
HealthcheckFunc: func(context.Context) (*healthcheck.Report, error) {
t := time.NewTimer(time.Second)
defer t.Stop()
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-t.C:
return &healthcheck.Report{}, nil
}
},
})
_ = coderdtest.CreateFirstUser(t, client)
)
defer cancel()
res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusNotFound, res.StatusCode)
})
}

398
coderd/healthcheck/derp.go Normal file
View File

@ -0,0 +1,398 @@
package healthcheck
import (
"context"
"errors"
"fmt"
"io"
"net"
"net/netip"
"net/url"
"strings"
"sync"
"sync/atomic"
"time"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"tailscale.com/derp"
"tailscale.com/derp/derphttp"
"tailscale.com/net/netcheck"
"tailscale.com/net/portmapper"
"tailscale.com/prober"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
tslogger "tailscale.com/types/logger"
)
type DERPReport struct {
mu sync.Mutex
Healthy bool `json:"healthy"`
Regions map[int]*DERPRegionReport `json:"regions"`
Netcheck *netcheck.Report `json:"netcheck"`
NetcheckLogs []string `json:"netcheck_logs"`
}
type DERPRegionReport struct {
mu sync.Mutex
Healthy bool `json:"healthy"`
Region *tailcfg.DERPRegion `json:"region"`
NodeReports []*DERPNodeReport `json:"node_reports"`
}
type DERPNodeReport struct {
mu sync.Mutex
clientCounter int
Healthy bool `json:"healthy"`
Node *tailcfg.DERPNode `json:"node"`
CanExchangeMessages bool `json:"can_exchange_messages"`
RoundTripPing time.Duration `json:"round_trip_ping"`
UsesWebsocket bool `json:"uses_websocket"`
ClientLogs [][]string `json:"client_logs"`
ClientErrs [][]error `json:"client_errs"`
STUN DERPStunReport `json:"stun"`
}
type DERPStunReport struct {
Enabled bool
CanSTUN bool
Error error
}
type DERPReportOptions struct {
DERPMap *tailcfg.DERPMap
}
func (r *DERPReport) Run(ctx context.Context, opts *DERPReportOptions) error {
r.Healthy = true
r.Regions = map[int]*DERPRegionReport{}
eg, ctx := errgroup.WithContext(ctx)
for _, region := range opts.DERPMap.Regions {
region := region
eg.Go(func() error {
regionReport := DERPRegionReport{
Region: region,
}
err := regionReport.Run(ctx)
if err != nil {
return xerrors.Errorf("run region report: %w", err)
}
r.mu.Lock()
r.Regions[region.RegionID] = &regionReport
if !regionReport.Healthy {
r.Healthy = false
}
r.mu.Unlock()
return nil
})
}
ncLogf := func(format string, args ...interface{}) {
r.mu.Lock()
r.NetcheckLogs = append(r.NetcheckLogs, fmt.Sprintf(format, args...))
r.mu.Unlock()
}
nc := &netcheck.Client{
PortMapper: portmapper.NewClient(tslogger.WithPrefix(ncLogf, "portmap: "), nil),
Logf: tslogger.WithPrefix(ncLogf, "netcheck: "),
}
ncReport, err := nc.GetReport(ctx, opts.DERPMap)
if err != nil {
return xerrors.Errorf("run netcheck: %w", err)
}
r.Netcheck = ncReport
return eg.Wait()
}
func (r *DERPRegionReport) Run(ctx context.Context) error {
r.Healthy = true
r.NodeReports = []*DERPNodeReport{}
eg, ctx := errgroup.WithContext(ctx)
for _, node := range r.Region.Nodes {
node := node
eg.Go(func() error {
nodeReport := DERPNodeReport{
Node: node,
Healthy: true,
}
err := nodeReport.Run(ctx)
if err != nil {
return xerrors.Errorf("run node report: %w", err)
}
r.mu.Lock()
r.NodeReports = append(r.NodeReports, &nodeReport)
if !nodeReport.Healthy {
r.Healthy = false
}
r.mu.Unlock()
return nil
})
}
return eg.Wait()
}
func (r *DERPNodeReport) derpURL() *url.URL {
derpURL := &url.URL{
Scheme: "https",
Host: r.Node.HostName,
Path: "/derp",
}
if r.Node.ForceHTTP {
derpURL.Scheme = "http"
}
if r.Node.HostName == "" {
derpURL.Host = fmt.Sprintf("%s:%d", r.Node.IPv4, r.Node.DERPPort)
}
return derpURL
}
func (r *DERPNodeReport) Run(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
r.ClientLogs = [][]string{}
r.ClientErrs = [][]error{}
r.doExchangeMessage(ctx)
r.doSTUNTest(ctx)
if !r.CanExchangeMessages || r.UsesWebsocket || r.STUN.Error != nil {
r.Healthy = false
}
return nil
}
func (r *DERPNodeReport) doExchangeMessage(ctx context.Context) {
if r.Node.STUNOnly {
return
}
var peerKey atomic.Pointer[key.NodePublic]
eg, ctx := errgroup.WithContext(ctx)
receive, receiveID, err := r.derpClient(ctx, r.derpURL())
if err != nil {
return
}
defer receive.Close()
eg.Go(func() error {
defer receive.Close()
pkt, err := r.recvData(receive)
if err != nil {
r.writeClientErr(receiveID, xerrors.Errorf("recv derp message: %w", err))
return err
}
if *peerKey.Load() != pkt.Source {
r.writeClientErr(receiveID, xerrors.Errorf("received pkt from unknown peer: %s", pkt.Source.ShortString()))
return err
}
t, err := time.Parse(time.RFC3339Nano, string(pkt.Data))
if err != nil {
r.writeClientErr(receiveID, xerrors.Errorf("parse time from peer: %w", err))
return err
}
r.mu.Lock()
r.CanExchangeMessages = true
r.RoundTripPing = time.Since(t)
r.mu.Unlock()
return nil
})
eg.Go(func() error {
send, sendID, err := r.derpClient(ctx, r.derpURL())
if err != nil {
return err
}
defer send.Close()
key := send.SelfPublicKey()
peerKey.Store(&key)
err = send.Send(receive.SelfPublicKey(), []byte(time.Now().Format(time.RFC3339Nano)))
if err != nil {
r.writeClientErr(sendID, xerrors.Errorf("send derp message: %w", err))
return err
}
return nil
})
_ = eg.Wait()
}
func (r *DERPNodeReport) doSTUNTest(ctx context.Context) {
if r.Node.STUNPort == -1 {
return
}
r.mu.Lock()
r.STUN.Enabled = true
r.mu.Unlock()
addr, port, err := r.stunAddr(ctx)
if err != nil {
r.STUN.Error = xerrors.Errorf("get stun addr: %w", err)
return
}
// We only create a prober to call ProbeUDP manually.
p, err := prober.DERP(prober.New(), "", time.Second, time.Second, time.Second)
if err != nil {
r.STUN.Error = xerrors.Errorf("create prober: %w", err)
return
}
err = p.ProbeUDP(addr, port)(ctx)
if err != nil {
r.STUN.Error = xerrors.Errorf("probe stun: %w", err)
return
}
r.mu.Lock()
r.STUN.CanSTUN = true
r.mu.Unlock()
}
func (r *DERPNodeReport) stunAddr(ctx context.Context) (string, int, error) {
port := r.Node.STUNPort
if port == 0 {
port = 3478
}
if port < 0 || port > 1<<16-1 {
return "", 0, xerrors.Errorf("invalid stun port %d", port)
}
if r.Node.STUNTestIP != "" {
ip, err := netip.ParseAddr(r.Node.STUNTestIP)
if err != nil {
return "", 0, xerrors.Errorf("invalid stun test ip %q: %w", r.Node.STUNTestIP, err)
}
return ip.String(), port, nil
}
if r.Node.HostName != "" {
addrs, err := net.DefaultResolver.LookupIPAddr(ctx, r.Node.HostName)
if err != nil {
return "", 0, xerrors.Errorf("lookup ip addr: %w", err)
}
for _, a := range addrs {
return a.String(), port, nil
}
}
if r.Node.IPv4 != "" {
ip, err := netip.ParseAddr(r.Node.IPv4)
if err != nil {
return "", 0, xerrors.Errorf("invalid ipv4 %q: %w", r.Node.IPv4, err)
}
if !ip.Is4() {
return "", 0, xerrors.Errorf("provided node ipv4 is not v4 %q: %w", r.Node.IPv4, err)
}
return ip.String(), port, nil
}
if r.Node.IPv6 != "" {
ip, err := netip.ParseAddr(r.Node.IPv6)
if err != nil {
return "", 0, xerrors.Errorf("invalid ipv6 %q: %w", r.Node.IPv6, err)
}
if !ip.Is6() {
return "", 0, xerrors.Errorf("provided node ipv6 is not v6 %q: %w", r.Node.IPv6, err)
}
return ip.String(), port, nil
}
return "", 0, xerrors.New("no stun ips provided")
}
func (r *DERPNodeReport) writeClientErr(clientID int, err error) {
r.mu.Lock()
r.ClientErrs[clientID] = append(r.ClientErrs[clientID], err)
r.mu.Unlock()
}
func (r *DERPNodeReport) derpClient(ctx context.Context, derpURL *url.URL) (*derphttp.Client, int, error) {
r.mu.Lock()
id := r.clientCounter
r.clientCounter++
r.ClientLogs = append(r.ClientLogs, []string{})
r.ClientErrs = append(r.ClientErrs, []error{})
r.mu.Unlock()
client, err := derphttp.NewClient(key.NewNode(), derpURL.String(), func(format string, args ...any) {
r.mu.Lock()
defer r.mu.Unlock()
msg := fmt.Sprintf(format, args...)
if strings.Contains(msg, "We'll use WebSockets on the next connection attempt") {
r.UsesWebsocket = true
}
r.ClientLogs[id] = append(r.ClientLogs[id], msg)
})
if err != nil {
err := xerrors.Errorf("create derp client: %w", err)
r.writeClientErr(id, err)
return nil, id, err
}
go func() {
<-ctx.Done()
_ = client.Close()
}()
i := 0
for ; i < 5; i++ {
err = client.Connect(ctx)
if err != nil {
r.writeClientErr(id, xerrors.Errorf("connect to derp: %w", err))
continue
}
break
}
if i == 5 {
err := xerrors.Errorf("couldn't connect after 5 tries, last error: %w", err)
r.writeClientErr(id, xerrors.Errorf("couldn't connect after 5 tries, last error: %w", err))
return nil, id, err
}
return client, id, nil
}
func (*DERPNodeReport) recvData(client *derphttp.Client) (derp.ReceivedPacket, error) {
for {
msg, err := client.Recv()
if err != nil {
if errors.Is(err, io.EOF) {
return derp.ReceivedPacket{}, nil
}
}
switch msg := msg.(type) {
case derp.ReceivedPacket:
return msg, nil
default:
// Drop all others!
}
}
}

View File

@ -0,0 +1,202 @@
package healthcheck_test
import (
"context"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"net/url"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/derp"
"tailscale.com/derp/derphttp"
"tailscale.com/ipn"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"github.com/coder/coder/coderd/healthcheck"
"github.com/coder/coder/tailnet"
)
func TestDERP(t *testing.T) {
t.Parallel()
t.Run("OK", func(t *testing.T) {
t.Parallel()
derpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
defer derpSrv.Close()
srv := httptest.NewServer(derphttp.Handler(derpSrv))
defer srv.Close()
var (
ctx = context.Background()
report = healthcheck.DERPReport{}
derpURL, _ = url.Parse(srv.URL)
opts = &healthcheck.DERPReportOptions{
DERPMap: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
1: {
EmbeddedRelay: true,
RegionID: 999,
Nodes: []*tailcfg.DERPNode{{
Name: "1a",
RegionID: 999,
HostName: derpURL.Host,
IPv4: derpURL.Host,
STUNPort: -1,
InsecureForTests: true,
ForceHTTP: true,
}},
},
}},
}
)
err := report.Run(ctx, opts)
require.NoError(t, err)
assert.True(t, report.Healthy)
for _, region := range report.Regions {
assert.True(t, region.Healthy)
for _, node := range region.NodeReports {
assert.True(t, node.Healthy)
assert.True(t, node.CanExchangeMessages)
assert.Positive(t, node.RoundTripPing)
assert.Len(t, node.ClientLogs, 2)
assert.Len(t, node.ClientLogs[0], 1)
assert.Len(t, node.ClientErrs[0], 0)
assert.Len(t, node.ClientLogs[1], 1)
assert.Len(t, node.ClientErrs[1], 0)
assert.False(t, node.STUN.Enabled)
assert.False(t, node.STUN.CanSTUN)
assert.NoError(t, node.STUN.Error)
}
}
})
t.Run("OK/Tailscale/Dallas", func(t *testing.T) {
t.Parallel()
derpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
defer derpSrv.Close()
srv := httptest.NewServer(derphttp.Handler(derpSrv))
defer srv.Close()
var (
ctx = context.Background()
report = healthcheck.DERPReport{}
opts = &healthcheck.DERPReportOptions{
DERPMap: tsDERPMap(ctx, t),
}
)
// Only include the Dallas region
opts.DERPMap.Regions = map[int]*tailcfg.DERPRegion{9: opts.DERPMap.Regions[9]}
err := report.Run(ctx, opts)
require.NoError(t, err)
assert.True(t, report.Healthy)
for _, region := range report.Regions {
assert.True(t, region.Healthy)
for _, node := range region.NodeReports {
assert.True(t, node.Healthy)
assert.True(t, node.CanExchangeMessages)
assert.Positive(t, node.RoundTripPing)
assert.Len(t, node.ClientLogs, 2)
assert.Len(t, node.ClientLogs[0], 1)
assert.Len(t, node.ClientErrs[0], 0)
assert.Len(t, node.ClientLogs[1], 1)
assert.Len(t, node.ClientErrs[1], 0)
assert.True(t, node.STUN.Enabled)
assert.True(t, node.STUN.CanSTUN)
assert.NoError(t, node.STUN.Error)
}
}
})
t.Run("ForceWebsockets", func(t *testing.T) {
t.Parallel()
derpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
defer derpSrv.Close()
handler, closeHandler := tailnet.WithWebsocketSupport(derpSrv, derphttp.Handler(derpSrv))
defer closeHandler()
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.Header.Get("Upgrade") == "DERP" {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte("bad request"))
return
}
handler.ServeHTTP(w, r)
}))
var (
ctx = context.Background()
report = healthcheck.DERPReport{}
derpURL, _ = url.Parse(srv.URL)
opts = &healthcheck.DERPReportOptions{
DERPMap: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
1: {
EmbeddedRelay: true,
RegionID: 999,
Nodes: []*tailcfg.DERPNode{{
Name: "1a",
RegionID: 999,
HostName: derpURL.Host,
IPv4: derpURL.Host,
STUNPort: -1,
InsecureForTests: true,
ForceHTTP: true,
}},
},
}},
}
)
report.Run(ctx, opts)
assert.False(t, report.Healthy)
for _, region := range report.Regions {
assert.False(t, region.Healthy)
for _, node := range region.NodeReports {
assert.False(t, node.Healthy)
assert.True(t, node.CanExchangeMessages)
assert.Positive(t, node.RoundTripPing)
assert.Len(t, node.ClientLogs, 2)
assert.Len(t, node.ClientLogs[0], 3)
assert.Len(t, node.ClientLogs[1], 3)
assert.Len(t, node.ClientErrs, 2)
assert.Len(t, node.ClientErrs[0], 1)
assert.Len(t, node.ClientErrs[1], 1)
assert.True(t, node.UsesWebsocket)
assert.False(t, node.STUN.Enabled)
assert.False(t, node.STUN.CanSTUN)
assert.NoError(t, node.STUN.Error)
}
}
})
}
func tsDERPMap(ctx context.Context, t testing.TB) *tailcfg.DERPMap {
req, err := http.NewRequestWithContext(ctx, "GET", ipn.DefaultControlURL+"/derpmap/default", nil)
require.NoError(t, err)
res, err := http.DefaultClient.Do(req)
require.NoError(t, err)
defer res.Body.Close()
require.Equal(t, http.StatusOK, res.StatusCode)
var derpMap tailcfg.DERPMap
err = json.NewDecoder(io.LimitReader(res.Body, 1<<20)).Decode(&derpMap)
require.NoError(t, err)
return &derpMap
}

View File

@ -0,0 +1,42 @@
package healthcheck
import (
"context"
"time"
"golang.org/x/xerrors"
"tailscale.com/tailcfg"
)
type Report struct {
// Time is the time the report was generated at.
Time time.Time `json:"time"`
// Healthy is true if the report returns no errors.
Healthy bool `json:"pass"`
DERP DERPReport `json:"derp"`
// TODO
// AccessURL AccessURLReport
// Websocket WebsocketReport
}
type ReportOptions struct {
// TODO: support getting this over HTTP?
DERPMap *tailcfg.DERPMap
}
func Run(ctx context.Context, opts *ReportOptions) (*Report, error) {
var report Report
err := report.DERP.Run(ctx, &DERPReportOptions{
DERPMap: opts.DERPMap,
})
if err != nil {
return nil, xerrors.Errorf("run derp: %w", err)
}
report.Time = time.Now()
report.Healthy = report.DERP.Healthy
return &report, nil
}

View File

@ -0,0 +1,3 @@
package healthcheck
type WebsocketReport struct{}