mirror of
https://github.com/coder/coder.git
synced 2025-07-09 11:45:56 +00:00
I modified the proxy host cache we already had and were using for websocket csp headers to also include the wildcard app host, then used those for frame-src policies. I did not add frame-ancestors, since if I understand correctly, those would go on the app, and this middleware does not come into play there. Maybe we will want to add it on workspace apps like we do with cors, if we find apps are setting it to `none` or something. Closes https://github.com/coder/internal/issues/684
387 lines
13 KiB
Go
387 lines
13 KiB
Go
package proxyhealth
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"golang.org/x/sync/errgroup"
|
|
"golang.org/x/xerrors"
|
|
|
|
"cdr.dev/slog"
|
|
"github.com/coder/coder/v2/coderd/database"
|
|
"github.com/coder/coder/v2/coderd/database/dbauthz"
|
|
"github.com/coder/coder/v2/coderd/prometheusmetrics"
|
|
agplproxyhealth "github.com/coder/coder/v2/coderd/proxyhealth"
|
|
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
|
|
"github.com/coder/coder/v2/codersdk"
|
|
)
|
|
|
|
// Status is the health classification recorded for a workspace proxy.
type Status string

const (
	// Unknown is a placeholder value. A completed proxy health check should
	// never report it.
	Unknown Status = "unknown"
	// Healthy indicates the proxy access url could be reached and answered
	// with a healthy status code.
	Healthy Status = "ok"
	// Unreachable indicates the proxy access url did not respond.
	Unreachable Status = "unreachable"
	// Unhealthy indicates the proxy access url responded, but the proxy has
	// some problem. The problem may or may not be preventing functionality.
	Unhealthy Status = "unhealthy"
	// Unregistered indicates the proxy has not registered a url yet, i.e. it
	// was created with the cli but has not been started.
	Unregistered Status = "unregistered"
)
|
|
|
|
type Options struct {
|
|
// Interval is the interval at which the proxy health is checked.
|
|
Interval time.Duration
|
|
DB database.Store
|
|
Logger slog.Logger
|
|
Client *http.Client
|
|
Prometheus *prometheus.Registry
|
|
}
|
|
|
|
// ProxyHealth runs a go routine that periodically checks the health of all
|
|
// workspace proxies. This information is stored in memory, so each coderd
|
|
// replica has its own view of the health of the proxies. These views should be
|
|
// consistent, and if they are not, it indicates a problem.
|
|
type ProxyHealth struct {
|
|
db database.Store
|
|
interval time.Duration
|
|
logger slog.Logger
|
|
client *http.Client
|
|
|
|
// Cached values for quick access to the health of proxies.
|
|
cache *atomic.Pointer[map[uuid.UUID]ProxyStatus]
|
|
proxyHosts *atomic.Pointer[[]*agplproxyhealth.ProxyHost]
|
|
|
|
// PromMetrics
|
|
healthCheckDuration prometheus.Histogram
|
|
healthCheckResults *prometheusmetrics.CachedGaugeVec
|
|
}
|
|
|
|
func New(opts *Options) (*ProxyHealth, error) {
|
|
if opts.Interval <= 0 {
|
|
opts.Interval = time.Minute
|
|
}
|
|
if opts.DB == nil {
|
|
return nil, xerrors.Errorf("db is required")
|
|
}
|
|
if opts.Prometheus == nil {
|
|
opts.Prometheus = prometheus.NewRegistry()
|
|
}
|
|
|
|
client := opts.Client
|
|
if client == nil {
|
|
client = http.DefaultClient
|
|
}
|
|
// Set a timeout on the client, so we don't wait forever for a healthz response.
|
|
tmp := *client
|
|
tmp.Timeout = time.Second * 5
|
|
client = &tmp
|
|
|
|
// Prometheus metrics
|
|
healthCheckDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "proxyhealth",
|
|
Name: "health_check_duration_seconds",
|
|
Help: "Histogram for duration of proxy health collection in seconds.",
|
|
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
|
|
})
|
|
opts.Prometheus.MustRegister(healthCheckDuration)
|
|
|
|
healthCheckResults := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "coderd",
|
|
Subsystem: "proxyhealth",
|
|
Name: "health_check_results",
|
|
Help: "This endpoint returns a number to indicate the health status. " +
|
|
"-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)",
|
|
}, []string{"proxy_id"}))
|
|
opts.Prometheus.MustRegister(healthCheckResults)
|
|
|
|
return &ProxyHealth{
|
|
db: opts.DB,
|
|
interval: opts.Interval,
|
|
logger: opts.Logger,
|
|
client: client,
|
|
cache: &atomic.Pointer[map[uuid.UUID]ProxyStatus]{},
|
|
proxyHosts: &atomic.Pointer[[]*agplproxyhealth.ProxyHost]{},
|
|
healthCheckDuration: healthCheckDuration,
|
|
healthCheckResults: healthCheckResults,
|
|
}, nil
|
|
}
|
|
|
|
// Run will block until the context is canceled. It will periodically check the
|
|
// health of all proxies and store the results in the cache.
|
|
func (p *ProxyHealth) Run(ctx context.Context) {
|
|
ticker := time.NewTicker(p.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case now := <-ticker.C:
|
|
statuses, err := p.runOnce(ctx, now)
|
|
if err != nil {
|
|
p.logger.Error(ctx, "proxy health check failed", slog.Error(err))
|
|
continue
|
|
}
|
|
p.storeProxyHealth(statuses)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *ProxyHealth) storeProxyHealth(statuses map[uuid.UUID]ProxyStatus) {
|
|
var proxyHosts []*agplproxyhealth.ProxyHost
|
|
for _, s := range statuses {
|
|
if s.ProxyHost != nil {
|
|
proxyHosts = append(proxyHosts, s.ProxyHost)
|
|
}
|
|
}
|
|
|
|
// Store the statuses in the cache before any other quick values.
|
|
p.cache.Store(&statuses)
|
|
p.proxyHosts.Store(&proxyHosts)
|
|
}
|
|
|
|
// ForceUpdate runs a single health check and updates the cache. If the health
|
|
// check fails, the cache is not updated and an error is returned. This is useful
|
|
// to trigger an update when a proxy is created or deleted.
|
|
func (p *ProxyHealth) ForceUpdate(ctx context.Context) error {
|
|
statuses, err := p.runOnce(ctx, time.Now())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
p.storeProxyHealth(statuses)
|
|
return nil
|
|
}
|
|
|
|
// HealthStatus returns the current health status of all proxies stored in the
|
|
// cache.
|
|
func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus {
|
|
if p == nil {
|
|
// This can happen because workspace proxies are still an experiment.
|
|
// For the /regions endpoint, this will be nil in those cases.
|
|
return map[uuid.UUID]ProxyStatus{}
|
|
}
|
|
ptr := p.cache.Load()
|
|
if ptr == nil {
|
|
return map[uuid.UUID]ProxyStatus{}
|
|
}
|
|
return *ptr
|
|
}
|
|
|
|
type ProxyStatus struct {
|
|
// ProxyStatus includes the value of the proxy at the time of checking. This is
|
|
// useful to know as it helps determine if the proxy checked has different values
|
|
// then the proxy in hand. AKA if the proxy was updated, and the status was for
|
|
// an older proxy.
|
|
Proxy database.WorkspaceProxy
|
|
// ProxyHost is the base host:port and app host of the proxy. This is included
|
|
// in the status to make sure the proxy url is a valid URL. It also makes it
|
|
// easier to escalate errors if the url.Parse errors (should never happen).
|
|
ProxyHost *agplproxyhealth.ProxyHost
|
|
Status Status
|
|
Report codersdk.ProxyHealthReport
|
|
CheckedAt time.Time
|
|
}
|
|
|
|
// ProxyHosts returns the host:port and wildcard host of all healthy proxies.
|
|
// This can be computed from HealthStatus, but is cached to avoid the caller
|
|
// needing to loop over all proxies to compute this on all static web requests.
|
|
func (p *ProxyHealth) ProxyHosts() []*agplproxyhealth.ProxyHost {
|
|
ptr := p.proxyHosts.Load()
|
|
if ptr == nil {
|
|
return []*agplproxyhealth.ProxyHost{}
|
|
}
|
|
return *ptr
|
|
}
|
|
|
|
// runOnce runs the health check for all workspace proxies. If there is an
|
|
// unexpected error, an error is returned. Expected errors will mark a proxy as
|
|
// unreachable.
|
|
func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) {
|
|
// Record from the given time.
|
|
defer func() { p.healthCheckDuration.Observe(time.Since(now).Seconds()) }()
|
|
|
|
//nolint:gocritic // Proxy health is a system service.
|
|
proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx))
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("get workspace proxies: %w", err)
|
|
}
|
|
|
|
// Just use a mutex to protect map writes.
|
|
var statusMu sync.Mutex
|
|
proxyStatus := map[uuid.UUID]ProxyStatus{}
|
|
|
|
grp, gctx := errgroup.WithContext(ctx)
|
|
// Arbitrary parallelism limit.
|
|
grp.SetLimit(5)
|
|
|
|
for _, proxy := range proxies {
|
|
if proxy.Deleted {
|
|
// Ignore deleted proxies.
|
|
continue
|
|
}
|
|
// Each proxy needs to have a status set. Make a local copy for the
|
|
// call to be run async.
|
|
proxy := proxy
|
|
status := ProxyStatus{
|
|
Proxy: proxy,
|
|
CheckedAt: now,
|
|
Status: Unknown,
|
|
}
|
|
|
|
grp.Go(func() error {
|
|
if proxy.Url == "" {
|
|
// Empty URL means the proxy has not registered yet.
|
|
// When the proxy is started, it will update the url.
|
|
statusMu.Lock()
|
|
defer statusMu.Unlock()
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String())
|
|
status.Status = Unregistered
|
|
proxyStatus[proxy.ID] = status
|
|
return nil
|
|
}
|
|
|
|
// Try to hit the healthz-report endpoint for a comprehensive health check.
|
|
reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/"))
|
|
req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil)
|
|
if err != nil {
|
|
return xerrors.Errorf("new request: %w", err)
|
|
}
|
|
req = req.WithContext(gctx)
|
|
|
|
resp, err := p.client.Do(req)
|
|
if err == nil {
|
|
defer resp.Body.Close()
|
|
}
|
|
// A switch statement felt easier to categorize the different cases than
|
|
// if else statements or nested if statements.
|
|
switch {
|
|
case err == nil && resp.StatusCode == http.StatusOK:
|
|
err := json.NewDecoder(resp.Body).Decode(&status.Report)
|
|
if err != nil {
|
|
isCoderErr := xerrors.Errorf("proxy url %q is not a coder proxy instance, verify the url is correct", reqURL)
|
|
if resp.Header.Get(codersdk.BuildVersionHeader) != "" {
|
|
isCoderErr = xerrors.Errorf("proxy url %q is a coder instance, but unable to decode the response payload. Could this be a primary coderd and not a proxy?", reqURL)
|
|
}
|
|
|
|
// If the response is not json, then the user likely input a bad url that returns status code 200.
|
|
// This is very common, since most webpages do return a 200. So let's improve the error message.
|
|
if notJSONErr := codersdk.ExpectJSONMime(resp); notJSONErr != nil {
|
|
err = errors.Join(
|
|
isCoderErr,
|
|
xerrors.Errorf("attempted to query health at %q but got back the incorrect content type: %w", reqURL, notJSONErr),
|
|
)
|
|
|
|
status.Report.Errors = []string{
|
|
err.Error(),
|
|
}
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
|
|
// If we cannot read the report, mark the proxy as unhealthy.
|
|
status.Report.Errors = []string{
|
|
errors.Join(
|
|
isCoderErr,
|
|
xerrors.Errorf("received a status code 200, but failed to decode health report body: %w", err),
|
|
).Error(),
|
|
}
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
if len(status.Report.Errors) > 0 {
|
|
status.Status = Unhealthy
|
|
break
|
|
}
|
|
|
|
status.Status = Healthy
|
|
case err == nil && resp.StatusCode != http.StatusOK:
|
|
// Unhealthy as we did reach the proxy but it got an unexpected response.
|
|
status.Status = Unhealthy
|
|
var builder strings.Builder
|
|
// This string is shown on the UI where newlines are respected.
|
|
// This error message is not ever decoded programmatically, so keep it human-
|
|
// readable.
|
|
builder.WriteString(fmt.Sprintf("unexpected status code %d. ", resp.StatusCode))
|
|
builder.WriteString(fmt.Sprintf("\nEncountered error, send a request to %q from the Coderd environment to debug this issue.", reqURL))
|
|
// err will always be non-nil
|
|
err := codersdk.ReadBodyAsError(resp)
|
|
var apiErr *codersdk.Error
|
|
if xerrors.As(err, &apiErr) {
|
|
builder.WriteString(fmt.Sprintf("\nError Message: %s\nError Detail: %s", apiErr.Message, apiErr.Detail))
|
|
for _, v := range apiErr.Validations {
|
|
// Pretty sure this is not possible from the called endpoint, but just in case.
|
|
builder.WriteString(fmt.Sprintf("\n\tValidation: %s=%s", v.Field, v.Detail))
|
|
}
|
|
}
|
|
builder.WriteString(fmt.Sprintf("\nError: %s", err.Error()))
|
|
|
|
status.Report.Errors = []string{builder.String()}
|
|
case err != nil:
|
|
// Request failed, mark the proxy as unreachable.
|
|
status.Status = Unreachable
|
|
status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())}
|
|
default:
|
|
// This should never happen
|
|
status.Status = Unknown
|
|
}
|
|
|
|
u, err := url.Parse(proxy.Url)
|
|
if err != nil {
|
|
// This should never happen. This would mean the proxy sent
|
|
// us an invalid url?
|
|
status.Report.Errors = append(status.Report.Errors, fmt.Sprintf("failed to parse proxy url: %s", err.Error()))
|
|
status.Status = Unhealthy
|
|
}
|
|
status.ProxyHost = &agplproxyhealth.ProxyHost{
|
|
Host: u.Host,
|
|
AppHost: appurl.ConvertAppHostForCSP(u.Host, proxy.WildcardHostname),
|
|
}
|
|
|
|
// Set the prometheus metric correctly.
|
|
switch status.Status {
|
|
case Healthy:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String())
|
|
case Unhealthy:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String())
|
|
case Unreachable:
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String())
|
|
default:
|
|
// Unknown
|
|
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String())
|
|
}
|
|
|
|
statusMu.Lock()
|
|
defer statusMu.Unlock()
|
|
proxyStatus[proxy.ID] = status
|
|
return nil
|
|
})
|
|
}
|
|
|
|
err = grp.Wait()
|
|
if err != nil {
|
|
return nil, xerrors.Errorf("group run: %w", err)
|
|
}
|
|
p.healthCheckResults.Commit()
|
|
|
|
return proxyStatus, nil
|
|
}
|