Files
coder/enterprise/coderd/proxyhealth/proxyhealth.go
Asher 82c14e00ce feat: add csp headers for embedded apps (#18374)
I modified the proxy host cache we already had and were using for
websocket csp headers to also include the wildcard app host, then used
those for frame-src policies.

I did not add frame-ancestors, since if I understand correctly, those
would go on the app, and this middleware does not come into play there.
Maybe we will want to add it on workspace apps like we do with cors, if
we find apps are setting it to `none` or something.

Closes https://github.com/coder/internal/issues/684
2025-06-17 09:00:32 -08:00

387 lines
13 KiB
Go

package proxyhealth
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
agplproxyhealth "github.com/coder/coder/v2/coderd/proxyhealth"
"github.com/coder/coder/v2/coderd/workspaceapps/appurl"
"github.com/coder/coder/v2/codersdk"
)
type Status string
const (
// Unknown should never be returned by the proxy health check.
Unknown Status = "unknown"
// Healthy means the proxy access url is reachable and returns a healthy
// status code.
Healthy Status = "ok"
// Unreachable means the proxy access url is not responding.
Unreachable Status = "unreachable"
// Unhealthy means the proxy access url is responding, but there is some
// problem with the proxy. This problem may or may not be preventing functionality.
Unhealthy Status = "unhealthy"
// Unregistered means the proxy has not registered a url yet. This means
// the proxy was created with the cli, but has not yet been started.
Unregistered Status = "unregistered"
)
type Options struct {
// Interval is the interval at which the proxy health is checked.
Interval time.Duration
DB database.Store
Logger slog.Logger
Client *http.Client
Prometheus *prometheus.Registry
}
// ProxyHealth runs a go routine that periodically checks the health of all
// workspace proxies. This information is stored in memory, so each coderd
// replica has its own view of the health of the proxies. These views should be
// consistent, and if they are not, it indicates a problem.
type ProxyHealth struct {
db database.Store
interval time.Duration
logger slog.Logger
client *http.Client
// Cached values for quick access to the health of proxies.
cache *atomic.Pointer[map[uuid.UUID]ProxyStatus]
proxyHosts *atomic.Pointer[[]*agplproxyhealth.ProxyHost]
// PromMetrics
healthCheckDuration prometheus.Histogram
healthCheckResults *prometheusmetrics.CachedGaugeVec
}
func New(opts *Options) (*ProxyHealth, error) {
if opts.Interval <= 0 {
opts.Interval = time.Minute
}
if opts.DB == nil {
return nil, xerrors.Errorf("db is required")
}
if opts.Prometheus == nil {
opts.Prometheus = prometheus.NewRegistry()
}
client := opts.Client
if client == nil {
client = http.DefaultClient
}
// Set a timeout on the client, so we don't wait forever for a healthz response.
tmp := *client
tmp.Timeout = time.Second * 5
client = &tmp
// Prometheus metrics
healthCheckDuration := prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "coderd",
Subsystem: "proxyhealth",
Name: "health_check_duration_seconds",
Help: "Histogram for duration of proxy health collection in seconds.",
Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30},
})
opts.Prometheus.MustRegister(healthCheckDuration)
healthCheckResults := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "coderd",
Subsystem: "proxyhealth",
Name: "health_check_results",
Help: "This endpoint returns a number to indicate the health status. " +
"-3 (unknown), -2 (Unreachable), -1 (Unhealthy), 0 (Unregistered), 1 (Healthy)",
}, []string{"proxy_id"}))
opts.Prometheus.MustRegister(healthCheckResults)
return &ProxyHealth{
db: opts.DB,
interval: opts.Interval,
logger: opts.Logger,
client: client,
cache: &atomic.Pointer[map[uuid.UUID]ProxyStatus]{},
proxyHosts: &atomic.Pointer[[]*agplproxyhealth.ProxyHost]{},
healthCheckDuration: healthCheckDuration,
healthCheckResults: healthCheckResults,
}, nil
}
// Run will block until the context is canceled. It will periodically check the
// health of all proxies and store the results in the cache.
func (p *ProxyHealth) Run(ctx context.Context) {
ticker := time.NewTicker(p.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case now := <-ticker.C:
statuses, err := p.runOnce(ctx, now)
if err != nil {
p.logger.Error(ctx, "proxy health check failed", slog.Error(err))
continue
}
p.storeProxyHealth(statuses)
}
}
}
func (p *ProxyHealth) storeProxyHealth(statuses map[uuid.UUID]ProxyStatus) {
var proxyHosts []*agplproxyhealth.ProxyHost
for _, s := range statuses {
if s.ProxyHost != nil {
proxyHosts = append(proxyHosts, s.ProxyHost)
}
}
// Store the statuses in the cache before any other quick values.
p.cache.Store(&statuses)
p.proxyHosts.Store(&proxyHosts)
}
// ForceUpdate runs a single health check and updates the cache. If the health
// check fails, the cache is not updated and an error is returned. This is useful
// to trigger an update when a proxy is created or deleted.
func (p *ProxyHealth) ForceUpdate(ctx context.Context) error {
statuses, err := p.runOnce(ctx, time.Now())
if err != nil {
return err
}
p.storeProxyHealth(statuses)
return nil
}
// HealthStatus returns the current health status of all proxies stored in the
// cache.
func (p *ProxyHealth) HealthStatus() map[uuid.UUID]ProxyStatus {
if p == nil {
// This can happen because workspace proxies are still an experiment.
// For the /regions endpoint, this will be nil in those cases.
return map[uuid.UUID]ProxyStatus{}
}
ptr := p.cache.Load()
if ptr == nil {
return map[uuid.UUID]ProxyStatus{}
}
return *ptr
}
type ProxyStatus struct {
// ProxyStatus includes the value of the proxy at the time of checking. This is
// useful to know as it helps determine if the proxy checked has different values
// then the proxy in hand. AKA if the proxy was updated, and the status was for
// an older proxy.
Proxy database.WorkspaceProxy
// ProxyHost is the base host:port and app host of the proxy. This is included
// in the status to make sure the proxy url is a valid URL. It also makes it
// easier to escalate errors if the url.Parse errors (should never happen).
ProxyHost *agplproxyhealth.ProxyHost
Status Status
Report codersdk.ProxyHealthReport
CheckedAt time.Time
}
// ProxyHosts returns the host:port and wildcard host of all healthy proxies.
// This can be computed from HealthStatus, but is cached to avoid the caller
// needing to loop over all proxies to compute this on all static web requests.
func (p *ProxyHealth) ProxyHosts() []*agplproxyhealth.ProxyHost {
ptr := p.proxyHosts.Load()
if ptr == nil {
return []*agplproxyhealth.ProxyHost{}
}
return *ptr
}
// runOnce runs the health check for all workspace proxies. If there is an
// unexpected error, an error is returned. Expected errors will mark a proxy as
// unreachable.
func (p *ProxyHealth) runOnce(ctx context.Context, now time.Time) (map[uuid.UUID]ProxyStatus, error) {
// Record from the given time.
defer func() { p.healthCheckDuration.Observe(time.Since(now).Seconds()) }()
//nolint:gocritic // Proxy health is a system service.
proxies, err := p.db.GetWorkspaceProxies(dbauthz.AsSystemRestricted(ctx))
if err != nil {
return nil, xerrors.Errorf("get workspace proxies: %w", err)
}
// Just use a mutex to protect map writes.
var statusMu sync.Mutex
proxyStatus := map[uuid.UUID]ProxyStatus{}
grp, gctx := errgroup.WithContext(ctx)
// Arbitrary parallelism limit.
grp.SetLimit(5)
for _, proxy := range proxies {
if proxy.Deleted {
// Ignore deleted proxies.
continue
}
// Each proxy needs to have a status set. Make a local copy for the
// call to be run async.
proxy := proxy
status := ProxyStatus{
Proxy: proxy,
CheckedAt: now,
Status: Unknown,
}
grp.Go(func() error {
if proxy.Url == "" {
// Empty URL means the proxy has not registered yet.
// When the proxy is started, it will update the url.
statusMu.Lock()
defer statusMu.Unlock()
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 0, proxy.ID.String())
status.Status = Unregistered
proxyStatus[proxy.ID] = status
return nil
}
// Try to hit the healthz-report endpoint for a comprehensive health check.
reqURL := fmt.Sprintf("%s/healthz-report", strings.TrimSuffix(proxy.Url, "/"))
req, err := http.NewRequestWithContext(gctx, http.MethodGet, reqURL, nil)
if err != nil {
return xerrors.Errorf("new request: %w", err)
}
req = req.WithContext(gctx)
resp, err := p.client.Do(req)
if err == nil {
defer resp.Body.Close()
}
// A switch statement felt easier to categorize the different cases than
// if else statements or nested if statements.
switch {
case err == nil && resp.StatusCode == http.StatusOK:
err := json.NewDecoder(resp.Body).Decode(&status.Report)
if err != nil {
isCoderErr := xerrors.Errorf("proxy url %q is not a coder proxy instance, verify the url is correct", reqURL)
if resp.Header.Get(codersdk.BuildVersionHeader) != "" {
isCoderErr = xerrors.Errorf("proxy url %q is a coder instance, but unable to decode the response payload. Could this be a primary coderd and not a proxy?", reqURL)
}
// If the response is not json, then the user likely input a bad url that returns status code 200.
// This is very common, since most webpages do return a 200. So let's improve the error message.
if notJSONErr := codersdk.ExpectJSONMime(resp); notJSONErr != nil {
err = errors.Join(
isCoderErr,
xerrors.Errorf("attempted to query health at %q but got back the incorrect content type: %w", reqURL, notJSONErr),
)
status.Report.Errors = []string{
err.Error(),
}
status.Status = Unhealthy
break
}
// If we cannot read the report, mark the proxy as unhealthy.
status.Report.Errors = []string{
errors.Join(
isCoderErr,
xerrors.Errorf("received a status code 200, but failed to decode health report body: %w", err),
).Error(),
}
status.Status = Unhealthy
break
}
if len(status.Report.Errors) > 0 {
status.Status = Unhealthy
break
}
status.Status = Healthy
case err == nil && resp.StatusCode != http.StatusOK:
// Unhealthy as we did reach the proxy but it got an unexpected response.
status.Status = Unhealthy
var builder strings.Builder
// This string is shown on the UI where newlines are respected.
// This error message is not ever decoded programmatically, so keep it human-
// readable.
builder.WriteString(fmt.Sprintf("unexpected status code %d. ", resp.StatusCode))
builder.WriteString(fmt.Sprintf("\nEncountered error, send a request to %q from the Coderd environment to debug this issue.", reqURL))
// err will always be non-nil
err := codersdk.ReadBodyAsError(resp)
var apiErr *codersdk.Error
if xerrors.As(err, &apiErr) {
builder.WriteString(fmt.Sprintf("\nError Message: %s\nError Detail: %s", apiErr.Message, apiErr.Detail))
for _, v := range apiErr.Validations {
// Pretty sure this is not possible from the called endpoint, but just in case.
builder.WriteString(fmt.Sprintf("\n\tValidation: %s=%s", v.Field, v.Detail))
}
}
builder.WriteString(fmt.Sprintf("\nError: %s", err.Error()))
status.Report.Errors = []string{builder.String()}
case err != nil:
// Request failed, mark the proxy as unreachable.
status.Status = Unreachable
status.Report.Errors = []string{fmt.Sprintf("request to proxy failed: %s", err.Error())}
default:
// This should never happen
status.Status = Unknown
}
u, err := url.Parse(proxy.Url)
if err != nil {
// This should never happen. This would mean the proxy sent
// us an invalid url?
status.Report.Errors = append(status.Report.Errors, fmt.Sprintf("failed to parse proxy url: %s", err.Error()))
status.Status = Unhealthy
}
status.ProxyHost = &agplproxyhealth.ProxyHost{
Host: u.Host,
AppHost: appurl.ConvertAppHostForCSP(u.Host, proxy.WildcardHostname),
}
// Set the prometheus metric correctly.
switch status.Status {
case Healthy:
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, 1, proxy.ID.String())
case Unhealthy:
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -1, proxy.ID.String())
case Unreachable:
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -2, proxy.ID.String())
default:
// Unknown
p.healthCheckResults.WithLabelValues(prometheusmetrics.VectorOperationSet, -3, proxy.ID.String())
}
statusMu.Lock()
defer statusMu.Unlock()
proxyStatus[proxy.ID] = status
return nil
})
}
err = grp.Wait()
if err != nil {
return nil, xerrors.Errorf("group run: %w", err)
}
p.healthCheckResults.Commit()
return proxyStatus, nil
}