package cliui

import (
	"context"
	"fmt"
	"io"
	"strconv"
	"strings"
	"time"

	"github.com/google/uuid"
	"golang.org/x/xerrors"
	"tailscale.com/tailcfg"

	"github.com/coder/coder/v2/codersdk"
	"github.com/coder/coder/v2/codersdk/healthsdk"
	"github.com/coder/coder/v2/codersdk/workspacesdk"
	"github.com/coder/coder/v2/tailnet"
)

var errAgentShuttingDown = xerrors.New("agent is shutting down")

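// AgentOptions controls how Agent polls and reports on a workspace agent.
// Fetch is required and returns the agent's current state. FetchLogs streams
// startup script logs and may be nil, in which case a no-op stub is used and
// no logs are shown. A zero FetchInterval defaults to 500ms. DocsURL is the
// base URL used to build troubleshooting links.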
type AgentOptions struct {
	FetchInterval time.Duration
	Fetch         func(ctx context.Context, agentID uuid.UUID) (codersdk.WorkspaceAgent, error)
	FetchLogs     func(ctx context.Context, agentID uuid.UUID, after int64, follow bool) (<-chan []codersdk.WorkspaceAgentLog, io.Closer, error)
	Wait          bool // If true, wait for the agent to be ready (startup script).
	DocsURL       string
}

// Agent displays a spinning indicator that waits for a workspace agent to connect.
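//
// A minimal usage sketch (fetchAgentFn and fetchAgentLogsFn are illustrative
// placeholders matching the AgentOptions signatures, not part of this package):
//
//	err := cliui.Agent(ctx, os.Stderr, agentID, cliui.AgentOptions{
//		Fetch:     fetchAgentFn,     // e.g. a closure around an API client
//		FetchLogs: fetchAgentLogsFn, // nil is allowed; a no-op stub is substituted
//		Wait:      true,             // block until startup scripts finish
//	})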
func Agent(ctx context.Context, writer io.Writer, agentID uuid.UUID, opts AgentOptions) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	if opts.FetchInterval == 0 {
		opts.FetchInterval = 500 * time.Millisecond
	}
	if opts.FetchLogs == nil {
		opts.FetchLogs = func(_ context.Context, _ uuid.UUID, _ int64, _ bool) (<-chan []codersdk.WorkspaceAgentLog, io.Closer, error) {
			c := make(chan []codersdk.WorkspaceAgentLog)
			close(c)
			return c, closeFunc(func() error { return nil }), nil
		}
	}

	type fetchAgent struct {
		agent codersdk.WorkspaceAgent
		err   error
	}
	fetchedAgent := make(chan fetchAgent, 1)
	go func() {
		t := time.NewTimer(0)
		defer t.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-t.C:
				agent, err := opts.Fetch(ctx, agentID)
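				// fetchedAgent has capacity 1; drain any stale result so the
				// send below never blocks and the reader only ever sees the
				// most recent fetch.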
				select {
				case <-fetchedAgent:
				default:
				}
				if err != nil {
					fetchedAgent <- fetchAgent{err: xerrors.Errorf("fetch workspace agent: %w", err)}
					return
				}
				fetchedAgent <- fetchAgent{agent: agent}
				t.Reset(opts.FetchInterval)
			}
		}
	}()
	fetch := func() (codersdk.WorkspaceAgent, error) {
		select {
		case <-ctx.Done():
			return codersdk.WorkspaceAgent{}, ctx.Err()
		case f := <-fetchedAgent:
			if f.err != nil {
				return codersdk.WorkspaceAgent{}, f.err
			}
			return f.agent, nil
		}
	}

	agent, err := fetch()
	if err != nil {
		return xerrors.Errorf("fetch: %w", err)
	}
	logSources := map[uuid.UUID]codersdk.WorkspaceAgentLogSource{}
	for _, source := range agent.LogSources {
		logSources[source.ID] = source
	}

	sw := &stageWriter{w: writer}

	showStartupLogs := false
	for {
		// It doesn't matter if we're connected or not; if the agent is
		// shutting down, we don't know if it's coming back.
		if agent.LifecycleState.ShuttingDown() {
			return errAgentShuttingDown
		}

		switch agent.Status {
		case codersdk.WorkspaceAgentConnecting, codersdk.WorkspaceAgentTimeout:
			// Since we were waiting for the agent to connect, also show
			// startup logs if applicable.
			showStartupLogs = true

			stage := "Waiting for the workspace agent to connect"
			sw.Start(stage)
			for agent.Status == codersdk.WorkspaceAgentConnecting {
				if agent, err = fetch(); err != nil {
					return xerrors.Errorf("fetch: %w", err)
				}
			}

			if agent.Status == codersdk.WorkspaceAgentTimeout {
				now := time.Now()
				sw.Log(now, codersdk.LogLevelInfo, "The workspace agent is having trouble connecting, wait for it to connect or restart your workspace.")
				sw.Log(now, codersdk.LogLevelInfo, troubleshootingMessage(agent, fmt.Sprintf("%s/admin/templates/troubleshooting#agent-connection-issues", opts.DocsURL)))
				for agent.Status == codersdk.WorkspaceAgentTimeout {
					if agent, err = fetch(); err != nil {
						return xerrors.Errorf("fetch: %w", err)
					}
				}
			}
			sw.Complete(stage, agent.FirstConnectedAt.Sub(agent.CreatedAt))

		case codersdk.WorkspaceAgentConnected:
			if !showStartupLogs && agent.LifecycleState == codersdk.WorkspaceAgentLifecycleReady {
				// The workspace is ready, there's nothing to do but connect.
				return nil
			}

			stage := "Running workspace agent startup scripts"
			follow := opts.Wait && agent.LifecycleState.Starting()
			if !follow {
				stage += " (non-blocking)"
			}
			sw.Start(stage)
			if follow {
				sw.Log(time.Time{}, codersdk.LogLevelInfo, "==> ℹ︎ To connect immediately, reconnect with --wait=no or CODER_SSH_WAIT=no, see --help for more information.")
			}

			err = func() error { // Use func because of defer in for loop.
				logStream, logsCloser, err := opts.FetchLogs(ctx, agent.ID, 0, follow)
				if err != nil {
					return xerrors.Errorf("fetch workspace agent startup logs: %w", err)
				}
				defer logsCloser.Close()

				var lastLog codersdk.WorkspaceAgentLog
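				// When not following, swap in a nil channel: receiving from a
				// nil channel blocks forever, so the select below only reacts
				// to log batches and context cancellation.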
				fetchedAgentWhileFollowing := fetchedAgent
				if !follow {
					fetchedAgentWhileFollowing = nil
				}
				for {
					// This select is essentially an inline `fetch()`.
					select {
					case <-ctx.Done():
						return ctx.Err()
					case f := <-fetchedAgentWhileFollowing:
						if f.err != nil {
							return xerrors.Errorf("fetch: %w", f.err)
						}
						agent = f.agent

						// If the agent is no longer starting, stop following
						// logs because FetchLogs will keep streaming forever.
						// We do one last non-follow request to ensure we have
						// fetched all logs.
						if !agent.LifecycleState.Starting() {
							_ = logsCloser.Close()
							fetchedAgentWhileFollowing = nil

							logStream, logsCloser, err = opts.FetchLogs(ctx, agent.ID, lastLog.ID, false)
							if err != nil {
								return xerrors.Errorf("fetch workspace agent startup logs: %w", err)
							}
							// Logs are already primed, so we can call close.
							_ = logsCloser.Close()
						}
					case logs, ok := <-logStream:
						if !ok {
							return nil
						}
						for _, log := range logs {
							source, hasSource := logSources[log.SourceID]
							output := log.Output
							if hasSource && source.DisplayName != "" {
								output = source.DisplayName + ": " + output
							}
							sw.Log(log.CreatedAt, log.Level, output)
							lastLog = log
						}
					}
				}
			}()
			if err != nil {
				return err
			}

			for follow && agent.LifecycleState.Starting() {
				if agent, err = fetch(); err != nil {
					return xerrors.Errorf("fetch: %w", err)
				}
			}

			switch agent.LifecycleState {
			case codersdk.WorkspaceAgentLifecycleReady:
				sw.Complete(stage, safeDuration(sw, agent.ReadyAt, agent.StartedAt))
			case codersdk.WorkspaceAgentLifecycleStartTimeout:
				// Backwards compatibility: Avoid printing a warning if
				// coderd is old and doesn't set ReadyAt for timeouts.
				if agent.ReadyAt == nil {
					sw.Fail(stage, 0)
				} else {
					sw.Fail(stage, safeDuration(sw, agent.ReadyAt, agent.StartedAt))
				}
				sw.Log(time.Time{}, codersdk.LogLevelWarn, "Warning: A startup script timed out and your workspace may be incomplete.")
			case codersdk.WorkspaceAgentLifecycleStartError:
				sw.Fail(stage, safeDuration(sw, agent.ReadyAt, agent.StartedAt))
				// Use zero time (omitted) to separate these from the startup logs.
				sw.Log(time.Time{}, codersdk.LogLevelWarn, "Warning: A startup script exited with an error and your workspace may be incomplete.")
				sw.Log(time.Time{}, codersdk.LogLevelWarn, troubleshootingMessage(agent, fmt.Sprintf("%s/admin/templates/troubleshooting#startup-script-exited-with-an-error", opts.DocsURL)))
			default:
				switch {
				case agent.LifecycleState.Starting():
					// Use zero time (omitted) to separate these from the startup logs.
					sw.Log(time.Time{}, codersdk.LogLevelWarn, "Notice: The startup scripts are still running and your workspace may be incomplete.")
					sw.Log(time.Time{}, codersdk.LogLevelWarn, troubleshootingMessage(agent, fmt.Sprintf("%s/admin/templates/troubleshooting#your-workspace-may-be-incomplete", opts.DocsURL)))
					// Note: We don't complete or fail the stage here; it's
					// intentionally left open to indicate this stage didn't
					// complete.
				case agent.LifecycleState.ShuttingDown():
					// We no longer know if the startup script failed or not,
					// but we need to tell the user something.
					sw.Complete(stage, safeDuration(sw, agent.ReadyAt, agent.StartedAt))
					return errAgentShuttingDown
				}
			}

			return nil

		case codersdk.WorkspaceAgentDisconnected:
			// If the agent was still starting during disconnect, we'll
			// show startup logs.
			showStartupLogs = agent.LifecycleState.Starting()

			stage := "The workspace agent lost connection"
			sw.Start(stage)
			sw.Log(time.Now(), codersdk.LogLevelWarn, "Wait for it to reconnect or restart your workspace.")
			sw.Log(time.Now(), codersdk.LogLevelWarn, troubleshootingMessage(agent, fmt.Sprintf("%s/admin/templates/troubleshooting#agent-connection-issues", opts.DocsURL)))

			disconnectedAt := agent.DisconnectedAt
			for agent.Status == codersdk.WorkspaceAgentDisconnected {
				if agent, err = fetch(); err != nil {
					return xerrors.Errorf("fetch: %w", err)
				}
			}
			sw.Complete(stage, safeDuration(sw, agent.LastConnectedAt, disconnectedAt))
		}
	}
}

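// troubleshootingMessage points the user at the given troubleshooting docs
// URL, plus the agent's own TroubleshootingURL when one is set.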
func troubleshootingMessage(agent codersdk.WorkspaceAgent, url string) string {
	m := "For more information and troubleshooting, see " + url
	if agent.TroubleshootingURL != "" {
		m += " and " + agent.TroubleshootingURL
	}
	return m
}

// safeDuration returns a-b. If a or b is nil, it returns 0.
// This is because we often dereference a time pointer, which can
// cause a panic. These dereferences are used to calculate durations,
// which are not critical, and therefore should not break things
// when they fail.
// A panic has been observed in a test.
func safeDuration(sw *stageWriter, a, b *time.Time) time.Duration {
	if a == nil || b == nil {
		if sw != nil {
			// Ideally the message includes which fields are <nil>, but you can
			// use the surrounding log lines to figure that out. And passing more
			// params makes this unwieldy.
			sw.Log(time.Now(), codersdk.LogLevelWarn, "Warning: Failed to calculate duration from a time being <nil>.")
		}
		return 0
	}
	return a.Sub(*b)
}

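// closeFunc adapts a plain func() error into an io.Closer; it backs the no-op
// closer returned by the default FetchLogs stub.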
type closeFunc func() error

func (c closeFunc) Close() error {
	return c()
}

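// PeerDiagnostics writes a human-readable summary of tailnet.PeerDiagnostics
// to w: the preferred DERP region, whether node data has been exchanged with
// the Coder networking coordinator, and the age of the last WireGuard
// handshake.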
func PeerDiagnostics(w io.Writer, d tailnet.PeerDiagnostics) {
	if d.PreferredDERP > 0 {
		rn, ok := d.DERPRegionNames[d.PreferredDERP]
		if !ok {
			rn = "unknown"
		}
		_, _ = fmt.Fprintf(w, "✔ preferred DERP region: %d (%s)\n", d.PreferredDERP, rn)
	} else {
		_, _ = fmt.Fprint(w, "✘ not connected to DERP\n")
	}
	if d.SentNode {
		_, _ = fmt.Fprint(w, "✔ sent local data to Coder networking coordinator\n")
	} else {
		_, _ = fmt.Fprint(w, "✘ have not sent local data to Coder networking coordinator\n")
	}
	if d.ReceivedNode != nil {
		dp := d.ReceivedNode.DERP
		dn := ""
		// should be 127.3.3.40:N where N is the DERP region
		ap := strings.Split(dp, ":")
		if len(ap) == 2 {
			dp = ap[1]
			di, err := strconv.Atoi(dp)
			if err == nil {
				var ok bool
				dn, ok = d.DERPRegionNames[di]
				if ok {
					dn = fmt.Sprintf("(%s)", dn)
				} else {
					dn = "(unknown)"
				}
			}
		}
		_, _ = fmt.Fprintf(w,
			"✔ received remote agent data from Coder networking coordinator\n preferred DERP region: %s %s\n endpoints: %s\n",
			dp, dn, strings.Join(d.ReceivedNode.Endpoints, ", "))
	} else {
		_, _ = fmt.Fprint(w, "✘ have not received remote agent data from Coder networking coordinator\n")
	}
	if !d.LastWireguardHandshake.IsZero() {
		ago := time.Since(d.LastWireguardHandshake)
		symbol := "✔"
		// wireguard is supposed to refresh handshake on 5 minute intervals
		if ago > 5*time.Minute {
			symbol = "⚠"
		}
		_, _ = fmt.Fprintf(w, "%s Wireguard handshake %s ago\n", symbol, ago.Round(time.Second))
	} else {
		_, _ = fmt.Fprint(w, "✘ Wireguard is not connected\n")
	}
}

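// ConnDiags collects the client- and agent-side signals used to explain why a
// direct (peer-to-peer) connection may not be possible: deployment connection
// info, local and agent netcheck results, interface reports, and whether
// either side appears to be on a hard-NAT network such as AWS.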
type ConnDiags struct {
	ConnInfo           workspacesdk.AgentConnectionInfo
	PingP2P            bool
	DisableDirect      bool
	LocalNetInfo       *tailcfg.NetInfo
	LocalInterfaces    *healthsdk.InterfacesReport
	AgentNetcheck      *healthsdk.AgentNetcheckReport
	ClientIPIsAWS      bool
	AgentIPIsAWS       bool
	Verbose            bool
	TroubleshootingURL string
}

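// Write prints the diagnostics to w as a general section followed by possible
// client-side and agent-side causes of direct connection failure.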
func (d ConnDiags) Write(w io.Writer) {
	_, _ = fmt.Fprintln(w, "")
	general, client, agent := d.splitDiagnostics()
	for _, msg := range general {
		_, _ = fmt.Fprintln(w, msg)
	}
	if len(general) > 0 {
		_, _ = fmt.Fprintln(w, "")
	}
	if len(client) > 0 {
		_, _ = fmt.Fprint(w, "Possible client-side issues with direct connection:\n\n")
		for _, msg := range client {
			_, _ = fmt.Fprintf(w, " - %s\n\n", msg)
		}
	}
	if len(agent) > 0 {
		_, _ = fmt.Fprint(w, "Possible agent-side issues with direct connections:\n\n")
		for _, msg := range agent {
			_, _ = fmt.Fprintf(w, " - %s\n\n", msg)
		}
	}
}

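// splitDiagnostics sorts the collected signals into three message lists:
// general (affects the connection as a whole), client-side, and agent-side.
// If the ping was already peer-to-peer and Verbose is not set, the remaining
// checks are skipped because there is nothing to diagnose.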
func (d ConnDiags) splitDiagnostics() (general, client, agent []string) {
	if d.AgentNetcheck != nil {
		for _, msg := range d.AgentNetcheck.Interfaces.Warnings {
			agent = append(agent, msg.Message)
		}
		if len(d.AgentNetcheck.Interfaces.Warnings) > 0 {
			agent[len(agent)-1] += fmt.Sprintf("\n%s#low-mtu", d.TroubleshootingURL)
		}
	}

	if d.LocalInterfaces != nil {
		for _, msg := range d.LocalInterfaces.Warnings {
			client = append(client, msg.Message)
		}
		if len(d.LocalInterfaces.Warnings) > 0 {
			client[len(client)-1] += fmt.Sprintf("\n%s#low-mtu", d.TroubleshootingURL)
		}
	}

	if d.PingP2P && !d.Verbose {
		return general, client, agent
	}

	if d.DisableDirect {
		general = append(general, "❗ Direct connections are disabled locally, by `--disable-direct-connections` or `CODER_DISABLE_DIRECT_CONNECTIONS`.\n"+
			" They may still be established over a private network.")
		if !d.Verbose {
			return general, client, agent
		}
	}

	if d.ConnInfo.DisableDirectConnections {
		general = append(general,
			fmt.Sprintf("❗ Your Coder administrator has blocked direct connections\n %s#disabled-deployment-wide", d.TroubleshootingURL))
		if !d.Verbose {
			return general, client, agent
		}
	}

	if !d.ConnInfo.DERPMap.HasSTUN() {
		general = append(general,
			fmt.Sprintf("❗ The DERP map is not configured to use STUN\n %s#no-stun-servers", d.TroubleshootingURL))
	} else if d.LocalNetInfo != nil && !d.LocalNetInfo.UDP {
		client = append(client,
			fmt.Sprintf("Client could not connect to STUN over UDP\n %s#udp-blocked", d.TroubleshootingURL))
	}

	if d.LocalNetInfo != nil && d.LocalNetInfo.MappingVariesByDestIP.EqualBool(true) {
		client = append(client,
			fmt.Sprintf("Client is potentially behind a hard NAT, as multiple endpoints were retrieved from different STUN servers\n %s#endpoint-dependent-nat-hard-nat", d.TroubleshootingURL))
	}

	if d.AgentNetcheck != nil && d.AgentNetcheck.NetInfo != nil {
		if d.AgentNetcheck.NetInfo.MappingVariesByDestIP.EqualBool(true) {
			agent = append(agent,
				fmt.Sprintf("Agent is potentially behind a hard NAT, as multiple endpoints were retrieved from different STUN servers\n %s#endpoint-dependent-nat-hard-nat", d.TroubleshootingURL))
		}
		if !d.AgentNetcheck.NetInfo.UDP {
			agent = append(agent,
				fmt.Sprintf("Agent could not connect to STUN over UDP\n %s#udp-blocked", d.TroubleshootingURL))
		}
	}

	if d.ClientIPIsAWS {
		client = append(client,
			fmt.Sprintf("Client IP address is within an AWS range (AWS uses hard NAT)\n %s#endpoint-dependent-nat-hard-nat", d.TroubleshootingURL))
	}

	if d.AgentIPIsAWS {
		agent = append(agent,
			fmt.Sprintf("Agent IP address is within an AWS range (AWS uses hard NAT)\n %s#endpoint-dependent-nat-hard-nat", d.TroubleshootingURL))
	}

	return general, client, agent
}