mirror of
https://github.com/coder/coder.git
synced 2025-07-12 00:14:10 +00:00
feat: add SIGQUIT/SIGTRAP handler for the CLI (#5665)
This commit is contained in:
@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command {
|
|||||||
ctx, cancel := context.WithCancel(cmd.Context())
|
ctx, cancel := context.WithCancel(cmd.Context())
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
|
go dumpHandler(ctx)
|
||||||
|
|
||||||
rawURL, err := cmd.Flags().GetString(varAgentURL)
|
rawURL, err := cmd.Flags().GetString(varAgentURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err)
|
return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err)
|
||||||
|
93
cli/root.go
93
cli/root.go
@ -8,8 +8,11 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
"strings"
|
"strings"
|
||||||
|
"syscall"
|
||||||
"text/template"
|
"text/template"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -631,3 +634,93 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) {
|
|||||||
}
|
}
|
||||||
return h.transport.RoundTrip(req)
|
return h.transport.RoundTrip(req)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the
// stacktrace of all goroutines to stderr and a well-known file in the home
// directory. This is useful for debugging deadlock issues that may occur in
// production in workspaces, since the default Go runtime will only dump to
// stderr (which is often difficult/impossible to read in a workspace).
//
// The dump file is named "coder-agent-<timestamp>.dump", where the timestamp
// is the current time in UTC. It is created in the user's home directory, or
// in the OS temp directory when the home directory cannot be determined.
//
// SIGQUITs will still cause the program to exit (similarly to the default Go
// runtime behavior). SIGTRAPs only produce a dump and the handler keeps
// listening.
//
// A SIGQUIT handler will not be registered if GOTRACEBACK=crash.
//
// On Windows this immediately returns. On other platforms it blocks until ctx
// is done, so it is meant to be run in its own goroutine.
func dumpHandler(ctx context.Context) {
	if runtime.GOOS == "windows" {
		// free up the goroutine since it'll be permanently blocked anyways
		return
	}

	listenSignals := []os.Signal{syscall.SIGTRAP}
	if os.Getenv("GOTRACEBACK") != "crash" {
		listenSignals = append(listenSignals, syscall.SIGQUIT)
	}

	// Buffer of one so a signal arriving while a dump is in progress is not
	// dropped by the non-blocking send in the signal package.
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, listenSignals...)
	defer signal.Stop(sigs)

	for {
		var sig os.Signal
		select {
		case <-ctx.Done():
			return
		case sig = <-sigs:
		}

		sigStr := ""
		switch sig {
		case syscall.SIGQUIT:
			sigStr = "SIGQUIT"
		case syscall.SIGTRAP:
			sigStr = "SIGTRAP"
		}

		stack := captureGoroutineStacks()
		_, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, stack)

		// Write to a well-known file.
		dir, err := os.UserHomeDir()
		if err != nil {
			dir = os.TempDir()
		}
		// The layout ends in a literal "Z", so the time must be converted to
		// UTC first; otherwise the file name would claim UTC while carrying
		// the local wall-clock time.
		stamp := time.Now().UTC().Format("2006-01-02T15:04:05.000Z")
		fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", stamp))
		_, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath)

		// Failing to persist the dump is reported but never fatal: the trace
		// has already been written to stderr above.
		if err := writeStackDump(fpath, stack); err != nil {
			_, _ = fmt.Fprintf(os.Stderr, "%v\n", err)
		}

		if sig == syscall.SIGQUIT {
			//nolint:revive
			os.Exit(1)
		}
	}
}

// captureGoroutineStacks returns the stack traces of all goroutines. It starts
// with a 1MB buffer and keeps doubling it until the entire trace fits, stopping
// once the buffer has reached 64MB. If the trace still does not fit at that
// point, the tail of the buffer is overwritten with a truncation notice.
func captureGoroutineStacks() []byte {
	const (
		initialSize  = 1_000_000
		maxSize      = 64_000_000
		truncatedMsg = "\n\n\nstack trace truncated due to size\n"
	)

	buf := make([]byte, initialSize)
	for {
		n := runtime.Stack(buf, true)
		if n < len(buf) {
			return buf[:n]
		}
		if 2*len(buf) > maxSize {
			// Write a message to the end of the buffer saying that it was
			// truncated.
			copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg)
			return buf
		}
		buf = make([]byte, 2*len(buf))
	}
}

// writeStackDump creates (or truncates) the file at fpath and writes stack to
// it. The returned error identifies which step failed: open, write or close.
func writeStackDump(fpath string, stack []byte) error {
	f, err := os.Create(fpath)
	if err != nil {
		return fmt.Errorf("failed to open dump file: %w", err)
	}

	_, err = f.Write(stack)
	// Always close, even after a failed write, so the descriptor is released.
	closeErr := f.Close()
	if err != nil {
		return fmt.Errorf("failed to write dump file: %w", err)
	}
	if closeErr != nil {
		// A close error can mean buffered data never reached the disk.
		return fmt.Errorf("failed to close dump file: %w", closeErr)
	}
	return nil
}
|
||||||
|
@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
|
|||||||
Use: "server",
|
Use: "server",
|
||||||
Short: "Start a Coder server",
|
Short: "Start a Coder server",
|
||||||
RunE: func(cmd *cobra.Command, args []string) error {
|
RunE: func(cmd *cobra.Command, args []string) error {
|
||||||
|
// Main command context for managing cancellation of running
|
||||||
|
// services.
|
||||||
|
ctx, cancel := context.WithCancel(cmd.Context())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go dumpHandler(ctx)
|
||||||
|
|
||||||
cfg, err := deployment.Config(cmd.Flags(), vip)
|
cfg, err := deployment.Config(cmd.Flags(), vip)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return xerrors.Errorf("getting deployment config: %w", err)
|
return xerrors.Errorf("getting deployment config: %w", err)
|
||||||
@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
|
|||||||
logger = logger.AppendSinks(tracing.SlogSink{})
|
logger = logger.AppendSinks(tracing.SlogSink{})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Main command context for managing cancellation
|
|
||||||
// of running services.
|
|
||||||
ctx, cancel := context.WithCancel(cmd.Context())
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
// Register signals early on so that graceful shutdown can't
|
// Register signals early on so that graceful shutdown can't
|
||||||
// be interrupted by additional signals. Note that we avoid
|
// be interrupted by additional signals. Note that we avoid
|
||||||
// shadowing cancel() (from above) here because notifyStop()
|
// shadowing cancel() (from above) here because notifyStop()
|
||||||
|
Reference in New Issue
Block a user