feat: add SIGQUIT/SIGTRAP handler for the CLI (#5665)

This commit is contained in:
Dean Sheather
2023-01-11 10:22:20 -06:00
committed by GitHub
parent 69241d06e7
commit e72a2ad907
3 changed files with 102 additions and 5 deletions

View File

@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command {
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
go dumpHandler(ctx)
rawURL, err := cmd.Flags().GetString(varAgentURL)
if err != nil {
return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err)

View File

@ -8,8 +8,11 @@ import (
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"text/template"
"time"
@ -631,3 +634,93 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) {
}
return h.transport.RoundTrip(req)
}
// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the
// stacktrace of all goroutines to stderr and a well-known file in the home
// directory. This is useful for debugging deadlock issues that may occur in
// production in workspaces, since the default Go runtime will only dump to
// stderr (which is often difficult/impossible to read in a workspace).
//
// SIGQUITs will still cause the program to exit (similarly to the default Go
// runtime behavior).
//
// A SIGQUIT handler will not be registered if GOTRACEBACK=crash.
//
// On Windows this immediately returns.
func dumpHandler(ctx context.Context) {
if runtime.GOOS == "windows" {
// free up the goroutine since it'll be permanently blocked anyways
return
}
listenSignals := []os.Signal{syscall.SIGTRAP}
if os.Getenv("GOTRACEBACK") != "crash" {
listenSignals = append(listenSignals, syscall.SIGQUIT)
}
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, listenSignals...)
defer signal.Stop(sigs)
for {
sigStr := ""
select {
case <-ctx.Done():
return
case sig := <-sigs:
switch sig {
case syscall.SIGQUIT:
sigStr = "SIGQUIT"
case syscall.SIGTRAP:
sigStr = "SIGTRAP"
}
}
// Start with a 1MB buffer and keep doubling it until we can fit the
// entire stacktrace, stopping early once we reach 64MB.
buf := make([]byte, 1_000_000)
stacklen := 0
for {
stacklen = runtime.Stack(buf, true)
if stacklen < len(buf) {
break
}
if 2*len(buf) > 64_000_000 {
// Write a message to the end of the buffer saying that it was
// truncated.
const truncatedMsg = "\n\n\nstack trace truncated due to size\n"
copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg)
break
}
buf = make([]byte, 2*len(buf))
}
_, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, buf[:stacklen])
// Write to a well-known file.
dir, err := os.UserHomeDir()
if err != nil {
dir = os.TempDir()
}
fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z")))
_, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath)
f, err := os.Create(fpath)
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error())
goto done
}
_, err = f.Write(buf[:stacklen])
_ = f.Close()
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "failed to write dump file: %v\n", err.Error())
goto done
}
done:
if sigStr == "SIGQUIT" {
//nolint:revive
os.Exit(1)
}
}
}

View File

@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
Use: "server",
Short: "Start a Coder server",
RunE: func(cmd *cobra.Command, args []string) error {
// Main command context for managing cancellation of running
// services.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
go dumpHandler(ctx)
cfg, err := deployment.Config(cmd.Flags(), vip)
if err != nil {
return xerrors.Errorf("getting deployment config: %w", err)
@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
logger = logger.AppendSinks(tracing.SlogSink{})
}
// Main command context for managing cancellation
// of running services.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()
// Register signals early on so that graceful shutdown can't
// be interrupted by additional signals. Note that we avoid
// shadowing cancel() (from above) here because notifyStop()