mirror of
https://github.com/coder/coder.git
synced 2025-07-15 22:20:27 +00:00
chore: log provider stack traces on text file busy (#15078)
re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by https://github.com/coder/terraform-provider-coder/pull/295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis <spike@coder.com>
This commit is contained in:
@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
|
|||||||
return version.NewVersion(vj.Version)
|
return version.NewVersion(vj.Version)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type textFileBusyError struct {
|
||||||
|
exitErr *exec.ExitError
|
||||||
|
stderr string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *textFileBusyError) Error() string {
|
||||||
|
return "text file busy: " + e.exitErr.String()
|
||||||
|
}
|
||||||
|
|
||||||
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
|
func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
|
||||||
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
|
ctx, span := e.server.startTrace(ctx, tracing.FuncName())
|
||||||
defer span.End()
|
defer span.End()
|
||||||
@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
|
|||||||
<-doneErr
|
<-doneErr
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// As a special case, we want to look for the error "text file busy" in the stderr output of
|
||||||
|
// the init command, so we also take a copy of the stderr into an in memory buffer.
|
||||||
|
errBuf := newBufferedWriteCloser(errWriter)
|
||||||
|
|
||||||
args := []string{
|
args := []string{
|
||||||
"init",
|
"init",
|
||||||
"-no-color",
|
"-no-color",
|
||||||
"-input=false",
|
"-input=false",
|
||||||
}
|
}
|
||||||
|
|
||||||
return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
|
err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
|
||||||
|
var exitErr *exec.ExitError
|
||||||
|
if xerrors.As(err, &exitErr) {
|
||||||
|
if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
|
||||||
|
return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func getPlanFilePath(workdir string) string {
|
func getPlanFilePath(workdir string) string {
|
||||||
@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
|
|||||||
defer sw.mut.Unlock()
|
defer sw.mut.Unlock()
|
||||||
return sw.w.Write(p)
|
return sw.w.Write(p)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type bufferedWriteCloser struct {
|
||||||
|
wc io.WriteCloser
|
||||||
|
b bytes.Buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
|
||||||
|
return &bufferedWriteCloser{
|
||||||
|
wc: wc,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *bufferedWriteCloser) Write(p []byte) (int, error) {
|
||||||
|
n, err := b.b.Write(p)
|
||||||
|
if err != nil {
|
||||||
|
return n, err
|
||||||
|
}
|
||||||
|
return b.wc.Write(p)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *bufferedWriteCloser) Close() error {
|
||||||
|
return b.wc.Close()
|
||||||
|
}
|
||||||
|
@ -4,7 +4,11 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -109,10 +113,32 @@ func (s *server) Plan(
|
|||||||
initTimings.ingest(createInitTimingsEvent(timingInitStart))
|
initTimings.ingest(createInitTimingsEvent(timingInitStart))
|
||||||
|
|
||||||
err = e.init(ctx, killCtx, sess)
|
err = e.init(ctx, killCtx, sess)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
|
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
|
||||||
|
|
||||||
s.logger.Debug(ctx, "init failed", slog.Error(err))
|
s.logger.Debug(ctx, "init failed", slog.Error(err))
|
||||||
|
|
||||||
|
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
|
||||||
|
// We believe this might be due to some race condition that prevents the
|
||||||
|
// terraform-provider-coder process from exiting. When terraform tries to install the
|
||||||
|
// provider during this init, it copies over the local cache. Normally this isn't an issue,
|
||||||
|
// but if the terraform-provider-coder process is still running from a previous build, Linux
|
||||||
|
// returns "text file busy" error when attempting to open the file.
|
||||||
|
//
|
||||||
|
// Capturing the stack trace from the process should help us figure out why it has not
|
||||||
|
// exited. We'll drop these diagnostics in a CRITICAL log so that operators are likely to
|
||||||
|
// notice, and also because it indicates this provisioner could be permanently broken and
|
||||||
|
// require a restart.
|
||||||
|
var errTFB *textFileBusyError
|
||||||
|
if xerrors.As(err, &errTFB) {
|
||||||
|
stacktrace := tryGettingCoderProviderStacktrace(sess)
|
||||||
|
s.logger.Critical(ctx, "init: text file busy",
|
||||||
|
slog.Error(errTFB),
|
||||||
|
slog.F("stderr", errTFB.stderr),
|
||||||
|
slog.F("provider_coder_stacktrace", stacktrace),
|
||||||
|
)
|
||||||
|
}
|
||||||
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
|
return provisionersdk.PlanErrorf("initialize terraform: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
|
||||||
|
// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
|
||||||
|
// shipped in v1.0.4. It will return the stacktraces of the provider, which will hopefully allow us
|
||||||
|
// to figure out why it hasn't exited.
|
||||||
|
func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
|
||||||
|
path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
|
||||||
|
sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
|
||||||
|
c := http.Client{
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
|
||||||
|
d := net.Dialer{}
|
||||||
|
return d.DialContext(ctx, "unix", path)
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet,
|
||||||
|
"http://localhost/debug/pprof/goroutine?debug=2", nil)
|
||||||
|
if err != nil {
|
||||||
|
sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err))
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resp, err := c.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
|
||||||
|
// in v1.0.4
|
||||||
|
sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
stacktraces, err := io.ReadAll(resp.Body)
|
||||||
|
if err != nil {
|
||||||
|
sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
|
||||||
|
}
|
||||||
|
return string(stacktraces)
|
||||||
|
}
|
||||||
|
@ -9,6 +9,8 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
@ -31,6 +33,8 @@ import (
|
|||||||
type provisionerServeOptions struct {
|
type provisionerServeOptions struct {
|
||||||
binaryPath string
|
binaryPath string
|
||||||
exitTimeout time.Duration
|
exitTimeout time.Duration
|
||||||
|
workDir string
|
||||||
|
logger *slog.Logger
|
||||||
}
|
}
|
||||||
|
|
||||||
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
|
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
|
||||||
@ -38,7 +42,13 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
|
|||||||
opts = &provisionerServeOptions{}
|
opts = &provisionerServeOptions{}
|
||||||
}
|
}
|
||||||
cachePath := t.TempDir()
|
cachePath := t.TempDir()
|
||||||
workDir := t.TempDir()
|
if opts.workDir == "" {
|
||||||
|
opts.workDir = t.TempDir()
|
||||||
|
}
|
||||||
|
if opts.logger == nil {
|
||||||
|
logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
|
||||||
|
opts.logger = &logger
|
||||||
|
}
|
||||||
client, server := drpc.MemTransportPipe()
|
client, server := drpc.MemTransportPipe()
|
||||||
ctx, cancelFunc := context.WithCancel(context.Background())
|
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||||
serverErr := make(chan error, 1)
|
serverErr := make(chan error, 1)
|
||||||
@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
|
|||||||
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
|
serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
|
||||||
ServeOptions: &provisionersdk.ServeOptions{
|
ServeOptions: &provisionersdk.ServeOptions{
|
||||||
Listener: server,
|
Listener: server,
|
||||||
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
|
Logger: *opts.logger,
|
||||||
WorkDirectory: workDir,
|
WorkDirectory: opts.workDir,
|
||||||
},
|
},
|
||||||
BinaryPath: opts.binaryPath,
|
BinaryPath: opts.binaryPath,
|
||||||
CachePath: cachePath,
|
CachePath: cachePath,
|
||||||
@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
|
|||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
binPath := filepath.Join(dir, "terraform")
|
binPath := filepath.Join(dir, "terraform")
|
||||||
|
|
||||||
// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
|
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
|
||||||
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
|
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
|
||||||
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
|
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
|
||||||
|
// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
|
||||||
|
// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
|
||||||
|
// terraform-provider-coder
|
||||||
|
// nolint: paralleltest
|
||||||
|
func TestProvision_TextFileBusy(t *testing.T) {
|
||||||
|
if runtime.GOOS == "windows" {
|
||||||
|
t.Skip("This test uses unix sockets and is not supported on Windows")
|
||||||
|
}
|
||||||
|
|
||||||
|
cwd, err := os.Getwd()
|
||||||
|
require.NoError(t, err)
|
||||||
|
fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")
|
||||||
|
|
||||||
|
dir := t.TempDir()
|
||||||
|
binPath := filepath.Join(dir, "terraform")
|
||||||
|
|
||||||
|
// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
|
||||||
|
content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
|
||||||
|
err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
workDir := t.TempDir()
|
||||||
|
|
||||||
|
err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
|
||||||
|
require.NoError(t, err)
|
||||||
|
l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer l.Close()
|
||||||
|
handlerCalled := 0
|
||||||
|
// nolint: gosec
|
||||||
|
srv := &http.Server{
|
||||||
|
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
_, err := w.Write([]byte("thestacks\n"))
|
||||||
|
assert.NoError(t, err)
|
||||||
|
handlerCalled++
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
srvErr := make(chan error, 1)
|
||||||
|
go func() {
|
||||||
|
srvErr <- srv.Serve(l)
|
||||||
|
}()
|
||||||
|
|
||||||
|
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
|
||||||
|
ctx, api := setupProvisioner(t, &provisionerServeOptions{
|
||||||
|
binaryPath: binPath,
|
||||||
|
exitTimeout: time.Second,
|
||||||
|
workDir: workDir,
|
||||||
|
logger: &logger,
|
||||||
|
})
|
||||||
|
|
||||||
|
sess := configure(ctx, t, api, &proto.Config{
|
||||||
|
TemplateSourceArchive: makeTar(t, nil),
|
||||||
|
})
|
||||||
|
|
||||||
|
err = sendPlan(sess, proto.WorkspaceTransition_START)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
found := false
|
||||||
|
for {
|
||||||
|
msg, err := sess.Recv()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
if c := msg.GetPlan(); c != nil {
|
||||||
|
require.Contains(t, c.Error, "exit status 1")
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
require.True(t, found)
|
||||||
|
require.EqualValues(t, 1, handlerCalled)
|
||||||
|
}
|
||||||
|
|
||||||
func TestProvision(t *testing.T) {
|
func TestProvision(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
40
provisioner/terraform/testdata/fake_text_file_busy.sh
vendored
Executable file
40
provisioner/terraform/testdata/fake_text_file_busy.sh
vendored
Executable file
@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
VERSION=$1
|
||||||
|
shift 1
|
||||||
|
|
||||||
|
json_print() {
|
||||||
|
echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
|
||||||
|
}
|
||||||
|
|
||||||
|
case "$1" in
|
||||||
|
version)
|
||||||
|
cat <<-EOF
|
||||||
|
{
|
||||||
|
"terraform_version": "${VERSION}",
|
||||||
|
"platform": "linux_amd64",
|
||||||
|
"provider_selections": {},
|
||||||
|
"terraform_outdated": false
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
exit 0
|
||||||
|
;;
|
||||||
|
init)
|
||||||
|
echo "init"
|
||||||
|
echo >&2 "Error: Failed to install provider"
|
||||||
|
echo >&2 " Error while installing coder/coder v1.0.4: open"
|
||||||
|
echo >&2 " /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
|
||||||
|
echo >&2 " text file busy"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
plan)
|
||||||
|
echo "plan not supported"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
apply)
|
||||||
|
echo "apply not supported"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
exit 10
|
Reference in New Issue
Block a user