chore: cache terraform providers between CI test runs (#17373)

Addresses https://github.com/coder/internal/issues/322.

This PR starts caching Terraform providers used by `TestProvision` in
`provisioner/terraform/provision_test.go`. The goal is to improve the
reliability of this test by cutting down on the number of network calls
to external services. It leverages GitHub Actions cache, which [on depot
runners is persisted for 14 days by
default](https://depot.dev/docs/github-actions/overview#cache-retention-policy).

Other than the aforementioned `TestProvision`, I couldn't find any other
tests which depend on external terraform providers.
This commit is contained in:
Hugo Dutka
2025-04-28 10:57:24 +02:00
committed by GitHub
parent 08ad910171
commit b47d54d777
7 changed files with 393 additions and 34 deletions

View File

@ -0,0 +1,50 @@
name: "Download Test Cache"
description: |
Downloads the test cache and outputs today's cache key.
A PR job can use a cache if it was created by its base branch, its current
branch, or the default branch.
https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache
outputs:
cache-key:
description: "Today's cache key"
value: ${{ steps.vars.outputs.cache-key }}
inputs:
key-prefix:
description: "Prefix for the cache key"
required: true
cache-path:
description: "Path to the cache directory"
required: true
# This path is defined in testutil/cache.go
default: "~/.cache/coderv2-test"
runs:
using: "composite"
steps:
- name: Get date values and cache key
  id: vars
  shell: bash
  env:
    # Pass the input through the environment instead of interpolating it
    # into the script text, so its value is never parsed as shell code.
    KEY_PREFIX: ${{ inputs.key-prefix }}
  run: |
    # Read the clock exactly once so that every derived value agrees, even
    # if this step happens to run across midnight.
    TODAY=$(date +'%Y-%m-%d')
    YEAR=${TODAY%%-*}
    YEAR_MONTH=${TODAY%-*}
    MONTH=${YEAR_MONTH#*-}
    DAY=${TODAY##*-}

    # Compute the previous month arithmetically instead of using
    # `date -d 'last month'`: that form relies on GNU date, and on the last
    # days of a long month it can resolve to a day in the *current* month
    # (e.g. "May 31 minus one month" -> "April 31" -> May 1).
    # The 10# prefix forces base 10 so "08" and "09" are not read as octal.
    if (( 10#$MONTH == 1 )); then
      PREV_YEAR_MONTH=$(printf '%04d-12' "$(( 10#$YEAR - 1 ))")
    else
      PREV_YEAR_MONTH=$(printf '%04d-%02d' "$(( 10#$YEAR ))" "$(( 10#$MONTH - 1 ))")
    fi

    # Step outputs consumed by the restore step and by the action's
    # `cache-key` output.
    echo "year-month=$YEAR_MONTH" >> "$GITHUB_OUTPUT"
    echo "prev-year-month=$PREV_YEAR_MONTH" >> "$GITHUB_OUTPUT"
    echo "cache-key=${KEY_PREFIX}-${YEAR_MONTH}-${DAY}" >> "$GITHUB_OUTPUT"
# TODO: As a cost optimization, we could remove caches that are older than
# a day or two. By default, depot keeps caches for 14 days, which isn't
# necessary for the test cache.
# https://depot.dev/docs/github-actions/overview#cache-retention-policy
- name: Download test cache
uses: actions/cache/restore@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
with:
path: ${{ inputs.cache-path }}
key: ${{ steps.vars.outputs.cache-key }}
# > If there are multiple partial matches for a restore key, the action returns the most recently created cache.
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
# The second restore key allows non-main branches to use the cache from the previous month.
# This prevents PRs from rebuilding the cache on the first day of the month.
# It also makes sure that once a month, the cache is fully reset.
restore-keys: |
${{ inputs.key-prefix }}-${{ steps.vars.outputs.year-month }}-
${{ github.ref != 'refs/heads/main' && format('{0}-{1}-', inputs.key-prefix, steps.vars.outputs.prev-year-month) || '' }}

View File

@ -0,0 +1,20 @@
name: "Upload Test Cache"
description: Uploads the test cache. Only works on the main branch.
inputs:
cache-key:
description: "Cache key"
required: true
cache-path:
description: "Path to the cache directory"
required: true
# This path is defined in testutil/cache.go
default: "~/.cache/coderv2-test"
runs:
using: "composite"
steps:
- name: Upload test cache
if: ${{ github.ref == 'refs/heads/main' }}
uses: actions/cache/save@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
with:
path: ${{ inputs.cache-path }}
key: ${{ inputs.cache-key }}

View File

@ -341,6 +341,12 @@ jobs:
- name: Setup Terraform - name: Setup Terraform
uses: ./.github/actions/setup-tf uses: ./.github/actions/setup-tf
- name: Download Test Cache
id: download-cache
uses: ./.github/actions/test-cache/download
with:
key-prefix: test-go-${{ runner.os }}-${{ runner.arch }}
- name: Test with Mock Database - name: Test with Mock Database
id: test id: test
shell: bash shell: bash
@ -365,6 +371,11 @@ jobs:
gotestsum --junitfile="gotests.xml" --jsonfile="gotests.json" \ gotestsum --junitfile="gotests.xml" --jsonfile="gotests.json" \
--packages="./..." -- $PARALLEL_FLAG -short -failfast --packages="./..." -- $PARALLEL_FLAG -short -failfast
- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
with:
cache-key: ${{ steps.download-cache.outputs.cache-key }}
- name: Upload test stats to Datadog - name: Upload test stats to Datadog
timeout-minutes: 1 timeout-minutes: 1
continue-on-error: true continue-on-error: true
@ -462,6 +473,12 @@ jobs:
if: runner.os == 'Windows' if: runner.os == 'Windows'
uses: ./.github/actions/setup-imdisk uses: ./.github/actions/setup-imdisk
- name: Download Test Cache
id: download-cache
uses: ./.github/actions/test-cache/download
with:
key-prefix: test-go-pg-${{ runner.os }}-${{ runner.arch }}
- name: Test with PostgreSQL Database - name: Test with PostgreSQL Database
env: env:
POSTGRES_VERSION: "13" POSTGRES_VERSION: "13"
@ -476,6 +493,11 @@ jobs:
make test-postgres make test-postgres
- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
with:
cache-key: ${{ steps.download-cache.outputs.cache-key }}
- name: Upload test stats to Datadog - name: Upload test stats to Datadog
timeout-minutes: 1 timeout-minutes: 1
continue-on-error: true continue-on-error: true
@ -514,6 +536,12 @@ jobs:
- name: Setup Terraform - name: Setup Terraform
uses: ./.github/actions/setup-tf uses: ./.github/actions/setup-tf
- name: Download Test Cache
id: download-cache
uses: ./.github/actions/test-cache/download
with:
key-prefix: test-go-pg-16-${{ runner.os }}-${{ runner.arch }}
- name: Test with PostgreSQL Database - name: Test with PostgreSQL Database
env: env:
POSTGRES_VERSION: "16" POSTGRES_VERSION: "16"
@ -521,6 +549,11 @@ jobs:
run: | run: |
make test-postgres make test-postgres
- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
with:
cache-key: ${{ steps.download-cache.outputs.cache-key }}
- name: Upload test stats to Datadog - name: Upload test stats to Datadog
timeout-minutes: 1 timeout-minutes: 1
continue-on-error: true continue-on-error: true
@ -551,6 +584,12 @@ jobs:
- name: Setup Terraform - name: Setup Terraform
uses: ./.github/actions/setup-tf uses: ./.github/actions/setup-tf
- name: Download Test Cache
id: download-cache
uses: ./.github/actions/test-cache/download
with:
key-prefix: test-go-race-${{ runner.os }}-${{ runner.arch }}
# We run race tests with reduced parallelism because they use more CPU and we were finding # We run race tests with reduced parallelism because they use more CPU and we were finding
# instances where tests appear to hang for multiple seconds, resulting in flaky tests when # instances where tests appear to hang for multiple seconds, resulting in flaky tests when
# short timeouts are used. # short timeouts are used.
@ -559,6 +598,11 @@ jobs:
run: | run: |
gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./...
- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
with:
cache-key: ${{ steps.download-cache.outputs.cache-key }}
- name: Upload test stats to Datadog - name: Upload test stats to Datadog
timeout-minutes: 1 timeout-minutes: 1
continue-on-error: true continue-on-error: true
@ -589,6 +633,12 @@ jobs:
- name: Setup Terraform - name: Setup Terraform
uses: ./.github/actions/setup-tf uses: ./.github/actions/setup-tf
- name: Download Test Cache
id: download-cache
uses: ./.github/actions/test-cache/download
with:
key-prefix: test-go-race-pg-${{ runner.os }}-${{ runner.arch }}
# We run race tests with reduced parallelism because they use more CPU and we were finding # We run race tests with reduced parallelism because they use more CPU and we were finding
# instances where tests appear to hang for multiple seconds, resulting in flaky tests when # instances where tests appear to hang for multiple seconds, resulting in flaky tests when
# short timeouts are used. # short timeouts are used.
@ -600,6 +650,11 @@ jobs:
make test-postgres-docker make test-postgres-docker
DB=ci gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./... DB=ci gotestsum --junitfile="gotests.xml" -- -race -parallel 4 -p 4 ./...
- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
with:
cache-key: ${{ steps.download-cache.outputs.cache-key }}
- name: Upload test stats to Datadog - name: Upload test stats to Datadog
timeout-minutes: 1 timeout-minutes: 1
continue-on-error: true continue-on-error: true

View File

@ -35,8 +35,9 @@ type executor struct {
mut *sync.Mutex mut *sync.Mutex
binaryPath string binaryPath string
// cachePath and workdir must not be used by multiple processes at once. // cachePath and workdir must not be used by multiple processes at once.
cachePath string cachePath string
workdir string cliConfigPath string
workdir string
// used to capture execution times at various stages // used to capture execution times at various stages
timings *timingAggregator timings *timingAggregator
} }
@ -50,6 +51,9 @@ func (e *executor) basicEnv() []string {
if e.cachePath != "" && runtime.GOOS == "linux" { if e.cachePath != "" && runtime.GOOS == "linux" {
env = append(env, "TF_PLUGIN_CACHE_DIR="+e.cachePath) env = append(env, "TF_PLUGIN_CACHE_DIR="+e.cachePath)
} }
if e.cliConfigPath != "" {
env = append(env, "TF_CLI_CONFIG_FILE="+e.cliConfigPath)
}
return env return env
} }

View File

@ -3,13 +3,17 @@
package terraform_test package terraform_test
import ( import (
"bytes"
"context" "context"
"crypto/sha256"
"encoding/hex"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"net" "net"
"net/http" "net/http"
"os" "os"
"os/exec"
"path/filepath" "path/filepath"
"sort" "sort"
"strings" "strings"
@ -29,10 +33,11 @@ import (
) )
type provisionerServeOptions struct { type provisionerServeOptions struct {
binaryPath string binaryPath string
exitTimeout time.Duration cliConfigPath string
workDir string exitTimeout time.Duration
logger *slog.Logger workDir string
logger *slog.Logger
} }
func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) { func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
@ -66,9 +71,10 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
Logger: *opts.logger, Logger: *opts.logger,
WorkDirectory: opts.workDir, WorkDirectory: opts.workDir,
}, },
BinaryPath: opts.binaryPath, BinaryPath: opts.binaryPath,
CachePath: cachePath, CachePath: cachePath,
ExitTimeout: opts.exitTimeout, ExitTimeout: opts.exitTimeout,
CliConfigPath: opts.cliConfigPath,
}) })
}() }()
api := proto.NewDRPCProvisionerClient(client) api := proto.NewDRPCProvisionerClient(client)
@ -85,6 +91,168 @@ func configure(ctx context.Context, t *testing.T, client proto.DRPCProvisionerCl
return sess return sess
} }
// hashTemplateFilesAndTestName returns a hex-encoded SHA-256 digest that
// identifies a test case by its name together with the names and contents of
// its template files.
//
// The digest does not depend on map iteration order: file names are sorted
// before hashing. Every field (file name, file content, test name) is written
// as "<byte length>:<bytes>", so field boundaries are unambiguous for any
// input. A plain delimiter between fields cannot guarantee that, because the
// delimiter itself may occur inside a file name or file content.
func hashTemplateFilesAndTestName(t *testing.T, testName string, templateFiles map[string]string) string {
	t.Helper()

	sortedFileNames := make([]string, 0, len(templateFiles))
	for fileName := range templateFiles {
		sortedFileNames = append(sortedFileNames, fileName)
	}
	sort.Strings(sortedFileNames)

	hasher := sha256.New()
	// writeField frames a single field with its length. The hash.Hash
	// contract states that Write never returns an error, so the results are
	// deliberately discarded.
	writeField := func(field string) {
		_, _ = fmt.Fprintf(hasher, "%d:", len(field))
		_, _ = hasher.Write([]byte(field))
	}

	for _, fileName := range sortedFileNames {
		writeField(fileName)
		writeField(templateFiles[fileName])
	}
	// The test name is always the final field; because every file contributes
	// exactly two fields, it cannot be confused with a file entry.
	writeField(testName)

	return hex.EncodeToString(hasher.Sum(nil))
}
// Names of the entries created inside a test's provider cache directory.
const (
// terraformConfigFileName is the file name of the Terraform CLI config
// written by writeCliConfig.
terraformConfigFileName = "terraform.rc"
// cacheProvidersDirName is the subdirectory that holds the mirrored
// providers; writeCliConfig points the filesystem mirror at it.
cacheProvidersDirName = "providers"
// cacheTemplateFilesDirName is the subdirectory into which downloadProviders
// writes the template files before running terraform; it is removed again
// when downloadProviders returns.
cacheTemplateFilesDirName = "files"
)
// writeCliConfig generates a Terraform CLI configuration file named
// `terraform.rc` inside `dir` and returns the path of that file.
//
// The generated config declares a filesystem mirror located in the
// `providers` subdirectory of `dir` that is included for every provider
// ("*/*"), and excludes every provider from the `direct` installation method,
// so that provider installation is served from the local mirror only.
func writeCliConfig(t *testing.T, dir string) string {
	t.Helper()

	const configTemplate = `
provider_installation {
filesystem_mirror {
path = "%s"
include = ["*/*"]
}
direct {
exclude = ["*/*"]
}
}
`

	configPath := filepath.Join(dir, terraformConfigFileName)
	// Make sure the directory that will hold the config file exists.
	require.NoError(t, os.MkdirAll(filepath.Dir(configPath), 0o700))

	mirrorPath := filepath.Join(dir, cacheProvidersDirName)
	rendered := fmt.Sprintf(configTemplate, mirrorPath)
	require.NoError(t, os.WriteFile(configPath, []byte(rendered), 0o600))

	return configPath
}
// runCmd executes the command given by args (args[0] is the program, the
// remaining elements are its arguments) with dir as the working directory.
// Standard output and standard error are captured; if the command cannot be
// run or exits unsuccessfully, the test is failed immediately with a message
// that includes both captured streams.
func runCmd(t *testing.T, dir string, args ...string) {
	t.Helper()

	var outBuf, errBuf bytes.Buffer

	command := exec.Command(args[0], args[1:]...) //#nosec
	command.Dir = dir
	command.Stdout = &outBuf
	command.Stderr = &errBuf

	runErr := command.Run()
	if runErr == nil {
		return
	}
	t.Fatalf("failed to run %s: %s\nstdout: %s\nstderr: %s", strings.Join(args, " "), runErr, outBuf.String(), errBuf.String())
}
// getTestCacheDir returns the cache directory for a single test case: a
// subdirectory of rootDir named after the first 12 hex characters of the hash
// of the test name and its template files.
//
// Deriving the directory from both inputs gives each test its own location
// (so tests can populate their caches in parallel) and gives a test a new
// location whenever its template files change (so providers are downloaded
// again). This function only computes the path; it does not create anything.
func getTestCacheDir(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string {
	t.Helper()

	digest := hashTemplateFilesAndTestName(t, testName, templateFiles)
	return filepath.Join(rootDir, digest[:12])
}
// downloadProviders makes sure the Terraform providers required by the given
// template files are available in a per-test cache directory below rootDir,
// and returns that directory.
//
// If the cache directory already exists it is returned as is. Otherwise the
// template files are written to a scratch `files` subdirectory,
// `terraform init` is run there, and `terraform providers mirror` copies the
// providers into the `providers` subdirectory. The scratch directory is always
// removed before returning; if the test has failed by then, the entire cache
// directory is removed as well.
func downloadProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string {
	t.Helper()

	cacheDir := getTestCacheDir(t, rootDir, testName, templateFiles)
	_, statErr := os.Stat(cacheDir)
	if statErr == nil {
		t.Logf("%s: using cached terraform providers", testName)
		return cacheDir
	}

	scratchDir := filepath.Join(cacheDir, cacheTemplateFilesDirName)
	cleanup := func() {
		// `terraform init` leaves its own copy of the providers in the
		// scratch dir. The mirror in the providers dir is the only copy that
		// should be persisted, so the scratch dir is always discarded.
		if err := os.RemoveAll(scratchDir); err != nil {
			t.Logf("failed to remove files dir %s: %s", scratchDir, err)
		}
		if t.Failed() {
			// Something went wrong while populating the cache. Drop the whole
			// directory so that an incomplete or corrupted cache is never
			// picked up by a later run.
			if err := os.RemoveAll(cacheDir); err != nil {
				t.Logf("failed to remove dir %s: %s", cacheDir, err)
			}
		}
	}
	defer cleanup()

	// Materialize the template files on disk so terraform can read them.
	require.NoError(t, os.MkdirAll(scratchDir, 0o700))
	for name, content := range templateFiles {
		target := filepath.Join(scratchDir, name)
		require.NoError(t, os.MkdirAll(filepath.Dir(target), 0o700))
		require.NoError(t, os.WriteFile(target, []byte(content), 0o600))
	}

	mirrorDir := filepath.Join(cacheDir, cacheProvidersDirName)
	require.NoError(t, os.MkdirAll(mirrorDir, 0o700))

	// `init` has to run first: when a template uses modules, the mirror
	// command fails without it.
	runCmd(t, scratchDir, "terraform", "init")

	// Mirror the providers into mirrorDir rather than relying solely on the
	// standard Terraform plugin cache.
	//
	// Combined with the CLI config produced by `writeCliConfig`, the mirror
	// keeps Terraform from contacting the network registry during `plan`,
	// which reduces network calls and makes CI tests less flaky.
	//
	// The standard plugin cache does not achieve this: `init` still contacts
	// the registry for metadata even when the plugins are already cached.
	//
	// Ref: https://developer.hashicorp.com/terraform/cli/config/config-file#provider-plugin-cache
	// > When a plugin cache directory is enabled, the terraform init command will
	// > still use the configured or implied installation methods to obtain metadata
	// > about which plugins are available
	runCmd(t, scratchDir, "terraform", "providers", "mirror", mirrorDir)

	return cacheDir
}
// cacheProviders prepares a local provider cache for the given test and
// writes a Terraform CLI config that restricts provider installation to that
// cache, so later `terraform init` runs for this test do not need network
// access for providers.
//
// It returns the path to the generated CLI config file.
func cacheProviders(t *testing.T, rootDir string, testName string, templateFiles map[string]string) string {
	t.Helper()

	// The directory returned here is the parent of the `providers` mirror and
	// is also where the CLI config file is placed.
	cacheDir := downloadProviders(t, rootDir, testName, templateFiles)
	return writeCliConfig(t, cacheDir)
}
func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_SessionClient) string { func readProvisionLog(t *testing.T, response proto.DRPCProvisioner_SessionClient) string {
var logBuf strings.Builder var logBuf strings.Builder
for { for {
@ -352,6 +520,8 @@ func TestProvision(t *testing.T) {
Apply bool Apply bool
// Some tests may need to be skipped until the relevant provider version is released. // Some tests may need to be skipped until the relevant provider version is released.
SkipReason string SkipReason string
// If SkipCacheProviders is true, then skip caching the terraform providers for this test.
SkipCacheProviders bool
}{ }{
{ {
Name: "missing-variable", Name: "missing-variable",
@ -422,16 +592,18 @@ func TestProvision(t *testing.T) {
Files: map[string]string{ Files: map[string]string{
"main.tf": `a`, "main.tf": `a`,
}, },
ErrorContains: "initialize terraform", ErrorContains: "initialize terraform",
ExpectLogContains: "Argument or block definition required", ExpectLogContains: "Argument or block definition required",
SkipCacheProviders: true,
}, },
{ {
Name: "bad-syntax-2", Name: "bad-syntax-2",
Files: map[string]string{ Files: map[string]string{
"main.tf": `;asdf;`, "main.tf": `;asdf;`,
}, },
ErrorContains: "initialize terraform", ErrorContains: "initialize terraform",
ExpectLogContains: `The ";" character is not valid.`, ExpectLogContains: `The ";" character is not valid.`,
SkipCacheProviders: true,
}, },
{ {
Name: "destroy-no-state", Name: "destroy-no-state",
@ -838,6 +1010,23 @@ func TestProvision(t *testing.T) {
}, },
} }
// Remove unused cache dirs before running tests.
// This cleans up any cache dirs that were created by tests that no longer exist.
cacheRootDir := filepath.Join(testutil.PersistentCacheDir(t), "terraform_provision_test")
expectedCacheDirs := make(map[string]bool)
for _, testCase := range testCases {
cacheDir := getTestCacheDir(t, cacheRootDir, testCase.Name, testCase.Files)
expectedCacheDirs[cacheDir] = true
}
currentCacheDirs, err := filepath.Glob(filepath.Join(cacheRootDir, "*"))
require.NoError(t, err)
for _, cacheDir := range currentCacheDirs {
if _, ok := expectedCacheDirs[cacheDir]; !ok {
t.Logf("removing unused cache dir: %s", cacheDir)
require.NoError(t, os.RemoveAll(cacheDir))
}
}
for _, testCase := range testCases { for _, testCase := range testCases {
testCase := testCase testCase := testCase
t.Run(testCase.Name, func(t *testing.T) { t.Run(testCase.Name, func(t *testing.T) {
@ -847,7 +1036,18 @@ func TestProvision(t *testing.T) {
t.Skip(testCase.SkipReason) t.Skip(testCase.SkipReason)
} }
ctx, api := setupProvisioner(t, nil) cliConfigPath := ""
if !testCase.SkipCacheProviders {
cliConfigPath = cacheProviders(
t,
cacheRootDir,
testCase.Name,
testCase.Files,
)
}
ctx, api := setupProvisioner(t, &provisionerServeOptions{
cliConfigPath: cliConfigPath,
})
sess := configure(ctx, t, api, &proto.Config{ sess := configure(ctx, t, api, &proto.Config{
TemplateSourceArchive: testutil.CreateTar(t, testCase.Files), TemplateSourceArchive: testutil.CreateTar(t, testCase.Files),
}) })

View File

@ -28,7 +28,9 @@ type ServeOptions struct {
BinaryPath string BinaryPath string
// CachePath must not be used by multiple processes at once. // CachePath must not be used by multiple processes at once.
CachePath string CachePath string
Tracer trace.Tracer // CliConfigPath is the path to the Terraform CLI config file.
CliConfigPath string
Tracer trace.Tracer
// ExitTimeout defines how long we will wait for a running Terraform // ExitTimeout defines how long we will wait for a running Terraform
// command to exit (cleanly) if the provision was stopped. This // command to exit (cleanly) if the provision was stopped. This
@ -132,22 +134,24 @@ func Serve(ctx context.Context, options *ServeOptions) error {
options.ExitTimeout = unhanger.HungJobExitTimeout options.ExitTimeout = unhanger.HungJobExitTimeout
} }
return provisionersdk.Serve(ctx, &server{ return provisionersdk.Serve(ctx, &server{
execMut: &sync.Mutex{}, execMut: &sync.Mutex{},
binaryPath: options.BinaryPath, binaryPath: options.BinaryPath,
cachePath: options.CachePath, cachePath: options.CachePath,
logger: options.Logger, cliConfigPath: options.CliConfigPath,
tracer: options.Tracer, logger: options.Logger,
exitTimeout: options.ExitTimeout, tracer: options.Tracer,
exitTimeout: options.ExitTimeout,
}, options.ServeOptions) }, options.ServeOptions)
} }
type server struct { type server struct {
execMut *sync.Mutex execMut *sync.Mutex
binaryPath string binaryPath string
cachePath string cachePath string
logger slog.Logger cliConfigPath string
tracer trace.Tracer logger slog.Logger
exitTimeout time.Duration tracer trace.Tracer
exitTimeout time.Duration
} }
func (s *server) startTrace(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) { func (s *server) startTrace(ctx context.Context, name string, opts ...trace.SpanStartOption) (context.Context, trace.Span) {
@ -158,12 +162,13 @@ func (s *server) startTrace(ctx context.Context, name string, opts ...trace.Span
func (s *server) executor(workdir string, stage database.ProvisionerJobTimingStage) *executor { func (s *server) executor(workdir string, stage database.ProvisionerJobTimingStage) *executor {
return &executor{ return &executor{
server: s, server: s,
mut: s.execMut, mut: s.execMut,
binaryPath: s.binaryPath, binaryPath: s.binaryPath,
cachePath: s.cachePath, cachePath: s.cachePath,
workdir: workdir, cliConfigPath: s.cliConfigPath,
logger: s.logger.Named("executor"), workdir: workdir,
timings: newTimingAggregator(stage), logger: s.logger.Named("executor"),
timings: newTimingAggregator(stage),
} }
} }

25
testutil/cache.go Normal file
View File

@ -0,0 +1,25 @@
package testutil
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
// PersistentCacheDir returns the path of the directory that is cached
// between test runs in GitHub Actions: `.cache/coderv2-test` inside the
// current user's home directory. The directory is not created by this
// function. The test fails immediately if the home directory cannot be
// determined.
func PersistentCacheDir(t *testing.T) string {
	t.Helper()

	// os.UserCacheDir() is intentionally avoided: it returns a different
	// path on each operating system, which would make it harder to specify
	// the cache directory in GitHub Actions.
	homeDir, err := os.UserHomeDir()
	require.NoError(t, err)

	return filepath.Join(homeDir, ".cache", "coderv2-test")
}