mirror of
https://github.com/coder/coder.git
synced 2025-07-18 14:17:22 +00:00
chore: improve CI reliability (#16169)
We have an effort underway to replace `dbmem` (#15109), and consequently we've been running our full test-suite (with Postgres) on all supported OSs - Windows, macOS, and Linux - since #15520. Since this change, we've seen a marked decrease in the success rate of our builds on `main`; the Windows/macOS failures account for the vast majority of failed builds.

We're still investigating why these OSs are a lot less reliable. It's likely that the VMs on which the builds are run have different characteristics from our Ubuntu runners, such as disk I/O, network latency, or something else. **In the meantime, we need to start trusting CI failures in `main` again, as the current failures are too noisy / vague for us to correct.** We've also considered hosting our own runners where possible so we can get OS-level observability to rule out some possibilities. See the [meeting notes](https://www.notion.so/coderhq/CI-Investigation-Call-Notes-17dd579be59280d8897cc9fe4bb46695?pvs=6&utm_content=17dd579b-e592-80d8-897c-c9fe4bb46695&utm_campaign=T1ZPT2FL0&n=slack&n=slack_link_unfurl), where we looked into this in more detail.

This PR introduces several changes:

1. Moves the full test-suite with Postgres on Windows/macOS to the `nightly-gauntlet` workflow. Tradeoff: this means that any regressions may be more difficult to discover, since we merge to `main` several times a day.
2. Runs only the CLI test-suite on each PR / merge to `main` on Windows/macOS.
3. `test-go` is still running the full test-suite against all OSs (including the CLI ones), but will soon be removed once #15109 is completed, since it uses `dbmem`.
4. Changes `nightly-gauntlet` to run at 4AM UTC on weekdays: we've seen several instances of the runner being stopped externally, and we're _guessing_ this may have something to do with the midnight UTC execution time, when other cron jobs may run.
5. Removes the existing `nightly-gauntlet` jobs since they haven't passed in a long time, indicating that nobody cares enough to fix them and they don't provide diagnostic value; we can restore them later if necessary.

I've manually run both of these new workflows successfully:

- `ci`: https://github.com/coder/coder/actions/runs/12825874176/job/35764724907
- `nightly-gauntlet`: https://github.com/coder/coder/actions/runs/12825539092

---------

Signed-off-by: Danny Kopping <danny@coder.com>
Co-authored-by: Muhammad Atif Ali <atif@coder.com>
This commit is contained in:
88
.github/workflows/ci.yaml
vendored
88
.github/workflows/ci.yaml
vendored
@ -378,8 +378,62 @@ jobs:
|
||||
with:
|
||||
api-key: ${{ secrets.DATADOG_API_KEY }}
|
||||
|
||||
# We don't run the full test-suite for Windows & MacOS, so we just run the CLI tests on every PR.
|
||||
# We run the test suite in test-go-pg, including CLI.
|
||||
test-cli:
|
||||
runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
|
||||
needs: changes
|
||||
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
|
||||
strategy:
|
||||
matrix:
|
||||
os:
|
||||
- macos-latest
|
||||
- windows-2022
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Setup Go
|
||||
uses: ./.github/actions/setup-go
|
||||
|
||||
- name: Setup Terraform
|
||||
uses: ./.github/actions/setup-tf
|
||||
|
||||
# Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
|
||||
- name: Setup ImDisk
|
||||
if: runner.os == 'Windows'
|
||||
uses: ./.github/actions/setup-imdisk
|
||||
|
||||
- name: Test CLI
|
||||
env:
|
||||
TS_DEBUG_DISCO: "true"
|
||||
LC_CTYPE: "en_US.UTF-8"
|
||||
LC_ALL: "en_US.UTF-8"
|
||||
shell: bash
|
||||
run: |
|
||||
# By default Go will use the number of logical CPUs, which
|
||||
# is a fine default.
|
||||
PARALLEL_FLAG=""
|
||||
|
||||
make test-cli
|
||||
|
||||
- name: Upload test stats to Datadog
|
||||
timeout-minutes: 1
|
||||
continue-on-error: true
|
||||
uses: ./.github/actions/upload-datadog
|
||||
if: success() || failure()
|
||||
with:
|
||||
api-key: ${{ secrets.DATADOG_API_KEY }}
|
||||
|
||||
test-go-pg:
|
||||
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
|
||||
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os }}
|
||||
needs: changes
|
||||
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
|
||||
# This timeout must be greater than the timeout set by `go test` in
|
||||
@ -391,8 +445,6 @@ jobs:
|
||||
matrix:
|
||||
os:
|
||||
- ubuntu-latest
|
||||
- macos-latest
|
||||
- windows-2022
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
|
||||
@ -423,39 +475,11 @@ jobs:
|
||||
LC_ALL: "en_US.UTF-8"
|
||||
shell: bash
|
||||
run: |
|
||||
# if macOS, install google-chrome for scaletests
|
||||
# As another concern, should we really have this kind of external dependency
|
||||
# requirement on standard CI?
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
brew install google-chrome
|
||||
fi
|
||||
|
||||
# By default Go will use the number of logical CPUs, which
|
||||
# is a fine default.
|
||||
PARALLEL_FLAG=""
|
||||
|
||||
# macOS will output "The default interactive shell is now zsh"
|
||||
# intermittently in CI...
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
|
||||
fi
|
||||
|
||||
if [ "${{ runner.os }}" == "Linux" ]; then
|
||||
make test-postgres
|
||||
elif [ "${{ runner.os }}" == "Windows" ]; then
|
||||
# Create a temp dir on the R: ramdisk drive for Windows. The default
|
||||
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
|
||||
mkdir -p "R:/temp/embedded-pg"
|
||||
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
|
||||
# Reduce test parallelism, mirroring what we do for race tests.
|
||||
# We'd been encountering issues with timing related flakes, and
|
||||
# this seems to help.
|
||||
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
|
||||
else
|
||||
go run scripts/embedded-pg/main.go
|
||||
# Reduce test parallelism, like for Windows above.
|
||||
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
|
||||
fi
|
||||
make test-postgres
|
||||
|
||||
- name: Upload test stats to Datadog
|
||||
timeout-minutes: 1
|
||||
|
111
.github/workflows/nightly-gauntlet.yaml
vendored
111
.github/workflows/nightly-gauntlet.yaml
vendored
@ -3,22 +3,27 @@
|
||||
name: nightly-gauntlet
|
||||
on:
|
||||
schedule:
|
||||
# Every day at midnight
|
||||
- cron: "0 0 * * *"
|
||||
# Every weekday (Mon-Fri) at 4AM UTC
|
||||
- cron: "0 4 * * 1-5"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
go-race:
|
||||
# While GitHub's toaster runners are likelier to flake, we want consistency
|
||||
# between this environment and the regular test environment for DataDog
|
||||
# statistics and to only show real workflow threats.
|
||||
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-8' || 'ubuntu-latest' }}
|
||||
# This runner costs 0.016 USD per minute,
|
||||
# so 0.016 * 240 = 3.84 USD per run.
|
||||
timeout-minutes: 240
|
||||
test-go-pg:
|
||||
runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
|
||||
if: github.ref == 'refs/heads/main'
|
||||
# This timeout must be greater than the timeout set by `go test` in
|
||||
# `make test-postgres` to ensure we receive a trace of running
|
||||
# goroutines. Setting this to the timeout +5m should work quite well
|
||||
# even if some of the preceding steps are slow.
|
||||
timeout-minutes: 25
|
||||
strategy:
|
||||
matrix:
|
||||
os:
|
||||
- macos-latest
|
||||
- windows-2022
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
|
||||
@ -27,6 +32,8 @@ jobs:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Setup Go
|
||||
uses: ./.github/actions/setup-go
|
||||
@ -34,51 +41,63 @@ jobs:
|
||||
- name: Setup Terraform
|
||||
uses: ./.github/actions/setup-tf
|
||||
|
||||
- name: Run Tests
|
||||
# Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
|
||||
- name: Setup ImDisk
|
||||
if: runner.os == 'Windows'
|
||||
uses: ./.github/actions/setup-imdisk
|
||||
|
||||
- name: Test with PostgreSQL Database
|
||||
env:
|
||||
POSTGRES_VERSION: "13"
|
||||
TS_DEBUG_DISCO: "true"
|
||||
LC_CTYPE: "en_US.UTF-8"
|
||||
LC_ALL: "en_US.UTF-8"
|
||||
shell: bash
|
||||
run: |
|
||||
# -race is likeliest to catch flaky tests
|
||||
# due to correctness detection and its performance
|
||||
# impact.
|
||||
gotestsum --junitfile="gotests.xml" -- -timeout=240m -count=10 -race ./...
|
||||
# if macOS, install google-chrome for scaletests
|
||||
# As another concern, should we really have this kind of external dependency
|
||||
# requirement on standard CI?
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
brew install google-chrome
|
||||
fi
|
||||
|
||||
- name: Upload test results to DataDog
|
||||
# By default Go will use the number of logical CPUs, which
|
||||
# is a fine default.
|
||||
PARALLEL_FLAG=""
|
||||
|
||||
# macOS will output "The default interactive shell is now zsh"
|
||||
# intermittently in CI...
|
||||
if [ "${{ matrix.os }}" == "macos-latest" ]; then
|
||||
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
|
||||
fi
|
||||
|
||||
if [ "${{ runner.os }}" == "Windows" ]; then
|
||||
# Create a temp dir on the R: ramdisk drive for Windows. The default
|
||||
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
|
||||
mkdir -p "R:/temp/embedded-pg"
|
||||
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
|
||||
else
|
||||
go run scripts/embedded-pg/main.go
|
||||
fi
|
||||
|
||||
# Reduce test parallelism, mirroring what we do for race tests.
|
||||
# We'd been encountering issues with timing related flakes, and
|
||||
# this seems to help.
|
||||
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
|
||||
|
||||
- name: Upload test stats to Datadog
|
||||
timeout-minutes: 1
|
||||
continue-on-error: true
|
||||
uses: ./.github/actions/upload-datadog
|
||||
if: always()
|
||||
with:
|
||||
api-key: ${{ secrets.DATADOG_API_KEY }}
|
||||
|
||||
go-timing:
|
||||
# We run these tests with p=1 so we don't need a lot of compute.
|
||||
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04' || 'ubuntu-latest' }}
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Harden Runner
|
||||
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
|
||||
with:
|
||||
egress-policy: audit
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
|
||||
|
||||
- name: Setup Go
|
||||
uses: ./.github/actions/setup-go
|
||||
|
||||
- name: Run Tests
|
||||
run: |
|
||||
gotestsum --junitfile="gotests.xml" -- --tags="timing" -p=1 -run='_Timing/' ./...
|
||||
|
||||
- name: Upload test results to DataDog
|
||||
uses: ./.github/actions/upload-datadog
|
||||
if: always()
|
||||
if: success() || failure()
|
||||
with:
|
||||
api-key: ${{ secrets.DATADOG_API_KEY }}
|
||||
|
||||
notify-slack-on-failure:
|
||||
needs:
|
||||
- go-race
|
||||
- go-timing
|
||||
- test-go-pg
|
||||
runs-on: ubuntu-latest
|
||||
if: failure()
|
||||
if: failure() && github.ref == 'refs/heads/main'
|
||||
|
||||
steps:
|
||||
- name: Send Slack notification
|
||||
|
4
Makefile
4
Makefile
@ -807,6 +807,10 @@ test:
|
||||
$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./...
|
||||
.PHONY: test
|
||||
|
||||
test-cli:
|
||||
$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./cli/...
|
||||
.PHONY: test-cli
|
||||
|
||||
# sqlc-cloud-is-setup will fail if no SQLc auth token is set. Use this as a
|
||||
# dependency for any sqlc-cloud related targets.
|
||||
sqlc-cloud-is-setup:
|
||||
|
Reference in New Issue
Block a user