chore: improve CI reliability (#16169)

We have an effort underway to replace `dbmem` (#15109), and consequently we've begun running our full test-suite (with Postgres) on all supported OSs - Windows, macOS, and Linux - since #15520.

Since this change, we've seen a marked decrease in the success rate of our builds on `main` (note how the Windows/macOS failures account for the vast majority of failed builds):

![image](https://github.com/user-attachments/assets/a02c15b7-037d-428a-a600-2aed60553ac0)

We're still investigating why these OSs are a lot less reliable. It's likely that the VMs on which the builds are run have different characteristics from our Ubuntu runners, such as disk I/O, network latency, or something else.

**In the meantime, we need to start trusting CI failures in `main` again, as the current failures are too noisy / vague for us to correct.**

We've also considered hosting our own runners where possible so we can get OS-level observability to rule out some possibilities. See the [meeting notes](https://www.notion.so/coderhq/CI-Investigation-Call-Notes-17dd579be59280d8897cc9fe4bb46695?pvs=6&utm_content=17dd579b-e592-80d8-897c-c9fe4bb46695&utm_campaign=T1ZPT2FL0&n=slack&n=slack_link_unfurl) where we looked into this in more detail.

This PR introduces several changes:

1. Moves the full test-suite with Postgres on Windows/macOS to the `nightly-gauntlet` workflow. Tradeoff: this means that any regressions may be more difficult to discover, since we merge to `main` several times a day.
2. Runs only the CLI test-suite on each PR / merge to `main` on Windows/macOS.
3. `test-go` is still running the full test-suite against all OSs (including the CLI ones), but will soon be removed once #15109 is completed since it uses `dbmem`.
4. Changes `nightly-gauntlet` to run at 4AM: we've seen several instances of the runner being stopped externally, and we're _guessing_ this may have something to do with the midnight UTC execution time, when other cron jobs may run.
5. Removes the existing `nightly-gauntlet` jobs since they haven't passed in a long time, indicating that nobody cares enough to fix them and they don't provide diagnostic value; we can restore them later if necessary.

I've manually run both these new workflows successfully:

- `ci`: https://github.com/coder/coder/actions/runs/12825874176/job/35764724907
- `nightly-gauntlet`: https://github.com/coder/coder/actions/runs/12825539092

---------

Signed-off-by: Danny Kopping <danny@coder.com>
Co-authored-by: Muhammad Atif Ali <atif@coder.com>
2025-07-18 14:17:22 +00:00 · 2025-01-20 09:06:33 +02:00
parent 738a7f6bd9
commit 5b72a4376d
3 changed files with 125 additions and 78 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -378,8 +378,62 @@ jobs:
        with:
          api-key: ${{ secrets.DATADOG_API_KEY }}

+  # The full test-suite is not run on Windows & macOS for every PR; there we only run the CLI tests.
+  # The full test-suite (including the CLI tests) runs on Linux in test-go-pg.
+  test-cli:
+    runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
+    needs: changes
+    if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
+    strategy:
+      matrix:
+        os:
+          - macos-latest
+          - windows-2022
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
+        with:
+          egress-policy: audit
+
+      - name: Checkout
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          fetch-depth: 1
+
+      - name: Setup Go
+        uses: ./.github/actions/setup-go
+
+      - name: Setup Terraform
+        uses: ./.github/actions/setup-tf
+
+      # Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
+      - name: Setup ImDisk
+        if: runner.os == 'Windows'
+        uses: ./.github/actions/setup-imdisk
+
+      - name: Test CLI
+        env:
+          TS_DEBUG_DISCO: "true"
+          LC_CTYPE: "en_US.UTF-8"
+          LC_ALL: "en_US.UTF-8"
+        shell: bash
+        run: |
+          # Runs only the ./cli/... packages (see the test-cli target in
+          # the Makefile). No parallelism flags are passed: by default
+          # Go will use the number of logical CPUs, which is a fine
+          # default.
+          make test-cli
+
+      - name: Upload test stats to Datadog
+        timeout-minutes: 1
+        continue-on-error: true
+        uses: ./.github/actions/upload-datadog
+        if: success() || failure()
+        with:
+          api-key: ${{ secrets.DATADOG_API_KEY }}
+
+
  test-go-pg:
-    runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
+    runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os }}
    needs: changes
    if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
    # This timeout must be greater than the timeout set by `go test` in
@ -391,8 +445,6 @@ jobs:
      matrix:
        os:
          - ubuntu-latest
-          - macos-latest
-          - windows-2022
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
@ -423,39 +475,11 @@ jobs:
          LC_ALL: "en_US.UTF-8"
        shell: bash
        run: |
-          # if macOS, install google-chrome for scaletests
-          # As another concern, should we really have this kind of external dependency
-          # requirement on standard CI?
-          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            brew install google-chrome
-          fi
-
          # By default Go will use the number of logical CPUs, which
          # is a fine default.
          PARALLEL_FLAG=""

-          # macOS will output "The default interactive shell is now zsh"
-          # intermittently in CI...
-          if [ "${{ matrix.os }}" == "macos-latest" ]; then
-            touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
-          fi
-
-          if [ "${{ runner.os }}" == "Linux" ]; then
-            make test-postgres
-          elif [ "${{ runner.os }}" == "Windows" ]; then
-            # Create a temp dir on the R: ramdisk drive for Windows. The default
-            # C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
-            mkdir -p "R:/temp/embedded-pg"
-            go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
-            # Reduce test parallelism, mirroring what we do for race tests.
-            # We'd been encountering issues with timing related flakes, and
-            # this seems to help.
-            DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
-          else
-            go run scripts/embedded-pg/main.go
-            # Reduce test parallelism, like for Windows above.
-            DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
-          fi
+          make test-postgres

      - name: Upload test stats to Datadog
        timeout-minutes: 1
--- a/.github/workflows/nightly-gauntlet.yaml
+++ b/.github/workflows/nightly-gauntlet.yaml
@ -3,22 +3,27 @@
 name: nightly-gauntlet
 on:
  schedule:
-    # Every day at midnight
-    - cron: "0 0 * * *"
+    # Every weekday (Monday to Friday) at 4AM UTC
+    - cron: "0 4 * * 1-5"
  workflow_dispatch:

 permissions:
  contents: read

 jobs:
-  go-race:
-    # While GitHub's toaster runners are likelier to flake, we want consistency
-    # between this environment and the regular test environment for DataDog
-    # statistics and to only show real workflow threats.
-    runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-8' || 'ubuntu-latest' }}
-    # This runner costs 0.016 USD per minute,
-    # so 0.016 * 240 = 3.84 USD per run.
-    timeout-minutes: 240
+  test-go-pg:
+    runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
+    if: github.ref == 'refs/heads/main'
+    # The test step below calls gotestsum directly (not `make test-postgres`)
+    # and passes no `-timeout`, so `go test` applies its default timeout. This
+    # job timeout must be greater than that to ensure we receive a trace of
+    # running goroutines, even if some of the preceding steps are slow.
+    timeout-minutes: 25
+    strategy:
+      matrix:
+        os:
+          - macos-latest
+          - windows-2022
    steps:
      - name: Harden Runner
        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
@ -27,6 +32,8 @@ jobs:

      - name: Checkout
        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          fetch-depth: 1

      - name: Setup Go
        uses: ./.github/actions/setup-go
@ -34,51 +41,63 @@ jobs:
      - name: Setup Terraform
        uses: ./.github/actions/setup-tf

-      - name: Run Tests
+      # Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
+      - name: Setup ImDisk
+        if: runner.os == 'Windows'
+        uses: ./.github/actions/setup-imdisk
+
+      - name: Test with PostgreSQL Database
+        env:
+          POSTGRES_VERSION: "13"
+          TS_DEBUG_DISCO: "true"
+          LC_CTYPE: "en_US.UTF-8"
+          LC_ALL: "en_US.UTF-8"
+        shell: bash
        run: |
-          # -race is likeliest to catch flaky tests
-          # due to correctness detection and its performance
-          # impact.
-          gotestsum --junitfile="gotests.xml" -- -timeout=240m -count=10 -race ./...
+          # if macOS, install google-chrome for scaletests
+          # As another concern, should we really have this kind of external dependency
+          # requirement on standard CI?
+          if [ "${{ matrix.os }}" == "macos-latest" ]; then
+            brew install google-chrome
+          fi

-      - name: Upload test results to DataDog
+          # macOS will output "The default interactive shell is now zsh"
+          # intermittently in CI...
+          if [ "${{ matrix.os }}" == "macos-latest" ]; then
+            touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
+          fi
+
+          if [ "${{ runner.os }}" == "Windows" ]; then
+            # Create a temp dir on the R: ramdisk drive for Windows. The default
+            # C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
+            mkdir -p "R:/temp/embedded-pg"
+            go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
+          else
+            go run scripts/embedded-pg/main.go
+          fi
+
+          # Reduce test parallelism, mirroring what we do for race tests.
+          # We'd been encountering issues with timing related flakes, and
+          # this seems to help.
+          #
+          # By default Go uses the number of logical CPUs for both flags.
+          # `-p 4` caps how many packages are tested at once; `-parallel 4`
+          # caps how many parallel tests run at once within one package.
+          DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
+
+      - name: Upload test stats to Datadog
+        timeout-minutes: 1
+        continue-on-error: true
        uses: ./.github/actions/upload-datadog
-        if: always()
-        with:
-          api-key: ${{ secrets.DATADOG_API_KEY }}
-
-  go-timing:
-    # We run these tests with p=1 so we don't need a lot of compute.
-    runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04' || 'ubuntu-latest' }}
-    timeout-minutes: 10
-    steps:
-      - name: Harden Runner
-        uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
-        with:
-          egress-policy: audit
-
-      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-
-      - name: Setup Go
-        uses: ./.github/actions/setup-go
-
-      - name: Run Tests
-        run: |
-          gotestsum --junitfile="gotests.xml" -- --tags="timing" -p=1 -run='_Timing/' ./...
-
-      - name: Upload test results to DataDog
-        uses: ./.github/actions/upload-datadog
-        if: always()
+        if: success() || failure()
        with:
          api-key: ${{ secrets.DATADOG_API_KEY }}

  notify-slack-on-failure:
    needs:
-      - go-race
-      - go-timing
+      - test-go-pg
    runs-on: ubuntu-latest
-    if: failure()
+    if: failure() && github.ref == 'refs/heads/main'

    steps:
      - name: Send Slack notification
--- a/4
+++ b/4
@ -807,6 +807,10 @@ test:
 	$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./...
 .PHONY: test

+test-cli:
+	$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./cli/...
+.PHONY: test-cli
+
 # sqlc-cloud-is-setup will fail if no SQLc auth token is set. Use this as a
 # dependency for any sqlc-cloud related targets.
 sqlc-cloud-is-setup: