diff --git a/.github/actions/setup-build-environment/action.yml b/.github/actions/setup-build-environment/action.yml
index 3e7bde9704e..d7dae7b63cf 100644
--- a/.github/actions/setup-build-environment/action.yml
+++ b/.github/actions/setup-build-environment/action.yml
@@ -5,6 +5,10 @@ inputs:
   cache-key:
     description: 'Cache key identifier for Go cache'
     required: true
+  save-cache:
+    description: 'Whether this job may save the Go cache (only effective on main). Set to false on jobs that share a cache-key across many matrix instances so only one designated job writes the key.'
+    required: false
+    default: 'true'
 
 runs:
   using: 'composite'
@@ -38,9 +42,12 @@ runs:
     # On runs against main (push + the scheduled wipe-and-repopulate
     # cron added in #2092): restore now, save at job end via the
     # unified action's post-step (which fires at the calling job's
-    # end, even when invoked from a composite).
+    # end, even when invoked from a composite). Gated on save-cache so
+    # that when many matrix instances share one cache-key, only the
+    # designated job writes it (concurrent same-key saves all fail but
+    # the first, so the extra writers just waste time).
     - name: Restore and save Go cache (main)
-      if: github.ref == 'refs/heads/main'
+      if: github.ref == 'refs/heads/main' && inputs.save-cache == 'true'
       uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
       with:
         path: |
@@ -50,12 +57,12 @@ runs:
         restore-keys: |
           setup-go-${{ inputs.cache-key }}-${{ runner.os }}-go${{ steps.setup-go.outputs.go-version }}-
 
-    # On every other ref (PR / merge_group): restore only. Prefix
-    # fallback via restore-keys means runs whose go.sum differs from
-    # main still restore main's most recent cache and rebuild only
-    # the delta.
+    # On every other ref (PR / merge_group) or when this job is not the
+    # designated cache writer: restore only. Prefix fallback via
+    # restore-keys means runs whose go.sum differs from main still
+    # restore main's most recent cache and rebuild only the delta.
     - name: Restore Go cache (non-main)
-      if: github.ref != 'refs/heads/main'
+      if: github.ref != 'refs/heads/main' || inputs.save-cache != 'true'
       uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
       with:
         path: |
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 0bab4ba3d76..8aaa12926a7 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -35,6 +35,7 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       targets: ${{ steps.mask1.outputs.targets || steps.mask2.outputs.targets || steps.mask3.outputs.targets }}
+      acc_matrix: ${{ steps.accmatrix.outputs.matrix }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -70,14 +71,60 @@ jobs:
           # Always run all tests
           echo "targets=[\"test\"]" >> $GITHUB_OUTPUT
 
-  test:
+      # Build the acceptance-test shard matrix. Shard counts vary per
+      # (os, engine), which a static cross-product matrix can't express, so we
+      # emit an explicit include-list consumed via fromJSON in the test job.
+      - name: Build acceptance test shard matrix
+        id: accmatrix
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+        run: |
+          python3 - <<'PY' >> "$GITHUB_OUTPUT"
+          import json, os
+
+          event = os.environ["EVENT_NAME"]
+          runners = {
+              "linux": {"group": "databricks-protected-runner-group-large", "labels": "linux-ubuntu-latest-large"},
+              "windows": {"group": "databricks-protected-runner-group-large", "labels": "windows-server-latest-large"},
+              "macos": {"labels": "macos-latest"},
+          }
+          # (os, engine) -> shard count. Windows gets more shards because
+          # TASK_CONCURRENCY=1 serializes tests within each job, so the only
+          # way to cut its wall time is more parallel jobs. direct is faster
+          # than terraform and needs fewer shards.
+          shard_counts = {
+              ("linux", "terraform"): 4,
+              ("linux", "direct"): 2,
+              ("macos", "terraform"): 4,
+              ("macos", "direct"): 2,
+              ("windows", "terraform"): 8,
+              ("windows", "direct"): 4,
+          }
+
+          include = []
+          for (osname, engine), total in shard_counts.items():
+              # Run on Linux only in merge queue to reduce time to merge.
+              if event == "merge_group" and osname != "linux":
+                  continue
+              for index in range(total):
+                  include.append({
+                      "os": {"name": osname, "runner": runners[osname]},
+                      "deployment": engine,
+                      "shard_index": index,
+                      "shard_total": total,
+                  })
+
+          print("matrix=" + json.dumps({"include": include}))
+          PY
+
+  test-unit:
     needs:
       - cleanups
       - testmask
 
     # Only run if the target is in the list of targets from testmask
     if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
-    name: "task test (${{matrix.os.name}}, ${{matrix.deployment}})"
+    name: "task test-unit (${{matrix.os.name}})"
     runs-on: ${{ matrix.os.runner }}
 
     defaults:
@@ -94,8 +141,6 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # Use separate fields for the OS name and runner configuration.
-        # When combined in a single object, "runs-on" errors with "Unexpected value 'name'".
         os:
           - name: linux
             runner:
@@ -111,10 +156,6 @@ jobs:
             runner:
               labels: macos-latest
 
-        deployment:
-          - "terraform"
-          - "direct"
-
         # Include "event_name" in the matrix so we can include/exclude based on it.
         event:
           - ${{ github.event_name }}
@@ -135,20 +176,83 @@ jobs:
       - name: Setup build environment
         uses: ./.github/actions/setup-build-environment
         with:
-          cache-key: test-${{ matrix.deployment }}
+          # Sole writer of the shared "test" cache (test-acc shards restore it).
+          cache-key: test
+
+      - name: Run tests
+        run: go tool -modfile=tools/task/go.mod task test-unit
+
+      - name: Upload gotestsum JSON output
+        # Always upload so we can inspect timing even if tests fail.
+        # This is debug-only telemetry; a flaky artifact upload must not fail
+        # an otherwise-passing job.
+        if: ${{ always() }}
+        continue-on-error: true
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: test-output-unit-${{ matrix.os.name }}
+          path: test-output-unit.json
+          if-no-files-found: warn
+          retention-days: 7
+
+  test:
+    needs:
+      - cleanups
+      - testmask
+
+    # Only run if the target is in the list of targets from testmask
+    if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
+    name: "task test-acc (${{matrix.os.name}}, ${{matrix.deployment}}, shard ${{matrix.shard_index}}/${{matrix.shard_total}})"
+    runs-on: ${{ matrix.os.runner }}
+
+    defaults:
+      run:
+        shell: bash
+
+    permissions:
+      id-token: write
+      contents: read
+
+    env:
+      TASK_CONCURRENCY: ${{ matrix.os.name == 'windows' && '1' || '' }}
+
+    strategy:
+      fail-fast: false
+      # Generated by testmask: an include-list with per-(os, engine) shard
+      # counts. Each entry carries os{name,runner}, deployment, shard_index,
+      # and shard_total.
+      matrix: ${{ fromJSON(needs.testmask.outputs.acc_matrix) }}
+
+    steps:
+      - name: Checkout repository and submodules
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup build environment
+        uses: ./.github/actions/setup-build-environment
+        with:
+          # Shares the cache-key with test-unit so these shards restore the
+          # cache it saves. save-cache is false because many shard/deployment
+          # instances share this key; test-unit is the sole writer.
+          cache-key: test
+          save-cache: 'false'
 
       - name: Run tests
         env:
           ENVFILTER: DATABRICKS_BUNDLE_ENGINE=${{ matrix.deployment }}
-        run: go tool -modfile=tools/task/go.mod task test
+          SHARD_INDEX: ${{ matrix.shard_index }}
+          SHARD_TOTAL: ${{ matrix.shard_total }}
+        run: go tool -modfile=tools/task/go.mod task test-acc
 
       - name: Upload gotestsum JSON output
         # Always upload so we can inspect timing even if tests fail.
+        # This is debug-only telemetry; a flaky artifact upload must not fail
+        # an otherwise-passing job.
         if: ${{ always() }}
+        continue-on-error: true
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}
-          path: test-output.json
+          name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}-shard${{ matrix.shard_index }}
+          path: test-output-acc.json
           if-no-files-found: warn
           retention-days: 7
 
@@ -329,6 +433,7 @@ jobs:
   # Reference: https://github.com/orgs/community/discussions/25970
   test-result:
     needs:
+      - test-unit
       - test
       - test-exp-aitools
       - test-exp-ssh
diff --git a/acceptance/acceptance_test.go b/acceptance/acceptance_test.go
index f6ec0805fb2..fe8877c7489 100644
--- a/acceptance/acceptance_test.go
+++ b/acceptance/acceptance_test.go
@@ -350,6 +350,10 @@ func testAccept(t *testing.T, inprocessMode bool, singleTest string) int {
 			return n != singleTest
 		})
 		require.NotEmpty(t, testDirs, "singleTest=%#v did not match any tests\n%#v", singleTest, testDirs)
+	} else {
+		// Sharding applies only to the full run. A specific singleTest (e.g.
+		// TestInprocessMode) must never be filtered out by the shard split.
+		testDirs = shardTests(t, testDirs)
 	}
 
 	skippedDirs := 0
@@ -513,6 +517,36 @@ func getTests(t *testing.T) []string {
 	return testDirs
 }
 
+// shardTests returns the subset of testDirs assigned to this CI shard when
+// SHARD_TOTAL > 1, or testDirs unchanged otherwise (including when
+// SHARD_TOTAL is unset or not a number). testDirs must be sorted so the
+// split is deterministic and stable across runs.
+//
+// When sharding is active, SHARD_INDEX must be an integer in
+// [0, SHARD_TOTAL). Anything else fails the test: a missing or out-of-range
+// index would otherwise select the wrong tests, or none at all, while the
+// job still reports success.
+func shardTests(t *testing.T, testDirs []string) []string {
+	total, _ := strconv.Atoi(os.Getenv("SHARD_TOTAL"))
+	if total <= 1 {
+		return testDirs
+	}
+
+	rawIndex := os.Getenv("SHARD_INDEX")
+	index, err := strconv.Atoi(rawIndex)
+	require.NoError(t, err, "SHARD_INDEX=%q is not an integer", rawIndex)
+	require.True(t, index >= 0 && index < total, "SHARD_INDEX=%d out of range [0, %d)", index, total)
+
+	// Filter into a fresh slice so the caller's backing array is untouched.
+	sharded := make([]string, 0, len(testDirs)/total+1)
+	for i, d := range testDirs {
+		if i%total == index {
+			sharded = append(sharded, d)
+		}
+	}
+	return sharded
+}
+
 func validateTestPhase(phase int) error {
 	if phase == 0 || phase == 1 {
 		return nil