Skip to content
21 changes: 14 additions & 7 deletions .github/actions/setup-build-environment/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ inputs:
cache-key:
description: 'Cache key identifier for Go cache'
required: true
save-cache:
description: 'Whether this job may save the Go cache (only effective on main). Set to false on jobs that share a cache-key across many matrix instances so only one designated job writes the key.'
required: false
default: 'true'

runs:
using: 'composite'
Expand Down Expand Up @@ -38,9 +42,12 @@ runs:
# On runs against main (push + the scheduled wipe-and-repopulate
# cron added in #2092): restore now, save at job end via the
# unified action's post-step (which fires at the calling job's
# end, even when invoked from a composite).
# end, even when invoked from a composite). Gated on save-cache so
# that when many matrix instances share one cache-key, only the
# designated job writes it (concurrent same-key saves all fail but
# the first, so the extra writers just waste time).
- name: Restore and save Go cache (main)
if: github.ref == 'refs/heads/main'
if: github.ref == 'refs/heads/main' && inputs.save-cache == 'true'
uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: |
Expand All @@ -50,12 +57,12 @@ runs:
restore-keys: |
setup-go-${{ inputs.cache-key }}-${{ runner.os }}-go${{ steps.setup-go.outputs.go-version }}-

# On every other ref (PR / merge_group): restore only. Prefix
# fallback via restore-keys means runs whose go.sum differs from
# main still restore main's most recent cache and rebuild only
# the delta.
# On every other ref (PR / merge_group) or when this job is not the
# designated cache writer: restore only. Prefix fallback via
# restore-keys means runs whose go.sum differs from main still
# restore main's most recent cache and rebuild only the delta.
- name: Restore Go cache (non-main)
if: github.ref != 'refs/heads/main'
if: github.ref != 'refs/heads/main' || inputs.save-cache != 'true'
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: |
Expand Down
129 changes: 117 additions & 12 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
runs-on: ubuntu-latest
outputs:
targets: ${{ steps.mask1.outputs.targets || steps.mask2.outputs.targets || steps.mask3.outputs.targets }}
acc_matrix: ${{ steps.accmatrix.outputs.matrix }}
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
Expand Down Expand Up @@ -70,14 +71,60 @@ jobs:
# Always run all tests
echo "targets=[\"test\"]" >> $GITHUB_OUTPUT

test:
# Build the acceptance-test shard matrix. Shard counts vary per
# (os, engine), which a static cross-product matrix can't express, so we
# emit an explicit include-list consumed via fromJSON in the test job.
- name: Build acceptance test shard matrix
id: accmatrix
env:
EVENT_NAME: ${{ github.event_name }}
run: |
python3 - <<'PY' >> "$GITHUB_OUTPUT"
import json
import os

# Name of the triggering GitHub event; merge-queue runs get a reduced matrix.
trigger = os.environ["EVENT_NAME"]

# Runner configuration per OS name. Linux and Windows use the large protected
# runner group; macOS only specifies labels.
large_group = "databricks-protected-runner-group-large"
runner_for = {
    "linux": {"group": large_group, "labels": "linux-ubuntu-latest-large"},
    "windows": {"group": large_group, "labels": "windows-server-latest-large"},
    "macos": {"labels": "macos-latest"},
}

# Number of shards per (os, engine) pair. Windows is split further because
# TASK_CONCURRENCY=1 serializes tests within each job, so adding parallel jobs
# is the only way to reduce its wall time. The direct engine is faster than
# terraform and therefore needs fewer shards.
shards_per_target = {
    ("linux", "terraform"): 4,
    ("linux", "direct"): 2,
    ("macos", "terraform"): 4,
    ("macos", "direct"): 2,
    ("windows", "terraform"): 8,
    ("windows", "direct"): 4,
}

# In the merge queue only Linux runs, to reduce time to merge.
linux_only = trigger == "merge_group"

# One matrix entry per shard, in (os, engine) declaration order and then by
# ascending shard index.
entries = [
    {
        "os": {"name": platform, "runner": runner_for[platform]},
        "deployment": engine,
        "shard_index": shard,
        "shard_total": count,
    }
    for (platform, engine), count in shards_per_target.items()
    if not (linux_only and platform != "linux")
    for shard in range(count)
]

# Single-line "matrix=<json>" record; the enclosing shell step appends stdout
# to $GITHUB_OUTPUT.
print("matrix=" + json.dumps({"include": entries}))
PY

test-unit:
needs:
- cleanups
- testmask

# Only run if the target is in the list of targets from testmask
if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
name: "task test (${{matrix.os.name}}, ${{matrix.deployment}})"
name: "task test-unit (${{matrix.os.name}})"
runs-on: ${{ matrix.os.runner }}

defaults:
Expand All @@ -94,8 +141,6 @@ jobs:
strategy:
fail-fast: false
matrix:
# Use separate fields for the OS name and runner configuration.
# When combined in a single object, "runs-on" errors with "Unexpected value 'name'".
os:
- name: linux
runner:
Expand All @@ -111,10 +156,6 @@ jobs:
runner:
labels: macos-latest

deployment:
- "terraform"
- "direct"

# Include "event_name" in the matrix so we can include/exclude based on it.
event:
- ${{ github.event_name }}
Expand All @@ -135,20 +176,83 @@ jobs:
- name: Setup build environment
uses: ./.github/actions/setup-build-environment
with:
cache-key: test-${{ matrix.deployment }}
# Sole writer of the shared "test" cache (test-acc shards restore it).
cache-key: test

- name: Run tests
run: go tool -modfile=tools/task/go.mod task test-unit

- name: Upload gotestsum JSON output
# Always upload so we can inspect timing even if tests fail.
# This is debug-only telemetry; a flaky artifact upload must not fail
# an otherwise-passing job.
if: ${{ always() }}
continue-on-error: true
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: test-output-unit-${{ matrix.os.name }}
path: test-output-unit.json
if-no-files-found: warn
retention-days: 7

test:
needs:
- cleanups
- testmask

# Only run if the target is in the list of targets from testmask
if: ${{ contains(fromJSON(needs.testmask.outputs.targets), 'test') }}
name: "task test-acc (${{matrix.os.name}}, ${{matrix.deployment}}, shard ${{matrix.shard_index}}/${{matrix.shard_total}})"
runs-on: ${{ matrix.os.runner }}

defaults:
run:
shell: bash

permissions:
id-token: write
contents: read

env:
TASK_CONCURRENCY: ${{ matrix.os.name == 'windows' && '1' || '' }}

strategy:
fail-fast: false
# Generated by testmask: an include-list with per-(os, engine) shard
# counts. Each entry carries os{name,runner}, deployment, shard_index,
# and shard_total.
matrix: ${{ fromJSON(needs.testmask.outputs.acc_matrix) }}

steps:
- name: Checkout repository and submodules
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Setup build environment
uses: ./.github/actions/setup-build-environment
with:
# Shares the cache-key with test-unit so these shards restore the
# cache it saves. save-cache is false because many shard/deployment
# instances share this key; test-unit is the sole writer.
cache-key: test
save-cache: false

- name: Run tests
env:
ENVFILTER: DATABRICKS_BUNDLE_ENGINE=${{ matrix.deployment }}
run: go tool -modfile=tools/task/go.mod task test
SHARD_INDEX: ${{ matrix.shard_index }}
SHARD_TOTAL: ${{ matrix.shard_total }}
run: go tool -modfile=tools/task/go.mod task test-acc

- name: Upload gotestsum JSON output
# Always upload so we can inspect timing even if tests fail.
# This is debug-only telemetry; a flaky artifact upload must not fail
# an otherwise-passing job.
if: ${{ always() }}
continue-on-error: true
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}
path: test-output.json
name: test-output-${{ matrix.os.name }}-${{ matrix.deployment }}-shard${{ matrix.shard_index }}
path: test-output-acc.json
if-no-files-found: warn
retention-days: 7

Expand Down Expand Up @@ -329,6 +433,7 @@ jobs:
# Reference: https://github.com/orgs/community/discussions/25970
test-result:
needs:
- test-unit
- test
- test-exp-aitools
- test-exp-ssh
Expand Down
22 changes: 22 additions & 0 deletions acceptance/acceptance_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,10 @@ func testAccept(t *testing.T, inprocessMode bool, singleTest string) int {
return n != singleTest
})
require.NotEmpty(t, testDirs, "singleTest=%#v did not match any tests\n%#v", singleTest, testDirs)
} else {
// Sharding applies only to the full run. A specific singleTest (e.g.
// TestInprocessMode) must never be filtered out by the shard split.
testDirs = shardTests(testDirs)
}

skippedDirs := 0
Expand Down Expand Up @@ -513,6 +517,24 @@ func getTests(t *testing.T) []string {
return testDirs
}

// shardTests returns the subset of testDirs assigned to this CI shard when
// SHARD_TOTAL > 1, or testDirs unchanged otherwise. testDirs must be sorted so
// the split is deterministic and stable across runs.
func shardTests(testDirs []string) []string {
total, _ := strconv.Atoi(os.Getenv("SHARD_TOTAL"))
if total <= 1 {
return testDirs
}
index, _ := strconv.Atoi(os.Getenv("SHARD_INDEX"))
sharded := testDirs[:0]
for i, d := range testDirs {
if i%total == index {
sharded = append(sharded, d)
}
}
return sharded
}

func validateTestPhase(phase int) error {
if phase == 0 || phase == 1 {
return nil
Expand Down
Loading