Snapchat · kmontemayor2-sc · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
@@ -42,16 +42,29 @@ jobs:
   unit-test-python:
     if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_py') || endsWith(github.event.comment.body, '/unit_test') || contains(github.event.comment.body, '/all_test')) }}
     runs-on: ubuntu-latest
-    # TODO(kmonte): Reduce this :(
-    timeout-minutes: 120
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "Type Check"
+            command: "make type_check"
+          - name: "Shard 0"
+            command: "make unit_test_py_shard SHARD_INDEX=0 TOTAL_SHARDS=4"
+          - name: "Shard 1"
+            command: "make unit_test_py_shard SHARD_INDEX=1 TOTAL_SHARDS=4"
+          - name: "Shard 2"
+            command: "make unit_test_py_shard SHARD_INDEX=2 TOTAL_SHARDS=4"
+          - name: "Shard 3"
+            command: "make unit_test_py_shard SHARD_INDEX=3 TOTAL_SHARDS=4"
     steps:
-    - name: Run Python Unit Tests
+    - name: Run Python Unit Tests (${{ matrix.name }})
       uses: snapchat/gigl/.github/actions/run-command-on-pr@main
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
         pr_number: ${{ github.event.issue.number }}
         should_leave_progress_comments: "true"
-        descriptive_workflow_name: "Python Unit Test"
+        descriptive_workflow_name: "Python Unit Test (${{ matrix.name }})"
         setup_gcloud: "true"
         # We use cloud run here instead of using github hosted runners because of limitation of tests
         # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand
@@ -61,8 +74,7 @@ jobs:
         gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
         workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
         gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
-        command: |
-          make unit_test_py
+        command: ${{ matrix.command }}
 
   unit-test-scala:
     if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/unit_test_scala') || endsWith(github.event.comment.body, '/unit_test') || contains(github.event.comment.body, '/all_test')) }}
@@ -87,23 +99,37 @@ jobs:
   integration-test:
     if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/integration_test') || contains(github.event.comment.body, '/all_test')) }}
     runs-on: ubuntu-latest
-    # TODO(kmonte): Reduce this :(
-    timeout-minutes: 120
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "Shard 0"
+            command: "make integration_test_shard SHARD_INDEX=0 TOTAL_SHARDS=4"
+          - name: "Shard 1"
+            command: "make integration_test_shard SHARD_INDEX=1 TOTAL_SHARDS=4"
+          - name: "Shard 2"
+            command: "make integration_test_shard SHARD_INDEX=2 TOTAL_SHARDS=4"
+          - name: "Shard 3"
+            command: "make integration_test_shard SHARD_INDEX=3 TOTAL_SHARDS=4"
     steps:
-    - name: Run Integration Tests
+    - name: Run Integration Tests (${{ matrix.name }})
       uses: snapchat/gigl/.github/actions/run-command-on-pr@main
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
         pr_number: ${{ github.event.issue.number }}
         should_leave_progress_comments: "true"
-        descriptive_workflow_name: "Integration Test"
+        descriptive_workflow_name: "Integration Test (${{ matrix.name }})"
         setup_gcloud: "true"
+        # We use Cloud Run here instead of GitHub-hosted runners because of a limitation of tests
+        # that use the GFile library (i.e. anything that does I/O with TensorFlow). GFile does not
+        # understand how to leverage Workload Identity Federation to read assets from GCS, et al. See:
+        # https://github.com/tensorflow/tensorflow/issues/57104
         use_cloud_run: "true"
         gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
         workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
         gcp_service_account_email: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
-        command: |
-            make integration_test
+        command: ${{ matrix.command }}
 
   integration-e2e-test:
     if: ${{ github.event.issue.pull_request && (contains(github.event.comment.body, '/e2e_test') || contains(github.event.comment.body, '/all_test')) }}

@@ -23,6 +23,20 @@ jobs:
     # Our tests take a long time to run, so this is not ideal.
     if: github.event_name == 'merge_group'
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "Type Check"
+            command: "make type_check"
+          - name: "Shard 0"
+            command: "make unit_test_py_shard SHARD_INDEX=0 TOTAL_SHARDS=4"
+          - name: "Shard 1"
+            command: "make unit_test_py_shard SHARD_INDEX=1 TOTAL_SHARDS=4"
+          - name: "Shard 2"
+            command: "make unit_test_py_shard SHARD_INDEX=2 TOTAL_SHARDS=4"
+          - name: "Shard 3"
+            command: "make unit_test_py_shard SHARD_INDEX=3 TOTAL_SHARDS=4"
     steps:
     - uses: actions/checkout@v4
     - name: Setup development environment
@@ -32,16 +46,16 @@ jobs:
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
-    - name: Run Python Unit Tests
-      # We use cloud run here instead of using github hosted runners because of limitation of tests
+    - name: Run Python Unit Tests (${{ matrix.name }})
+      # We use Cloud Build instead of GitHub-hosted runners because of a limitation of tests
       # using GFile library (a.k.a anything that does IO w/ Tensorflow). GFile does not understand
       # how to leverage Workload Identity Federation to read assets from GCS, et al. See:
       # https://github.com/tensorflow/tensorflow/issues/57104
       uses: ./.github/actions/run-cloud-run-command-on-active-checkout
       with:
-        cmd: "make unit_test_py"
-        service_account:  ${{ secrets.gcp_service_account_email }}
-        project:  ${{ vars.GCP_PROJECT_ID }}
+        cmd: ${{ matrix.command }}
+        service_account: ${{ secrets.gcp_service_account_email }}
+        project: ${{ vars.GCP_PROJECT_ID }}
 
   ci-unit-test-scala:
     # Because of limitation discussed https://github.com/orgs/community/discussions/46757#discussioncomment-4912738
@@ -73,6 +87,18 @@ jobs:
   ci-integration-test:
     if: github.event_name == 'merge_group'
     runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "Shard 0"
+            command: "make integration_test_shard SHARD_INDEX=0 TOTAL_SHARDS=4"
+          - name: "Shard 1"
+            command: "make integration_test_shard SHARD_INDEX=1 TOTAL_SHARDS=4"
+          - name: "Shard 2"
+            command: "make integration_test_shard SHARD_INDEX=2 TOTAL_SHARDS=4"
+          - name: "Shard 3"
+            command: "make integration_test_shard SHARD_INDEX=3 TOTAL_SHARDS=4"
     steps:
     - uses: actions/checkout@v4
     - name: Setup development environment
@@ -82,12 +108,16 @@ jobs:
           gcp_project_id: ${{ vars.GCP_PROJECT_ID }}
           workload_identity_provider: ${{ secrets.workload_identity_provider }}
           gcp_service_account_email: ${{ secrets.gcp_service_account_email }}
-    - name: Run Integration Tests
+    - name: Run Integration Tests (${{ matrix.name }})
+      # We use Cloud Build instead of GitHub-hosted runners because of a limitation of tests
+      # that use the GFile library (i.e. anything that does I/O with TensorFlow). GFile does not
+      # understand how to leverage Workload Identity Federation to read assets from GCS, et al. See:
+      # https://github.com/tensorflow/tensorflow/issues/57104
       uses: ./.github/actions/run-cloud-run-command-on-active-checkout
       with:
-        cmd: "make integration_test"
-        service_account:  ${{ secrets.gcp_service_account_email }}
-        project:  ${{ vars.GCP_PROJECT_ID }}
+        cmd: ${{ matrix.command }}
+        service_account: ${{ secrets.gcp_service_account_email }}
+        project: ${{ vars.GCP_PROJECT_ID }}
 
   ci-integration-e2e-test:
     if: github.event_name == 'merge_group'

@@ -23,6 +23,8 @@ DOCKER_IMAGE_DEV_WORKBENCH_NAME_WITH_TAG?=${DOCKER_IMAGE_DEV_WORKBENCH_NAME}:${D
 
 PYTHON_DIRS:=.github/scripts examples gigl tests snapchat scripts
 PY_TEST_FILES?="*_test.py"
+SHARD_INDEX?=0
+TOTAL_SHARDS?=0
 # You can override GIGL_TEST_DEFAULT_RESOURCE_CONFIG by setting it in your environment i.e.
 # adding `export GIGL_TEST_DEFAULT_RESOURCE_CONFIG=your_resource_config` to your shell config (~/.bashrc, ~/.zshrc, etc.)
 GIGL_TEST_DEFAULT_RESOURCE_CONFIG?=${PWD}/deployment/configs/unittest_resource_config.yaml
@@ -81,6 +83,14 @@ unit_test_py: clean_build_files_py type_check
 		--resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \
 		--test_file_pattern=$(PY_TEST_FILES) \
 
+# Runs a single shard of the Python unit tests (no type checking).
+# Usage: make unit_test_py_shard SHARD_INDEX=0 TOTAL_SHARDS=4
+unit_test_py_shard: clean_build_files_py
+	uv run python -m tests.unit.main \
+		--env=test \
+		--resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \
+		--test_file_pattern=$(PY_TEST_FILES) \
+		--shard_index=$(SHARD_INDEX) --total_shards=$(TOTAL_SHARDS)
 
 unit_test_scala: clean_build_files_scala
 	( cd scala; sbt test )
@@ -121,6 +131,14 @@ integration_test:
 		--resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \
 		--test_file_pattern=$(PY_TEST_FILES) \
 
+# Runs a single shard of the integration tests.
+# Usage: make integration_test_shard SHARD_INDEX=0 TOTAL_SHARDS=4
+integration_test_shard: clean_build_files_py
+	uv run python -m tests.integration.main \
+		--env=test \
+		--resource_config_uri=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} \
+		--test_file_pattern=$(PY_TEST_FILES) \
+		--shard_index=$(SHARD_INDEX) --total_shards=$(TOTAL_SHARDS)
 
 notebooks_test:
 	RESOURCE_CONFIG_PATH=${GIGL_TEST_DEFAULT_RESOURCE_CONFIG} python -m tests.config_tests.notebooks_test