From 111f7749fe8132b1272f05f3a8d63e3e47300f7e Mon Sep 17 00:00:00 2001 From: Brad Lugo Date: Fri, 27 Mar 2026 00:29:08 -0700 Subject: [PATCH] ROX-33852: Update OpenShift CI scripts --- .openshift-ci/ci_tests.py | 18 ++-- .openshift-ci/clusters.py | 42 +++++--- .openshift-ci/common.py | 15 ++- scripts/ci/cleanup-deployment.sh | 6 +- scripts/ci/gke.sh | 168 ++++++++++++++++++++----------- 5 files changed, 166 insertions(+), 83 deletions(-) diff --git a/.openshift-ci/ci_tests.py b/.openshift-ci/ci_tests.py index 4177610b7..7aa18dc26 100755 --- a/.openshift-ci/ci_tests.py +++ b/.openshift-ci/ci_tests.py @@ -15,15 +15,19 @@ class BaseTest: def __init__(self): self.test_output_dirs = [] - def run_with_graceful_kill(self, args, timeout, post_start_hook=None): + def run_with_graceful_kill(self, args, timeout): with subprocess.Popen(args) as cmd: - if post_start_hook is not None: - post_start_hook() try: exitstatus = cmd.wait(timeout) if exitstatus != 0: raise RuntimeError(f"Test failed: exit {exitstatus}") except subprocess.TimeoutExpired as err: + # Kill child processes as we cannot rely on bash scripts to + # handle signals and stop tests + subprocess.run( + ["/usr/bin/pkill", "-P", str(cmd.pid)], check=True, timeout=5 + ) + # Then kill the test command popen_graceful_kill(cmd) raise err @@ -41,7 +45,7 @@ def run(self): self.run_with_graceful_kill( ["scripts/ci/jobs/e2etests/e2e-tests.sh"], - E2ETest.TEST_TIMEOUT, + self.TEST_TIMEOUT, ) @@ -55,10 +59,10 @@ def run(self): self.run_with_graceful_kill( ["scripts/ci/jobs/e2etests/scale-tests.sh"], - ScaleTest.TEST_TIMEOUT, + self.TEST_TIMEOUT, ) - self.test_output_dirs.append(ScaleTest.OUTPUT_DIR) + self.test_output_dirs.append(self.OUTPUT_DIR) class SlimE2ETest(BaseTest): @@ -69,5 +73,5 @@ def run(self): self.run_with_graceful_kill( ["scripts/ci/jobs/e2etests/slim-e2e-tests.sh"], - SlimE2ETest.TEST_TIMEOUT, + self.TEST_TIMEOUT, ) diff --git a/.openshift-ci/clusters.py b/.openshift-ci/clusters.py index 43a237025..2858df4f9 100755 --- a/.openshift-ci/clusters.py +++ b/.openshift-ci/clusters.py @@ -3,7 +3,7 @@ """ Clusters used in test -Copied from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/clusters.py +Adapted from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/clusters.py """ import os @@ -25,35 +25,42 @@ def teardown(self): class GKECluster: # Provisioning timeout is tightly coupled to the time it may take gke.sh to # create a cluster. - PROVISION_TIMEOUT = 90 * 60 + PROVISION_TIMEOUT = 140 * 60 WAIT_TIMEOUT = 20 * 60 TEARDOWN_TIMEOUT = 5 * 60 + # separate script names used for testability - test_clusters.py PROVISION_PATH = "scripts/ci/gke.sh" WAIT_PATH = "scripts/ci/gke.sh" REFRESH_PATH = "scripts/ci/gke.sh" TEARDOWN_PATH = "scripts/ci/gke.sh" - def __init__(self, cluster_id, num_nodes=3, machine_type="e2-standard-4"): + def __init__(self, cluster_id, num_nodes=None, machine_type=None, disk_gb=None): self.cluster_id = cluster_id self.num_nodes = num_nodes self.machine_type = machine_type + self.disk_gb = disk_gb self.refresh_token_cmd = None def provision(self): + if self.num_nodes is not None: + os.environ["NUM_NODES"] = str(self.num_nodes) + if self.machine_type is not None: + os.environ["MACHINE_TYPE"] = str(self.machine_type) + if self.disk_gb is not None: + os.environ["DISK_SIZE_GB"] = str(self.disk_gb) with subprocess.Popen( [ - GKECluster.PROVISION_PATH, + self.PROVISION_PATH, "provision_gke_cluster", self.cluster_id, - str(self.num_nodes), - self.machine_type, ] ) as cmd: try: - exitstatus = cmd.wait(GKECluster.PROVISION_TIMEOUT) + exitstatus = cmd.wait(self.PROVISION_TIMEOUT) if exitstatus != 0: - raise RuntimeError(f"Cluster provision failed: exit {exitstatus}") + raise RuntimeError( + f"Cluster provision failed: exit {exitstatus}") except subprocess.TimeoutExpired as err: popen_graceful_kill(cmd) raise err @@ -62,38 +69,41 @@ def provision(self): signal.signal(signal.SIGINT, self.sigint_handler) subprocess.run( - [GKECluster.WAIT_PATH, "wait_for_cluster"], + [self.WAIT_PATH, "wait_for_cluster"], check=True, - timeout=GKECluster.WAIT_TIMEOUT, + timeout=self.WAIT_TIMEOUT, ) # pylint: disable=consider-using-with self.refresh_token_cmd = subprocess.Popen( - [GKECluster.REFRESH_PATH, "refresh_gke_token"] + [self.REFRESH_PATH, "refresh_gke_token"] ) return self - def teardown(self): + def teardown(self, canceled=False): while os.path.exists("/tmp/hold-cluster"): print("Pausing teardown because /tmp/hold-cluster exists") time.sleep(60) - if self.refresh_token_cmd is not None: + if self.refresh_token_cmd is not None and not canceled: print("Terminating GKE token refresh") try: popen_graceful_kill(self.refresh_token_cmd) except Exception as err: print(f"Could not terminate the token refresh: {err}") + args = [self.TEARDOWN_PATH, "teardown_gke_cluster"] + if canceled: + args.append("true") subprocess.run( - [GKECluster.TEARDOWN_PATH, "teardown_gke_cluster"], + args, check=True, - timeout=GKECluster.TEARDOWN_TIMEOUT, + timeout=self.TEARDOWN_TIMEOUT, ) return self def sigint_handler(self, signum, frame): print("Tearing down the cluster due to SIGINT", signum, frame) - self.teardown() + self.teardown(canceled=True) diff --git a/.openshift-ci/common.py b/.openshift-ci/common.py index 832851e46..23f02b35f 100644 --- a/.openshift-ci/common.py +++ b/.openshift-ci/common.py @@ -1,14 +1,25 @@ +from datetime import datetime import subprocess ''' -Copied from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/common.py +Adapted from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/common.py ''' def popen_graceful_kill(cmd): + log_print(f"Sending SIGTERM to {cmd.args}") cmd.terminate() try: cmd.wait(5) + log_print("Terminated") except subprocess.TimeoutExpired as err: + log_print(f"Exception raised waiting after SIGTERM to {cmd.args}, {err}") + # SIGKILL if necessary + log_print(f"Sending SIGKILL to {cmd.args}") cmd.kill() cmd.wait(5) - raise err + log_print("Terminated") + +def log_print(*args): + now = datetime.now() + time = now.strftime("%H:%M:%S") + print(f"{time}:", *args) diff --git a/scripts/ci/cleanup-deployment.sh b/scripts/ci/cleanup-deployment.sh index f99c1955b..ac7af9a27 100755 --- a/scripts/ci/cleanup-deployment.sh +++ b/scripts/ci/cleanup-deployment.sh @@ -1,5 +1,7 @@ -#! /bin/bash +#!/usr/bin/env bash # Copied from https://github.com/stackrox/stackrox/blob/master/scripts/ci/cleanup-deployment.sh -kubectl -n stackrox get cm,deploy,ds,networkpolicy,pv,pvc,secret,svc,serviceaccount -o name | xargs kubectl -n stackrox delete --wait +namespace=${1:-stackrox} + +kubectl -n "${namespace}" get cm,deploy,ds,networkpolicy,pv,pvc,secret,svc,serviceaccount -o name | xargs kubectl -n "${namespace}" delete --wait diff --git a/scripts/ci/gke.sh b/scripts/ci/gke.sh index f0f9aaf66..22993007f 100755 --- a/scripts/ci/gke.sh +++ b/scripts/ci/gke.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash # A collection of GKE related reusable bash functions for CI -# Copied from https://github.com/stackrox/stackrox/blob/master/scripts/ci/gke.sh +# Adapted from https://github.com/stackrox/stackrox/blob/master/scripts/ci/gke.sh SCRIPTS_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/../.. && pwd)" # shellcheck source=../../scripts/ci/lib.sh @@ -22,13 +22,11 @@ provision_gke_cluster() { assign_env_variables() { info "Assigning environment variables for later steps" - if [[ "$#" -lt 1 ]]; then - die "missing args. usage: assign_env_variables [ ]" + if [[ "$#" -ne 1 ]]; then + die "missing args. usage: assign_env_variables " fi local cluster_id="$1" - local num_nodes="${2:-3}" - local machine_type="${3:-e2-standard-4}" ensure_CI @@ -45,19 +43,45 @@ assign_env_variables() { ci_export CLUSTER_NAME "$cluster_name" echo "Assigned cluster name is $cluster_name" - ci_export NUM_NODES "$num_nodes" - echo "Number of nodes for cluster is $num_nodes" + choose_release_channel + choose_cluster_version +} - ci_export MACHINE_TYPE "$machine_type" - echo "Machine type is set as to $machine_type" +choose_release_channel() { + if ! is_in_PR_context; then + GKE_RELEASE_CHANNEL="${GKE_RELEASE_CHANNEL:-stable}" + elif pr_has_label ci-gke-use-rapid-channel; then + GKE_RELEASE_CHANNEL="rapid" + elif pr_has_label ci-gke-use-regular-channel; then + GKE_RELEASE_CHANNEL="regular" + elif pr_has_label ci-gke-use-stable-channel; then + GKE_RELEASE_CHANNEL="stable" + elif pr_has_pragma gke_release_channel; then + GKE_RELEASE_CHANNEL="$(pr_get_pragma gke_release_channel)" + fi +} - local gke_release_channel="stable" - ci_export GKE_RELEASE_CHANNEL "$gke_release_channel" - echo "Using gke release channel: $gke_release_channel" +choose_cluster_version() { + if is_in_PR_context && pr_has_pragma gke_cluster_version; then + GKE_CLUSTER_VERSION="$(pr_get_pragma gke_cluster_version)" + fi + if [[ "${GKE_CLUSTER_VERSION:-}" == "latest" ]]; then + GKE_CLUSTER_VERSION="$(gcloud container get-server-config --format json | jq -r ".validMasterVersions[0]")" + elif [[ "${GKE_CLUSTER_VERSION:-}" == "oldest" ]]; then + GKE_CLUSTER_VERSION="$(gcloud container get-server-config --format json | jq -r ".validMasterVersions[-1]")" + fi + if [[ "${GKE_CLUSTER_VERSION:-}" == "null" ]]; then + echo "WARNING: Unable to extract version from gcloud config." + echo "Valid versions are:" + gcloud container get-server-config --format json | jq .validMasterVersions + unset GKE_CLUSTER_VERSION + fi } create_cluster() { info "Creating a GKE cluster" + # Store requested timestamp to create log query link with time range. + date -u +"%Y-%m-%dT%H:%M:%SZ" > /tmp/GKE_CLUSTER_REQUESTED_TIMESTAMP ensure_CI @@ -103,27 +127,30 @@ create_cluster() { # The "services" secondary range is for ClusterIP services ("--services-ipv4-cidr"). # See https://cloud.google.com/kubernetes-engine/docs/how-to/alias-ips#cluster_sizing. - REGION=us-central1 + REGION=us-east4 NUM_NODES="${NUM_NODES:-3}" GCP_IMAGE_TYPE="${GCP_IMAGE_TYPE:-UBUNTU_CONTAINERD}" POD_SECURITY_POLICIES="${POD_SECURITY_POLICIES:-false}" GKE_RELEASE_CHANNEL="${GKE_RELEASE_CHANNEL:-stable}" MACHINE_TYPE="${MACHINE_TYPE:-e2-standard-4}" + DISK_SIZE_GB=${DISK_SIZE_GB:-80} - echo "Creating ${NUM_NODES} node cluster with image type \"${GCP_IMAGE_TYPE}\"" + echo "Creating ${NUM_NODES} node cluster with image type \"${GCP_IMAGE_TYPE}\" and ${DISK_SIZE_GB}GB disks." - VERSION_ARGS=(--release-channel "${GKE_RELEASE_CHANNEL}") - get_supported_cluster_version - if [[ -n "${CLUSTER_VERSION:-}" ]]; then - echo "using cluster version: ${CLUSTER_VERSION}" - VERSION_ARGS=(--cluster-version "${CLUSTER_VERSION}") + if [[ -n "${GKE_CLUSTER_VERSION:-}" ]]; then + ensure_supported_cluster_version + echo "Using GKE cluster version: ${GKE_CLUSTER_VERSION} (which overrides release channel ${GKE_RELEASE_CHANNEL})" + VERSION_ARGS=(--cluster-version "${GKE_CLUSTER_VERSION}" --no-enable-autoupgrade) + else + echo "Using GKE release channel: $GKE_RELEASE_CHANNEL" + VERSION_ARGS=(--release-channel "${GKE_RELEASE_CHANNEL}") fi PSP_ARG= if [[ "${POD_SECURITY_POLICIES}" == "true" ]]; then PSP_ARG="--enable-pod-security-policy" fi - zones=$(gcloud compute zones list --filter="region=$REGION" | grep UP | cut -f1 -d' ' | shuf) + zones=$(gcloud compute zones list --format="value(name,region.basename(),status)" | awk "/${REGION}\tUP\$/{print \$1}" | shuf) success=0 for zone in $zones; do echo "Trying zone $zone" @@ -131,23 +158,23 @@ create_cluster() { gcloud config set compute/zone "${zone}" status=0 # shellcheck disable=SC2153 - timeout 630 gcloud beta container clusters create \ - --machine-type "${MACHINE_TYPE}" \ - --num-nodes "${NUM_NODES}" \ - --disk-type=pd-standard \ - --disk-size=40GB \ - --create-subnetwork range=/28 \ - --cluster-ipv4-cidr=/20 \ - --services-ipv4-cidr=/24 \ - --enable-ip-alias \ - --enable-network-policy \ - --enable-autorepair \ - "${VERSION_ARGS[@]}" \ - --image-type "${GCP_IMAGE_TYPE}" \ - --tags="${tags}" \ - --labels="${labels}" \ - ${PSP_ARG} \ - "${CLUSTER_NAME}" || status="$?" + timeout 830 gcloud beta container clusters create \ + --machine-type "${MACHINE_TYPE}" \ + --num-nodes "${NUM_NODES}" \ + --disk-type=pd-ssd \ + --disk-size="${DISK_SIZE_GB}GB" \ + --create-subnetwork range=/28 \ + --cluster-ipv4-cidr=/20 \ + --services-ipv4-cidr=/24 \ + --enable-ip-alias \ + --enable-network-policy \ + --no-enable-autorepair \ + "${VERSION_ARGS[@]}" \ + --image-type "${GCP_IMAGE_TYPE}" \ + --tags="${tags}" \ + --labels="${labels}" \ + ${PSP_ARG} \ + "${CLUSTER_NAME}" || status="$?" if [[ "${status}" == 0 ]]; then success=1 break @@ -168,21 +195,39 @@ create_cluster() { if [[ "${success}" == 1 ]]; then info "Successfully launched cluster ${CLUSTER_NAME}" + local kubeconfig="${KUBECONFIG:-${HOME}/.kube/config}" + ls -l "${kubeconfig}" || true + gcloud container clusters get-credentials "$CLUSTER_NAME" + ls -l "${kubeconfig}" || true break fi - warn "Timed out" - warn "Attempting to delete the cluster before trying another zone" + info "Timed out" + info "Attempting to delete the cluster before trying another zone" gcloud container clusters delete "${CLUSTER_NAME}" || { - error "An error occurred deleting the cluster: $?" + info "An error occurred deleting the cluster: $?" true } fi done if [[ "${success}" == "0" ]]; then - error "Cluster creation failed" + info "Cluster creation failed" return 1 fi + + add_a_maintenance_exclusion +} + +add_a_maintenance_exclusion() { + from_now="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + plus_five_epoch=$(($(date -u '+%s') + 5*3600)) + plus_five="$(date -u --date=@${plus_five_epoch} +"%Y-%m-%dT%H:%M:%SZ")" + + gcloud container clusters update "${CLUSTER_NAME}" \ + --add-maintenance-exclusion-name leave-these-clusters-alone \ + --add-maintenance-exclusion-start "${from_now}" \ + --add-maintenance-exclusion-end "${plus_five}" \ + --add-maintenance-exclusion-scope no_upgrades } wait_for_cluster() { @@ -217,18 +262,16 @@ wait_for_cluster() { done } -get_supported_cluster_version() { - if [[ -n "${CLUSTER_VERSION:-}" ]]; then - local match - match=$(gcloud container get-server-config --format json | jq "[.validMasterVersions | .[] | select(.|test(\"^${CLUSTER_VERSION}\"))][0]") - if [[ -z "${match}" || "${match}" == "null" ]]; then - echo "A supported version cannot be found that matches ${CLUSTER_VERSION}." - echo "Valid master versions are:" - gcloud container get-server-config --format json | jq .validMasterVersions - exit 1 - fi - CLUSTER_VERSION=$(sed -e 's/^"//' -e 's/"$//' <<<"${match}") +ensure_supported_cluster_version() { + local match + match=$(gcloud container get-server-config --format json | jq "[.validMasterVersions | .[] | select(.|test(\"^${GKE_CLUSTER_VERSION}\"))][0]") + if [[ -z "${match}" || "${match}" == "null" ]]; then + echo "ERROR: A supported version cannot be found that matches ${GKE_CLUSTER_VERSION}." + echo "Valid master versions are:" + gcloud container get-server-config --format json | jq .validMasterVersions + exit 1 fi + GKE_CLUSTER_VERSION=$(sed -e 's/^"//' -e 's/"$//' <<<"${match}") } refresh_gke_token() { @@ -245,7 +288,9 @@ refresh_gke_token() { sleep 900 & pid="$!" kill_sleep() { + # shellcheck disable=SC2317 echo "refresh_gke_token() terminated, killing the background sleep ($pid)" + # shellcheck disable=SC2317 kill "$pid" } trap kill_sleep SIGINT SIGTERM @@ -263,15 +308,26 @@ refresh_gke_token() { } teardown_gke_cluster() { - info "Tearing down the GKE cluster: ${CLUSTER_NAME:-}" + local canceled="${1:-false}" + + info "Tearing down the GKE cluster: ${CLUSTER_NAME:-}, canceled: ${canceled}" require_environment "CLUSTER_NAME" require_executable "gcloud" - # (prefix output to avoid triggering prow log focus) - "$SCRIPTS_ROOT/scripts/ci/cleanup-deployment.sh" 2>&1 | sed -e 's/^/out: /' || true + if [[ "${canceled}" == "false" ]]; then + # (prefix output to avoid triggering prow log focus) + "$SCRIPTS_ROOT/scripts/ci/cleanup-deployment.sh" 2>&1 | sed -e 's/^/out: /' || true + fi - gcloud config set compute/zone "${ZONE}" + for i in {1..10}; do + gcloud container clusters describe "${CLUSTER_NAME}" --format "flattened(status)" + if [[ ! "$(gcloud container clusters describe "${CLUSTER_NAME}" --format 'get(status)')" =~ PROVISIONING|RECONCILING ]]; then + break + fi + info "Before deleting, waiting for cluster ${CLUSTER_NAME} to leave provisioning state (wait $i of 10)" + sleep 60 + done gcloud container clusters delete "$CLUSTER_NAME" --async info "Cluster deleting asynchronously"