Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions .openshift-ci/ci_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,19 @@ class BaseTest:
def __init__(self):
    # Directories holding output produced by a test run. Starts empty;
    # ScaleTest.run appends its OUTPUT_DIR after the scale test script
    # returns. NOTE(review): the consumer of this list is not visible here.
    self.test_output_dirs = []

def run_with_graceful_kill(self, args, timeout, post_start_hook=None):
    """Run a test command and stop it, children included, on timeout.

    Args:
        args: Command line (program followed by its arguments) passed to
            ``subprocess.Popen``.
        timeout: Seconds to wait for the command to finish.
        post_start_hook: Optional zero-argument callable invoked once,
            right after the command has been started.

    Raises:
        RuntimeError: The command exited with a non-zero status.
        subprocess.TimeoutExpired: The command did not finish within
            ``timeout``. Re-raised after the command has been stopped.
    """
    with subprocess.Popen(args) as cmd:
        if post_start_hook is not None:
            post_start_hook()
        try:
            exitstatus = cmd.wait(timeout)
            if exitstatus != 0:
                raise RuntimeError(f"Test failed: exit {exitstatus}")
        except subprocess.TimeoutExpired as err:
            # Kill child processes as we cannot rely on bash scripts to
            # handle signals and stop tests.
            #
            # This step is best effort: pkill exits non-zero when the
            # command has no children, and pkill itself may be missing or
            # hang. None of that may prevent the kill of the test command
            # below; otherwise leaving the ``with`` block would wait on the
            # still-running command without any timeout.
            try:
                subprocess.run(
                    ["/usr/bin/pkill", "-P", str(cmd.pid)],
                    check=False,
                    timeout=5,
                )
            except (OSError, subprocess.SubprocessError) as kill_err:
                print(f"Could not kill children of {cmd.pid}: {kill_err}")
            # Then kill the test command
            popen_graceful_kill(cmd)
            raise err

Expand All @@ -41,7 +45,7 @@ def run(self):

self.run_with_graceful_kill(
["scripts/ci/jobs/e2etests/e2e-tests.sh"],
E2ETest.TEST_TIMEOUT,
self.TEST_TIMEOUT,
)


Expand All @@ -55,10 +59,10 @@ def run(self):

self.run_with_graceful_kill(
["scripts/ci/jobs/e2etests/scale-tests.sh"],
ScaleTest.TEST_TIMEOUT,
self.TEST_TIMEOUT,
)

self.test_output_dirs.append(ScaleTest.OUTPUT_DIR)
self.test_output_dirs.append(self.OUTPUT_DIR)


class SlimE2ETest(BaseTest):
Expand All @@ -69,5 +73,5 @@ def run(self):

self.run_with_graceful_kill(
["scripts/ci/jobs/e2etests/slim-e2e-tests.sh"],
SlimE2ETest.TEST_TIMEOUT,
self.TEST_TIMEOUT,
)
42 changes: 26 additions & 16 deletions .openshift-ci/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
Clusters used in test

Copied from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/clusters.py
Adapted from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/clusters.py
"""

import os
Expand All @@ -25,35 +25,42 @@ def teardown(self):
class GKECluster:
# Provisioning timeout is tightly coupled to the time it may take gke.sh to
# create a cluster.
PROVISION_TIMEOUT = 90 * 60
PROVISION_TIMEOUT = 140 * 60
WAIT_TIMEOUT = 20 * 60
TEARDOWN_TIMEOUT = 5 * 60
# separate script names used for testability - test_clusters.py
PROVISION_PATH = "scripts/ci/gke.sh"
WAIT_PATH = "scripts/ci/gke.sh"
REFRESH_PATH = "scripts/ci/gke.sh"
TEARDOWN_PATH = "scripts/ci/gke.sh"

def __init__(self, cluster_id, num_nodes=3, machine_type="e2-standard-4"):
def __init__(self, cluster_id, num_nodes=None, machine_type=None, disk_gb=None):
self.cluster_id = cluster_id
self.num_nodes = num_nodes
self.machine_type = machine_type
self.disk_gb = disk_gb
self.refresh_token_cmd = None

def provision(self):
if self.num_nodes is not None:
os.environ["NUM_NODES"] = str(self.num_nodes)
if self.machine_type is not None:
os.environ["MACHINE_TYPE"] = str(self.machine_type)
if self.disk_gb is not None:
os.environ["DISK_SIZE_GB"] = str(self.disk_gb)
with subprocess.Popen(
[
GKECluster.PROVISION_PATH,
self.PROVISION_PATH,
"provision_gke_cluster",
self.cluster_id,
str(self.num_nodes),
self.machine_type,
]
) as cmd:

try:
exitstatus = cmd.wait(GKECluster.PROVISION_TIMEOUT)
exitstatus = cmd.wait(self.PROVISION_TIMEOUT)
if exitstatus != 0:
raise RuntimeError(f"Cluster provision failed: exit {exitstatus}")
raise RuntimeError(
f"Cluster provision failed: exit {exitstatus}")
except subprocess.TimeoutExpired as err:
popen_graceful_kill(cmd)
raise err
Expand All @@ -62,38 +69,41 @@ def provision(self):
signal.signal(signal.SIGINT, self.sigint_handler)

subprocess.run(
[GKECluster.WAIT_PATH, "wait_for_cluster"],
[self.WAIT_PATH, "wait_for_cluster"],
check=True,
timeout=GKECluster.WAIT_TIMEOUT,
timeout=self.WAIT_TIMEOUT,
)

# pylint: disable=consider-using-with
self.refresh_token_cmd = subprocess.Popen(
[GKECluster.REFRESH_PATH, "refresh_gke_token"]
[self.REFRESH_PATH, "refresh_gke_token"]
)

return self

def teardown(self, canceled=False):
    """Tear down the GKE cluster by running the teardown script.

    Teardown is paused, re-checking every 60 seconds, for as long as the
    file ``/tmp/hold-cluster`` exists.

    Args:
        canceled: When true, the token refresh process is left alone and
            ``"true"`` is passed to ``teardown_gke_cluster`` as an extra
            argument. The SIGINT handler of this class calls teardown
            this way.

    Returns:
        This cluster object.

    Raises:
        subprocess.CalledProcessError: The teardown script exited with a
            non-zero status.
        subprocess.TimeoutExpired: The teardown script ran longer than
            ``TEARDOWN_TIMEOUT`` seconds.
    """
    while os.path.exists("/tmp/hold-cluster"):
        print("Pausing teardown because /tmp/hold-cluster exists")
        time.sleep(60)

    if self.refresh_token_cmd is not None and not canceled:
        print("Terminating GKE token refresh")
        try:
            popen_graceful_kill(self.refresh_token_cmd)
        except Exception as err:
            # Deliberately best effort: a stuck token refresh must not
            # prevent the cluster itself from being torn down.
            print(f"Could not terminate the token refresh: {err}")

    args = [self.TEARDOWN_PATH, "teardown_gke_cluster"]
    if canceled:
        args.append("true")
    subprocess.run(
        args,
        check=True,
        timeout=self.TEARDOWN_TIMEOUT,
    )

    return self

def sigint_handler(self, signum, frame):
    """Signal handler that tears the cluster down when SIGINT arrives.

    Args:
        signum: Signal number passed in by the ``signal`` module.
        frame: Stack frame that was interrupted; only printed.
    """
    print("Tearing down the cluster due to SIGINT", signum, frame)
    # Exactly one teardown, flagged as a cancellation.
    self.teardown(canceled=True)
15 changes: 13 additions & 2 deletions .openshift-ci/common.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
from datetime import datetime
import subprocess

'''
Copied from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/common.py
Adapted from https://github.com/stackrox/stackrox/blob/master/.openshift-ci/common.py
'''

def popen_graceful_kill(cmd):
    """Stop a running subprocess: SIGTERM first, SIGKILL if that fails.

    The process gets 5 seconds to exit after SIGTERM. If it is still
    running it is sent SIGKILL and given another 5 seconds. Each step is
    logged with a timestamp.

    Args:
        cmd: A started ``subprocess.Popen`` object.

    Raises:
        subprocess.TimeoutExpired: The process is still running 5 seconds
            after SIGKILL was sent.
    """
    log_print(f"Sending SIGTERM to {cmd.args}")
    cmd.terminate()
    try:
        cmd.wait(5)
    except subprocess.TimeoutExpired as err:
        log_print(f"Exception raised waiting after SIGTERM to {cmd.args}, {err}")
        # SIGKILL if necessary
        log_print(f"Sending SIGKILL to {cmd.args}")
        cmd.kill()
        cmd.wait(5)
    # Reached on both paths: the process has exited and has been reaped.
    # A successful SIGKILL is not reported as an error; callers that need
    # to signal their own timeout raise it themselves.
    log_print("Terminated")

def log_print(*args):
    """Print ``args`` to stdout prefixed with the current local time.

    The line has the form ``HH:MM:SS: arg1 arg2 ...``; the arguments are
    formatted and separated exactly as ``print`` does.

    Args:
        *args: Values to print after the timestamp.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    # Flush right away so the timestamped line keeps its position relative
    # to output that child processes write to the same stdout.
    print(f"{timestamp}:", *args, flush=True)
6 changes: 4 additions & 2 deletions scripts/ci/cleanup-deployment.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env bash

# Copied from https://github.com/stackrox/stackrox/blob/master/scripts/ci/cleanup-deployment.sh
#
# Deletes the resources of the listed kinds found in a namespace and waits
# for the deletions to finish.
#
# Usage: cleanup-deployment.sh [namespace]
#   namespace  Namespace to clean up (default: stackrox)

namespace=${1:-stackrox}

# xargs -r: skip `kubectl delete` entirely when nothing was listed, instead
# of invoking it without any resource names.
kubectl -n "${namespace}" get cm,deploy,ds,networkpolicy,pv,pvc,secret,svc,serviceaccount -o name \
    | xargs -r kubectl -n "${namespace}" delete --wait
Loading