Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically kill stuck PR tests and report back #2440

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion jenkins/start-slave.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ SCRIPT_DIR=$(cd $(dirname $0); /bin/pwd)
TARGET=$1
CLEANUP_WORKSPACE=$2
REMOTE_USER=$(echo $TARGET | sed 's|@.*||')
SSH_OPTS="-q -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ServerAliveInterval=60"
SSH_OPTS="-q -o IdentitiesOnly=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ConnectTimeout=60"

# Check unique slave connection
if [ "${SLAVE_UNIQUE_TARGET}" = "YES" ] ; then
Expand Down
26 changes: 26 additions & 0 deletions kill-stuck-pr-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Report a PR test that is stuck waiting for a node and request that the
# corresponding Jenkins job be aborted.
#
# Environment read by this script:
#   UPLOAD_UNIQUE_ID / UPLOAD_UNIQ_ID  id of the PR test upload area
#   PULL_REQUEST                       "<repository>#<pr id>"
#   CONTEXT                            prefix of the commit status contexts
#   JENKINS_PROJECT_TO_KILL, JENKINS_PROJECT_PARAMS, EXTRA_PARAMS
#                                      copied verbatim into abort-jenkins-job.prop
#
# Effects: pending "<CONTEXT>/.../rocm" commit statuses are updated,
# abort-jenkins-job.prop is written to the current directory and ROCm failure
# records are written to ${RESULTS_DIR} before the results are uploaded.
rm -f *.prop

# parse_jenkins_builds.py writes this value as UPLOAD_UNIQ_ID in its
# abort-*.prop files, so accept that spelling as well; otherwise the guard
# below would always exit before doing anything.
UPLOAD_UNIQUE_ID="${UPLOAD_UNIQUE_ID:-${UPLOAD_UNIQ_ID}}"

# Nothing to do without the upload id or the pull request reference
if [ "X${UPLOAD_UNIQUE_ID}" = "X" ] ; then exit 0 ; fi
if [ "X${PULL_REQUEST}" = "X" ] ; then exit 0 ; fi

REPOSITORY=$(echo "${PULL_REQUEST}" | cut -d '#' -f 1)
PR_ID=$(echo "${PULL_REQUEST}" | cut -d '#' -f 2)

# prs_commits.txt is matched on "<PULL_REQUEST>=" lines; keep only what follows
# the '=' so that the bare commit id (not the whole line) is passed on below.
COMMIT_ID=$(curl -s -L "http://localhost/SDT/jenkins-artifacts/pull-request-integration/${UPLOAD_UNIQUE_ID}/prs_commits.txt" | grep "^${PULL_REQUEST}=" | head -n 1 | sed 's|^[^=]*=||')
if [ "X${COMMIT_ID}" = "X" ] ; then exit 0 ; fi

# NOTE(review): the script added alongside this one is named
# "update-commit-statues-matching.py" ("statues") and has no shebang line;
# confirm that this path resolves and is directly executable.
./cms-bot/update-commit-statuses-matching.py -r "${REPOSITORY}" -c "${COMMIT_ID}" -p "${CONTEXT}" rocm

# Property file describing which Jenkins job to abort and with which parameters
touch abort-jenkins-job.prop
echo "JENKINS_PROJECT_TO_KILL=${JENKINS_PROJECT_TO_KILL}" >> abort-jenkins-job.prop
echo "JENKINS_PROJECT_PARAMS=${JENKINS_PROJECT_PARAMS}" >> abort-jenkins-job.prop
echo "EXTRA_PARAMS=${EXTRA_PARAMS}" >> abort-jenkins-job.prop

# Presumably provides RESULTS_DIR and prepare_upload_results used below -- TODO confirm
source "$(dirname "$0")/setup-pr-test-env.sh"

# Record the ROCm relval and unit tests as failed because no node was available
echo "MATRIXROCM_TESTS;ERROR,Matrix ROCM Tests Outputs,Timed out waiting for node,none" > "${RESULTS_DIR}/relvalROCM.txt"
echo "RelVals-ROCM" > "${RESULTS_DIR}/12ROCM-relvals-failed.res"
echo "rocm_UNIT_TEST_RESULTS;ERROR,ROCM GPU Unit Tests,Timed out waiting for node,none" > "${RESULTS_DIR}/unittestrocm.txt"
echo "rocmUnitTests" > "${RESULTS_DIR}/14-failed.res"
prepare_upload_results
5 changes: 5 additions & 0 deletions parse_jenkins_builds.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"whitelist": ["ib-run-pr-unittests", "ib-run-pr-relvals", "ib-run-baseline"],
"timeout": 3600,
"custom": {}
}
73 changes: 63 additions & 10 deletions parse_jenkins_builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
from es_utils import send_payload, get_payload, resend_payload, get_payload_wscroll
from cmsutils import epoch2week
import json

JENKINS_PREFIX = "jenkins"
try:
Expand Down Expand Up @@ -149,25 +150,77 @@ def grep(filename, pattern, verbose=False):
jque_res = subprocess.run(que_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
queue_json = json.loads(jque_res.stdout)

# Abort policy: "whitelist" lists the jobs that may be aborted, "timeout" is
# the default limit compared against the queue wait and "custom" holds
# per-job overrides of that limit.
with open("parse_jenkins_builds.json") as f:
    config = json.load(f)

jenkins_queue = dict()
current_time = get_current_time()

# Index of the next abort-<N>.prop file. It is initialised once, before the
# loop, so that every stuck job gets its own file instead of all of them
# overwriting abort-0.prop.
kill_index = 0

for element in queue_json["items"]:
    job_name = element["task"]["name"]
    queue_id = int(element["id"])
    queue_time = int(element["inQueueSince"])
    # Drop any non-ASCII characters from the queue reason before parsing it
    labels = element["why"].encode("ascii", "ignore").decode("ascii", "ignore")
    reason = process_queue_reason(labels)

    payload = {
        "jenkins_server": JENKINS_PREFIX,
        "in_queue_since": queue_time,
        "queue_id": queue_id,
        "job_name": job_name,
        "node_labels": reason,
        "in_queue": 1,
        "wait_time": current_time - queue_time,
        "start_time": 0,
    }

    # Abort stuck jobs: a whitelisted job waiting on one specific offline node
    # for longer than its limit. wait_time is divided by 1000 before the
    # comparison (presumably milliseconds vs. a limit in seconds -- confirm).
    # This section deliberately never uses `continue`, so the rest of the loop
    # body still processes the payload of every queue item.
    if (
        job_name in config["whitelist"]
        and reason.endswith("-offline")
        and reason != "multiple-offline"
        and (payload["wait_time"] / 1000 > config["custom"].get(job_name, config["timeout"]))
    ):
        # The job parameters arrive as one "KEY=VALUE" entry per line
        params = dict(
            line.split("=", 1)
            for line in (element.get("params") or "").strip().splitlines()
            if "=" in line
        )

        # Only ROCm flavoured jobs are handled here
        if "rocm" in (params.get("GPU_FLAVOR"), params.get("TEST_FLAVOR")):
            # Try to reconnect the node (up to three attempts); any() stops at
            # the first successful attempt, so a success on the last try is
            # not mistaken for a failure.
            node_name = reason.rsplit("-", 1)[0]
            connect_node = os.environ.get("JENKINS_CLI_CMD") + " connect-node " + node_name + " -f"
            reconnected = any(os.system(connect_node) == 0 for _ in range(3))

            required = ("PULL_REQUEST", "RELEASE_FORMAT", "CONTEXT_PREFIX", "UPLOAD_UNIQ_ID")
            # Without all required parameters the abort request cannot be
            # described, so no property file is written for this job.
            if not reconnected and all(key in params for key in required):
                pull_request = params["PULL_REQUEST"]
                main_params = f"PULL_REQUEST={pull_request}"
                release = params["RELEASE_FORMAT"]
                context = params["CONTEXT_PREFIX"]
                upload_unique_id = params["UPLOAD_UNIQ_ID"]

                # Every parameter except PULL_REQUEST, joined as "K=V;K=V;..."
                other_params = ";".join(
                    f"{k}={v}" for k, v in params.items() if k != "PULL_REQUEST"
                )

                with open(f"abort-{kill_index}.prop", "w") as f:
                    f.write(f"UPLOAD_UNIQ_ID={upload_unique_id}\n")
                    f.write(f"PULL_REQUEST={pull_request}\n")
                    f.write(f"CONTEXT={context}\n")
                    f.write(f"JENKINS_PROJECT_TO_KILL={job_name}\n")
                    f.write(f"JENKINS_PROJECT_PARAMS={main_params}\n")
                    f.write(f"EXTRA_PARAMS={other_params}\n")
                    f.write(f"RELEASE_FORMAT={release}\n")

                kill_index += 1

unique_id = (
JENKINS_PREFIX + ":/build/builds/" + job_name + "/" + str(queue_id)
Expand Down
36 changes: 36 additions & 0 deletions update-commit-statues-matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import github_utils
import argparse


def main():
    """Update pending commit statuses whose context matches a prefix and suffix.

    Command line:
        -r/--repository  repository passed to the ``github_utils`` calls
        -c/--commit      commit whose combined statuses are inspected
        -p/--prefix      context prefix; a status matches when its context
                         starts with ``"<prefix>/"``
        suffix           context suffix; a status matches when its context
                         ends with ``"/<suffix>"``

    Every matching status that is still ``"pending"`` is re-marked through
    ``github_utils.mark_commit_status`` with state ``"success"`` and the text
    "Timed out waiting for node".

    The three options are required: without them the prefix would silently
    become ``"None/"`` and ``None`` would be handed to ``github_utils``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repository", "-r", required=True)
    parser.add_argument("--commit", "-c", required=True)
    parser.add_argument("--prefix", "-p", required=True)
    parser.add_argument("suffix")
    args = parser.parse_args()

    status_prefix = f"{args.prefix}/"
    status_suffix = f"/{args.suffix}"

    all_statuses = github_utils.get_combined_statuses(args.commit, args.repository).get(
        "statuses", []
    )

    for status in all_statuses:
        if (
            status["context"].startswith(status_prefix)
            and status["context"].endswith(status_suffix)
            and status["state"] == "pending"
        ):
            # NOTE(review): the trailing positional arguments look like
            # (state, url, description) -- confirm against github_utils.
            github_utils.mark_commit_status(
                args.commit,
                args.repository,
                status["context"],
                "success",
                "",
                "Timed out waiting for node",
            )


if __name__ == "__main__":
    main()
Loading