-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwatchdog
executable file
·68 lines (57 loc) · 2.22 KB
/
watchdog
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env bash
#
# Node watchdog configuration. Both tunables may be overridden through the
# environment; an unset or empty value falls back to the default.
#   FAIL_COUNT      failure count at which an instance is marked for
#                   termination (default: 2)
#   CHECK_INTERVAL  pause between two health checks, in seconds (default: 30)
set -e

: "${FAIL_COUNT:=2}"
: "${CHECK_INTERVAL:=30}" # seconds
# Derive the AWS region from the availability zone reported by the EC2
# instance metadata endpoint by stripping the trailing zone letter
# (e.g. "us-east-1a" -> "us-east-1").
#   --fail      keeps an HTTP error body from being mistaken for a zone name
#   timeouts    keep startup from hanging when the endpoint is unreachable
# The pipeline's exit status is sed's, so a failed lookup leaves the variable
# empty instead of aborting the script; warn so that the problem is visible
# before the region is needed.
region=$(
  curl -sf --connect-timeout 5 --max-time 10 \
    'http://169.254.169.254/latest/meta-data/placement/availability-zone/' \
    | sed 's/[a-z]$//'
)
if [[ -z "$region" ]]; then
  echo "WARNING: could not determine the AWS region from instance metadata" >&2
fi
# Failure counters, keyed by instance ID: how many checks have reported the
# instance as unhealthy. Starts out empty.
# (Unquoted "=()" is the supported way to write an empty compound assignment;
# the quoted '()' form is deprecated in newer bash releases.)
declare -A nodes=()

# A non-empty DEBUG enables shell tracing and lets curl print its normal
# progress/error output; otherwise curl is run with -s.
# NOTE: tracing echoes every expanded command line, including credentials
# passed as arguments.
if [[ -n "${DEBUG:-}" ]]; then
  silent_flag=""
  set -x
else
  silent_flag="-s"
fi
#######################################
# Send an authenticated request to the in-cluster Kubernetes API using the
# pod's service-account token and CA certificate.
# Globals:
#   silent_flag              (read) extra curl flag; left out when empty
#   KUBERNETES_SERVICE_HOST  (read) host name/address of the API server
# Arguments:
#   $1    request path relative to the API root, e.g. api/v1/nodes
#   $2..  forwarded to curl unchanged, after the URL
# Outputs:
#   Whatever curl writes to stdout (the response body).
# Returns:
#   curl's exit status.
#######################################
curl_k8s()
{
  local sa_dir=/var/run/secrets/kubernetes.io/serviceaccount
  local path=${1-}
  if (( $# > 0 )); then
    shift
  fi

  # The URL is quoted so that paths/query strings containing spaces or glob
  # characters (?, *, [) reach curl as a single, unmodified argument.
  # NOTE: when shell tracing is on, the bearer token appears in the trace.
  curl ${silent_flag:+"$silent_flag"} \
    -H "Content-type: application/json" \
    -H "Authorization: Bearer $(cat "${sa_dir}/token")" \
    --cacert "${sa_dir}/ca.crt" \
    "https://${KUBERNETES_SERVICE_HOST}/${path}" "$@"
}
#######################################
# Print a timestamped message.
# Arguments:
#   The message. Several arguments are joined with the first character of
#   IFS (a space by default).
# Outputs:
#   "[YYYY-MM-DD HH:MM:SS] <message>" followed by a newline, on stdout.
#######################################
log()
{
  # "$*" keeps the message exactly as passed: no whitespace collapsing and no
  # pathname expansion of characters such as "*".
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
### Health-check code ###
# Runs forever. On every pass:
#   1. Fetch the node list from the Kubernetes API and collect the
#      .spec.externalID of each node that has a Ready condition with status
#      "Unknown" or with a message containing "not healthy".
#   2. Increment the failure counter of each such instance. When a counter
#      becomes equal to FAIL_COUNT, the instance is marked Unhealthy through
#      the AWS autoscaling API.
#   3. Forget the counter of every tracked instance that is no longer in the
#      unhealthy list.
#   4. Sleep for CHECK_INTERVAL seconds.
# The script runs with `set -e`, so a failing curl, jq or aws call ends it.
while true; do
  node_json=$(curl_k8s api/v1/nodes)
  if [[ -n "${DEBUG:-}" ]]; then
    # Dump the response that is actually evaluated below.
    printf '%s\n' "$node_json"
  fi

  total_node_count=$(jq '.items | length' <<<"$node_json")
  unhealthy_ids=$(jq -r '.items[] | select(.status.conditions[] | contains({"type":"Ready","status":"Unknown"}) or contains({"type":"Ready","message":"not healthy"})) | .spec.externalID ' <<<"$node_json")

  # jq prints one ID per line; blank lines are skipped.
  unhealthy=()
  while IFS= read -r instance_id; do
    if [[ -n "$instance_id" ]]; then
      unhealthy+=("$instance_id")
    fi
  done <<<"$unhealthy_ids"

  if [[ -n "${DEBUG:-}" ]]; then
    echo "Unhealthy nodes: ${unhealthy[*]}"
  fi

  # The first check is a "circuit breaker": if every node looks unhealthy
  # (a catastrophic event, a network failure, or an empty response from the
  # API, where both counts are 0), do not terminate anything, so that this
  # script can never take down the whole cluster.
  if [[ "$total_node_count" == "${#unhealthy[@]}" ]]; then
    log "All nodes appear offline; will NOT attempt termination."
  elif (( ${#unhealthy[@]} == 0 )); then
    if [[ -n "${DEBUG:-}" ]]; then
      log "All nodes reported in"
    fi
  else
    for instance in "${unhealthy[@]}"; do
      nodes[$instance]=$(( ${nodes[$instance]:-0} + 1 ))
      # Act only when the counter is exactly FAIL_COUNT, so the termination
      # request is issued once rather than on every later pass.
      if [[ "${nodes[$instance]}" -eq "$FAIL_COUNT" ]]; then
        log "Instance ${instance} in status \"Unknown\" ${FAIL_COUNT} times. Terminating."
        aws autoscaling set-instance-health --region "$region" --health-status Unhealthy --instance-id "$instance"
      else
        log "Instance ${instance} in status \"Unknown\" ${nodes[$instance]} times."
      fi
    done
  fi

  # Reset the counter of every tracked instance that has recovered.
  for node in "${!nodes[@]}"; do
    if [[ " ${unhealthy[*]} " != *" ${node} "* ]]; then
      log "Instance ${node} reported in."
      # Remove the entry of the node being examined (not of the last
      # instance handled by the loop above).
      unset "nodes[$node]"
    fi
  done

  sleep "$CHECK_INTERVAL"
done