-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
check_tsd
executable file
·294 lines (268 loc) · 12 KB
/
check_tsd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
#!/usr/bin/env python
#
# Script which queries TSDB with a given metric and alerts based on
# supplied threshold. Compatible with Nagios output format, so can be
# used as a nagios command.
#
# This file is part of OpenTSDB.
# Copyright (C) 2010-2012 The OpenTSDB Authors.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 2.1 of the License, or (at your
# option) any later version. This program is distributed in the hope that it
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
# General Public License for more details. You should have received a copy
# of the GNU Lesser General Public License along with this program. If not,
# see <http://www.gnu.org/licenses/>.
#
#
# check_tsd -m mysql.slave.seconds_behind_master -t host=foo -t schema=mydb
# -d 600 -a avg -x gt -w 50 -c 100
#
try:
    import httplib
except ImportError:  # Python 3 ships this module as http.client.
    import http.client as httplib
import operator
import socket
import sys
import time
from optparse import OptionParser
# Function names main() accepts for -a/--aggregator and for -D/--downsample;
# any other name is rejected during argument validation.
AGGREGATORS = (
    'avg', 'count', 'dev', 'diff',
    'ep50r3', 'ep50r7', 'ep75r3', 'ep75r7',
    'ep90r3', 'ep90r7', 'ep95r3', 'ep95r7',
    'ep99r3', 'ep99r7', 'ep999r3', 'ep999r7',
    'mimmin', 'mimmax', 'min', 'max', 'none',
    'p50', 'p75', 'p90', 'p95', 'p99', 'p999',
    'sum', 'zimsum',
)

# Values main() accepts for -F/--downsample-fill-policy.
FILL_POLICIES = ('none', 'nan', 'null', 'zero')
def _parse_value(text):
    """Convert the value field of a data point line into a number.

    Plain integers stay ``int``; everything else that ``float()`` accepts
    (decimals, exponent notation, ``NaN``) becomes a ``float``.

    Raises:
      ValueError: if `text` is not numeric at all.
    """
    try:
        return int(text)
    except ValueError:
        return float(text)


def main(argv):
    """Query a TSD for one metric and print a Nagios status line for it.

    Args:
      argv: The full command line with the program name first (same layout
        as sys.argv); options are parsed from argv[1:].

    Returns:
      The Nagios exit status: 0 (OK), 1 (WARNING) or 2 (CRITICAL, or any
      connection / HTTP failure).  Invalid options do not return: they go
      through OptionParser.error(), which exits the process.
    """
    parser = OptionParser(description='Simple TSDB data extractor for Nagios.')
    parser.add_option('-H', '--host', default='localhost', metavar='HOST',
                      help='Hostname to use to connect to the TSD.')
    parser.add_option('-p', '--port', type='int', default=80, metavar='PORT',
                      help='Port to connect to the TSD instance on.')
    parser.add_option('-m', '--metric', metavar='METRIC',
                      help='Metric to query.')
    parser.add_option('-t', '--tag', action='append', default=[],
                      metavar='TAG', help='Tags to filter the metric on.')
    parser.add_option('-d', '--duration', type='int', default=600,
                      metavar='SECONDS', help='How far back to look for data.')
    parser.add_option('-D', '--downsample', default='none', metavar='METHOD',
                      help='Downsample function, e.g. one of avg, min, sum, or max.')
    parser.add_option('-W', '--downsample-window', type='int', default=60,
                      metavar='SECONDS', help='Window size over which to downsample.')
    parser.add_option('-F', '--downsample-fill-policy', default='none', metavar='POLICY',
                      help='Fill Policies, e.g. one of none, nan, null, or zero.')
    parser.add_option('-a', '--aggregator', default='sum', metavar='METHOD',
                      help='Aggregation method: avg, min, sum (default), max.')
    parser.add_option('-x', '--method', dest='comparator', default='gt',
                      metavar='METHOD', help='Comparison method: gt, ge, lt, le, eq, ne.')
    parser.add_option('-r', '--rate', default=False,
                      action='store_true', help='Use rate value as comparison operand.')
    parser.add_option('-w', '--warning', type='float', metavar='THRESHOLD',
                      help='Threshold for warning. Uses the comparison method.')
    parser.add_option('-c', '--critical', type='float', metavar='THRESHOLD',
                      help='Threshold for critical. Uses the comparison method.')
    parser.add_option('-v', '--verbose', default=False,
                      action='store_true', help='Be more verbose.')
    parser.add_option('-T', '--timeout', type='int', default=10,
                      metavar='SECONDS',
                      help='How long to wait for the response from TSD.')
    parser.add_option('-E', '--no-result-ok', default=False,
                      action='store_true',
                      help='Return OK when TSD query returns no result.')
    parser.add_option('-I', '--ignore-recent', default=0, type='int',
                      metavar='SECONDS',
                      help='Ignore data points that are that recent.')
    parser.add_option('-P', '--percent-over', dest='percent_over', default=0,
                      metavar='PERCENT', type='float', help='Only alarm if PERCENT of the data'
                      ' points violate the threshold.')
    parser.add_option('-N', '--now', type='int', default=None,
                      metavar='UTC',
                      help='Set unix timestamp for "now", for testing')
    # NOTE(review): --bad_percent is accepted but nothing below reads
    # options.bad_percent, so it currently has no effect.
    parser.add_option('-B', '--bad_percent', dest='bad_percent', default=None,
                      metavar='PERCENT', type='float', help='Ignore alarm if PERCENT of the data'
                      ' points is bad')
    parser.add_option('-S', '--ssl', default=False, action='store_true',
                      help='Make queries to OpenTSDB via SSL (https)')
    (options, args) = parser.parse_args(args=argv[1:])

    # argument validation (parser.error() prints usage and exits with 2)
    if options.comparator not in ('gt', 'ge', 'lt', 'le', 'eq', 'ne'):
        parser.error("Comparator '%s' not valid." % options.comparator)
    elif options.downsample not in ('none',) + AGGREGATORS:
        parser.error("Downsample '%s' not valid." % options.downsample)
    elif options.downsample_fill_policy not in FILL_POLICIES:
        parser.error("Downsample Fill policy '%s' not valid." % options.downsample_fill_policy)
    elif options.aggregator not in AGGREGATORS:
        parser.error("Aggregator '%s' not valid." % options.aggregator)
    elif not options.metric:
        parser.error('You must specify a metric (option -m).')
    elif options.duration <= 0:
        parser.error('Duration must be strictly positive.')
    elif options.downsample_window <= 0:
        parser.error('Downsample window must be strictly positive.')
    elif options.critical is None and options.warning is None:
        parser.error('You must specify at least a warning threshold (-w) or a'
                     ' critical threshold (-c).')
    elif options.ignore_recent < 0:
        parser.error('--ignore-recent must be positive.')
    elif options.percent_over < 0 or options.percent_over > 100:
        parser.error('--percent-over must be in the range 0..100.')
    options.percent_over /= 100.0  # Convert to range 0-1
    # When only one threshold is given it is used for both levels.
    if options.critical is None:
        options.critical = options.warning
    elif options.warning is None:
        options.warning = options.critical

    # argument construction
    tags = ','.join(options.tag)
    if tags:
        tags = '{' + tags + '}'

    # URL building and fetching
    if options.downsample == 'none':
        downsampling = ''
    else:
        downsampling = '%ds-%s-%s:' % (options.downsample_window,
                                       options.downsample,
                                       options.downsample_fill_policy)
    if options.rate:
        rate = 'rate:'
    else:
        rate = ''
    # Compare against None so that an explicit "--now 0" is honoured.
    if options.now is not None:
        now = options.now
        start = '%s' % (now - int(options.duration))
    else:
        now = int(time.time())
        start = '%ss-ago' % options.duration
    url = ('/q?start=%s&m=%s:%s%s%s%s&ascii&nagios'
           % (start, options.aggregator, downsampling, rate,
              options.metric, tags))
    tsd = '%s:%d' % (options.host, options.port)
    if options.ssl:  # Pick the class to instantiate first.
        conn_class = httplib.HTTPSConnection
    else:
        conn_class = httplib.HTTPConnection
    # The status output below uses str.format(), so Python 2.6+ is already
    # required and the timeout keyword is always available.
    conn = conn_class(tsd, timeout=options.timeout)
    try:
        try:
            conn.connect()
        except socket.error as e:
            print("ERROR: couldn't connect to %s: %s" % (tsd, e))
            return 2
        if options.verbose:
            peer = conn.sock.getpeername()
            print('Connected to %s:%d' % (peer[0], peer[1]))
            conn.set_debuglevel(1)
        try:
            conn.request('GET', url)
            res = conn.getresponse()
            datapoints = res.read()
        except (socket.error, httplib.HTTPException) as e:
            # Protocol-level failures are reported like socket failures
            # instead of escaping as a traceback.
            print("ERROR: couldn't GET %s from %s: %s" % (url, tsd, e))
            return 2
    finally:
        # Release the socket on every path, including the early returns.
        conn.close()

    # The body arrives as bytes on Python 3; work on text from here on.
    if not isinstance(datapoints, str):
        datapoints = datapoints.decode('utf-8', 'replace')

    # if failure...
    if res.status not in (200, 202):
        print('CRITICAL: status = %d when talking to %s:%d'
              % (res.status, options.host, options.port))
        if options.verbose:
            print('TSD said:')
            print(datapoints)
        return 2

    # but we won!
    datapoints = datapoints.splitlines()

    def no_data_point():
        """Report an empty result; the level depends on --no-result-ok."""
        if options.no_result_ok:
            print('OK: query did not return any data point (--no-result-ok)')
            return 0
        else:
            print('CRITICAL: query did not return any data point')
            return 2

    if not datapoints:
        return no_data_point()

    comparator = getattr(operator, options.comparator)
    badts = None   # Timestamp of the bad value we found, if any.
    badval = None  # Value of the bad value we found, if any.
    npoints = 0    # How many values have we seen?
    nbad = 0       # How many bad values have we seen?
    ncrit = 0      # How many critical values have we seen?
    nwarn = 0      # How many warning values have we seen?
    for datapoint_str in datapoints:
        # Field 1 is used as the timestamp and field 2 as the value.
        datapoint = datapoint_str.split()
        ts = int(datapoint[1])
        delta = now - ts
        if delta > options.duration or delta <= options.ignore_recent:
            if options.verbose:
                print('%s (ignored, delta %ds)' % (datapoint_str, delta))
            if delta < 0:
                break  # Skip the rest, we got what we came for.
            continue  # Ignore data points outside of our range.
        if options.verbose:
            print(datapoint_str)
        npoints += 1
        # Values without a '.' are not necessarily integers (e.g. 'NaN',
        # presumably possible with the 'nan' fill policy), so fall back to
        # float instead of crashing.
        val = _parse_value(datapoint[2])
        bad = False  # Is the current value bad?
        # compare to warning/crit; a critical value also counts as a warning
        if comparator(val, options.critical):
            bad = True
            ncrit += 1
            nwarn += 1
        elif comparator(val, options.warning):
            bad = True
            nwarn += 1
        if (bad and
            (badval is None                 # First bad value we find.
             or comparator(val, badval))):  # Worse value.
            badval = val
            badts = ts

    # Alarm only when the share of offending points exceeds --percent-over.
    # The "> 0" guards also keep us from dividing by npoints == 0.
    if ncrit > 0 and (float(ncrit) / npoints > options.percent_over):
        rv = 2
        nbad = ncrit
    elif nwarn > 0 and (float(nwarn) / npoints > options.percent_over):
        rv = 1
        nbad = nwarn
    else:
        rv = 0

    if options.verbose and len(datapoints) != npoints:
        print('ignored %d/%d data points for being more than %ds old'
              % (len(datapoints) - npoints, len(datapoints), options.duration))
    if not npoints:
        return no_data_point()
    if badts is not None:
        if options.verbose:
            print('worse data point value=%s at ts=%s' % (badval, badts))
        badts = time.asctime(time.localtime(badts))

    bad_pct = nbad * 100.0 / npoints
    # in nrpe, pipe character is something special, but it's used in tag
    # searches. Translate it to something else for the purposes of output.
    ttags = tags.replace("|", ":")
    # Retrieve metric name for performance data label.
    perf_label = options.metric.split(".")[-1]
    perfdata = ' | {0}={1};{2};{3};0;{3}'.format(
        perf_label, npoints, options.warning, options.critical)
    if not rv:
        # npoints > 0 here, so `val` holds the last value that was examined.
        status = 'OK: %s%s: %d values OK, last=%r' \
            % (options.metric, ttags, npoints, val)
    else:
        if rv == 1:
            level = 'WARNING'
            threshold = options.warning
        else:
            level = 'CRITICAL'
            threshold = options.critical
        status = '%s: %s%s %s %s: %d/%d bad values (%.1f%%) worst: %r @ %s' \
            % (level, options.metric, ttags, options.comparator, threshold,
               nbad, npoints, bad_pct, badval, badts)
    print(status + perfdata)
    return rv
# Script entry point: run the check against the real command line and hand
# main()'s return value to the caller as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)