#!/bin/dash
#
# Copyright (C) Nikhef 2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author:
#     Mischa Sall\'e <msalle@nikhef.nl>
#     NIKHEF Amsterdam, the Netherlands
#
########################################################################
#
# Nagios probe to test functioning of gLExec
#
# Nagios state can be one of the following:
# - Missing glexec command: CRITICAL
# - input proxies empty: UNKNOWN
# - short timeout exceeded: WARNING
# - timeout exceeded: CRITICAL
# - gLExec exit codes:
#     0 glexec succeeded: OK
#     201 Client error: CRITICAL
#     202 Internal error: CRITICAL
#     203 Auth error: CRITICAL
#     204 Overlap: CRITICAL
#     126 execve failed: WARNING
#     128+n signal: WARNING
#     !=0 rc of payload: WARNING
#
########################################################################

# Probe version, reported by -V/--version
VERSION=0.1

# Plugin name as shown in usage and error messages.
# "$0" is quoted so a script path containing whitespace is handled.
PROG=$(basename "$0")
# Default payload command and its arguments; the full default command line
# (DEF_GLEXEC_CMD_EXE) is only built once the arguments are parsed
DEF_GLEXEC_CMD="id"
DEF_GLEXEC_CMD_ARGS="-a"
# Path of the glexec executable itself, filled in by find_glexec
GLEXEC_EXE=""
# Default verbosity
VERBOSE=0
# Default (hard) timeout in seconds: a probe still running after this long
# is sent SIGKILL
TIMEOUT=10
# Default critical timeout in seconds: a probe still running after this long
# is sent SIGTERM
CRITTIMEOUT=8
# Default short timeout in seconds: a successful run that took longer than
# this results in a warning
WARNTIMEOUT=5
# Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
GLEXEC_CLIENT_CERT=""

########################################################################
#
# general options
#
########################################################################

# Prints the one-line usage summary on stdout, then terminates the
# (sub)shell with exit status 0.
# Globals: PROG (read)
shortusage() {
    echo "Usage: ${PROG} [options]"
    exit 0
}

# Prints the long help text (usage line plus one line per option, including
# the current default values) on stdout, then terminates the (sub)shell with
# exit status 0.
# Globals: PROG, TIMEOUT, WARNTIMEOUT, CRITTIMEOUT, DEF_GLEXEC_CMD_EXE (read)
usage() {
    # Unquoted here-document delimiter: the variables below are expanded.
    cat <<EOF
Usage: $PROG [options]

Options:
 -t|--timeout <timeout> maximum runtime for probe, default $TIMEOUT sec
 -w|--warning <timeout> runtime after which to warn, default $WARNTIMEOUT sec
 -c|--critical <timeout> runtime after which the probe is to be killed, default $CRITTIMEOUT sec
 -x|--x509-user-proxy <file> set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
 default: value of variable X509_USER_PROXY
 -e|--execute <cmd> command to execute by gLExec
 default: "$DEF_GLEXEC_CMD_EXE"
 -v|--verbose be more verbose, more -v means more verbosity
 -V|--version print version
 -h|--help show this helptext
EOF
    exit 0
}

# Log function: log <level> <message>...
# Writes every <message> argument on its own line to stdout, but only when
# the current verbosity is at least <level>; otherwise prints nothing.
# Globals: VERBOSE (read)
log() {
    msg_level=$1
    shift
    # Not verbose enough: stay silent
    [ "$VERBOSE" -ge "$msg_level" ] || return 0
    for msg in "$@"; do
        echo "$msg"
    done
}

# Prints the nagios status line and exits:
#     nagios_status <code> <summary words...>
# The line is emitted via log at level 0 in the form
#     NAGIOS_STATUS_LINE <code> <stat>: <summary>
# after which the (sub)shell exits with <code>.
# Globals: code, stat, summary (written)
nagios_status() {
    code=$1
    shift
    summary="$*"
    case "$code" in
        0) stat='OK' ;;
        1) stat='WARNING' ;;
        2) stat='CRITICAL' ;;
        3) stat='UNKNOWN' ;;
        # Double quotes and $code: $1 has already been shifted away, and
        # single quotes would print the variable name literally.
        *) stat="INVALID NAGIOS CODE $code" ;;
    esac
    log 0 "NAGIOS_STATUS_LINE $code $stat: $summary"
    exit "$code"
}

# Output filter: reads all lines from stdin, looks for a line of the form
#     NAGIOS_STATUS_LINE <exitcode> <text>
# and prints <text> first, followed by all other lines in their original
# order. Exits with <exitcode>; when no status line was seen the exit code
# is 0 and only the other lines are printed. If several status lines are
# present, the last one wins.
# Globals: logstring, nagiosline, code, line, rest, newline (written)
nagios_output() {
    logstring=""
    nagiosline=""
    code=0
    newline='
'
    # IFS= and -r keep leading whitespace and backslashes intact; the
    # || clause also handles a final line without trailing newline.
    while IFS= read -r line || [ -n "$line" ]; do
        case "$line" in
            "NAGIOS_STATUS_LINE "*)
                rest=${line#NAGIOS_STATUS_LINE }
                code=${rest%% *}
                nagiosline=${rest#* }
                ;;
            *)
                # Append with a real newline; empty lines before the first
                # non-empty one are dropped.
                logstring="${logstring:+${logstring}${newline}}${line}"
                ;;
        esac
    done
    # A status line without a numeric code cannot be used as exit status:
    # report it as 3 (shown as UNKNOWN by nagios_status).
    case "$code" in
        ''|*[!0-9]*) code=3 ;;
    esac
    # Always print data through '%s' so that '%' or '\' in the captured
    # output is never interpreted as a printf format.
    if [ -n "$nagiosline" ]; then
        printf '%s\n' "$nagiosline"
    fi
    if [ -n "$logstring" ]; then
        printf '%s\n' "$logstring"
    fi
    exit "$code"
}

# Reports a command line parsing error: parse_err <line>...
# Every <line> is logged at verbosity level 2; the first non-empty <line>
# becomes the summary handed to nagios_status together with code 3.
# Globals: code, summary (written)
parse_err() {
    code=3 # Unknown
    summary=""
    for errline in "$@"; do
        # Only the first non-empty line ends up in the summary
        : "${summary:=$errline}"
        log 2 "$errline"
    done
    nagios_status $code "$summary"
}

# Replacement for which (which might not exist): which_cmd <name>
# Walks the directories in PATH in order and prints the full path of the
# first regular, executable file called <name>.
# Returns 0 when found, 1 otherwise (nothing is printed in that case).
# Empty PATH components are skipped.
# Globals: PATH (read); searchpath, dir, cmd (written)
which_cmd() {
    # Split PATH on ':' with parameter expansion only, so directory names
    # containing whitespace or glob characters stay intact.
    searchpath="${PATH}:"
    while [ -n "$searchpath" ]; do
        dir=${searchpath%%:*}
        searchpath=${searchpath#*:}
        [ -n "$dir" ] || continue
        cmd="${dir}/$1"
        # stdout carries the result, so debug output has to go to stderr
        log 3 "Looking for $1 in $dir" >&2
        if [ -f "$cmd" ] && [ -x "$cmd" ]; then
            printf '%s\n' "$cmd"
            return 0
        fi
    done
    return 1
}

# Extracts the upper limit from a nagios range: parse_range <range>
# Accepted are a plain number ("10") or "<lower>:<upper>", where only the
# characters ~ : and 0-9 are allowed; the @ prefix is not understood.
# See http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Prints the upper limit on stdout (it may be empty, e.g. for "10:") and
# returns 0; returns 1 without output for more than one argument, invalid
# characters, or an upper limit that is not a plain number.
# Globals: range, uplimit (written)
parse_range() {
    [ $# -gt 1 ] && return 1
    range=$1
    # invalid chars?
    case "$range" in
        *[!~:0-9]*) return 1 ;;
    esac
    # find upperlimit: everything after the first ':' if there is one
    case "$range" in
        *:*) uplimit=${range#*:} ;;
        *)   uplimit=$range ;;
    esac
    # the upper limit itself must consist of digits only
    case "$uplimit" in
        *[!0-9]*) return 1 ;;
    esac
    # print limit, note: might be empty
    printf '%s\n' "$uplimit"
    return 0
}

# Parses the (short) command line options and stores the results in the
# corresponding globals. Timeout values go through parse_range; an empty
# result is reported via parse_err. -H, -p and -u are accepted but only
# logged as unused. Afterwards GLEXEC_CLIENT_CERT defaults to
# X509_USER_PROXY and GLEXEC_CMD_EXE to DEF_GLEXEC_CMD_EXE when empty.
# Globals: GLEXEC_CLIENT_CERT, X509_USER_PROXY, TIMEOUT, CRITTIMEOUT,
#          WARNTIMEOUT, GLEXEC_CMD_EXE, VERBOSE (written)
parse_short_args() {
    # Leading ':' selects silent error reporting: getopts then yields ':'
    # for a missing argument and '?' for an unknown option.
    while getopts ":g:x:t:c:w:e:hvVH:p:u:" opt; do
        case "$opt" in
            g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;
            x)  X509_USER_PROXY="$OPTARG" ;;
            t)  TIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$TIMEOUT" ]; then
                    log 2 "TIMEOUT set to $TIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            c)  CRITTIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$CRITTIMEOUT" ]; then
                    log 2 "CRITTIMEOUT set to $CRITTIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            w)  WARNTIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$WARNTIMEOUT" ]; then
                    log 2 "WARNTIMEOUT set to $WARNTIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            e)  GLEXEC_CMD_EXE="$OPTARG" ;;
            h)  shortusage ;;
            V)  nagios_status 0 "$PROG version $VERSION" ;;
            v)  VERBOSE=$((VERBOSE + 1)) ;;
            H)  log 2 "$PROG: option -H/--hostname is not used" ;;
            p)  log 2 "$PROG: option -p/--port is not used" ;;
            u)  log 2 "$PROG: option -u/--url is not used" ;;
            :)  parse_err "Option requires an argument -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
            \?) parse_err "Invalid option -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
        esac
    done
    # No explicit client cert: fall back to the user proxy
    if [ -z "$GLEXEC_CLIENT_CERT" ]; then
        GLEXEC_CLIENT_CERT="$X509_USER_PROXY"
        log 2 "Using same proxy for GLEXEC_CLIENT_CERT and X509_USER_PROXY"
    fi
    # No explicit payload: fall back to the default command
    if [ -z "$GLEXEC_CMD_EXE" ]; then
        GLEXEC_CMD_EXE="$DEF_GLEXEC_CMD_EXE"
        log 2 "Using default payload command \"$GLEXEC_CMD_EXE\""
    fi
}

# Entry point for command line parsing: parse_args "$@"
# First determines the default payload command line (DEF_GLEXEC_CMD_EXE)
# by looking up DEF_GLEXEC_CMD with which_cmd, then converts every long
# option into its short equivalent and hands the result to
# parse_short_args. --help prints the long usage text; an unknown long
# option is reported via parse_err. All other arguments are passed on
# unchanged.
# Globals: DEF_GLEXEC_CMD, DEF_GLEXEC_CMD_ARGS (read);
#          cmd, DEF_GLEXEC_CMD_EXE (written)
parse_args() {
    # Find default executable
    cmd=$(which_cmd "$DEF_GLEXEC_CMD")
    if [ -n "$cmd" ]; then
        DEF_GLEXEC_CMD_EXE="$cmd $DEF_GLEXEC_CMD_ARGS"
    fi
    # Rebuild the positional parameters in place: each iteration drops the
    # oldest parameter and appends its converted form, so after the loop
    # "$@" holds the converted list. Unlike building a string and running
    # it through eval, this keeps every argument verbatim (whitespace,
    # quotes, $ and backticks are never re-interpreted by the shell).
    for arg do
        shift
        case "$arg" in
            --x509-user-proxy)    set -- "$@" -x ;;
            --glexec-client-cert) set -- "$@" -g ;;
            --execute)            set -- "$@" -e ;;
            --timeout)            set -- "$@" -t ;;
            --critical)           set -- "$@" -c ;;
            --warning)            set -- "$@" -w ;;
            --verbose)            set -- "$@" -v ;;
            --version)            set -- "$@" -V ;;
            --help)               usage ;;
            # Unused long options:
            --hostname)           set -- "$@" -H ;;
            --port)               set -- "$@" -p ;;
            --url)                set -- "$@" -u ;;
            --*)
                parse_err "$PROG: invalid longoption -- '${arg#--}'" \
                    "Try \`$PROG -h' for more information."
                ;;
            *)                    set -- "$@" "$arg" ;;
        esac
    done
    # Now parse the resulting short options
    parse_short_args "$@"
}

# wait wrapper: wait_func <pid>
# Waits for <pid> and returns the status of wait. Error output of wait is
# only shown (merged into stdout) when VERBOSE is at least 1, otherwise it
# is discarded. Returns 1 when not called with exactly one argument.
# Globals: VERBOSE (read)
wait_func() {
    case $# in
        1)  ;;
        *)  log 2 "wait_func needs exactly one argument"
            return 1
            ;;
    esac
    # The status of the if statement, i.e. of wait, is the return value
    if [ $VERBOSE -ge 1 ]; then
        wait $1 2>&1
    else
        wait $1 2> /dev/null
    fi
}

# kill wrapper: kill_func [<signal option>] <pid>
# With one argument, sends -TERM to <pid>; with two, the first argument is
# passed to kill as the signal option (e.g. -9). Callers in this file pass
# a negated PID to address a process group. Error output of kill is only
# shown (merged into stdout) when VERBOSE is greater than 1, otherwise it
# is discarded. Returns the status of kill, or 1 when called without
# arguments.
# Globals: VERBOSE (read); signo, pid (written)
kill_func() {
    if [ $# -eq 0 ]; then
        log 2 "kill_func needs at least one argument"
        return 1
    fi
    if [ $# -eq 1 ]; then
        signo="-TERM"
        pid=$1
    else
        signo=$1
        pid=$2
    fi
    log 3 "About to send $signo to $pid"
    # NOTE(review): /bin/kill instead of the shell builtin is presumably
    # needed for the negative (process group) PIDs - confirm before changing.
    if [ "$VERBOSE" -gt 1 ]; then
        /bin/kill "$signo" "$pid" 2>&1
    else
        /bin/kill "$signo" "$pid" 2> /dev/null
    fi
    return $?
}

# Watchdog for a background process: waiter <pid>
# When CRITTIMEOUT is smaller than TIMEOUT: sleeps CRITTIMEOUT seconds and,
# if ps still lists <pid>, sends SIGTERM to -<pid>, then sleeps for the
# remainder of TIMEOUT. Otherwise it simply sleeps TIMEOUT seconds.
# Finally, if ps still lists <pid>, sends SIGKILL to -<pid>.
# Globals: TIMEOUT, CRITTIMEOUT (read); pid, code (written)
waiter() {
    pid=$1
    code=0
    if [ "$CRITTIMEOUT" -lt "$TIMEOUT" ]; then
        sleep "$CRITTIMEOUT"
        # ps prints the PID only while the process exists
        if [ -n "$(ps -opid= -p "$pid")" ]; then
            log 2 "Child process $pid is running after critical range $CRITTIMEOUT sec, sending SIGTERM"
            kill_func "-$pid"
        fi
        sleep $((TIMEOUT-CRITTIMEOUT))
    else
        sleep "$TIMEOUT"
    fi

    # TIMEOUT exceeded and process still running: kill it
    if [ -n "$(ps -opid= -p "$pid")" ]; then
        log 2 "Child process $pid is running after timeout $TIMEOUT sec," \
            "sending SIGKILL"
        kill_func -9 "-$pid"
    fi
}

########################################################################
#
# gLExec specific functions
#
########################################################################

# Converts a gLExec exit value into the corresponding nagios code:
#     glexecrc_to_nagios <rc>
# Sets the globals code (0 for rc 0, 2 for rc 201-204, 1 for everything
# else) and summary (a short text describing <rc>), and also returns code.
# Globals: rc, code, summary (written)
glexecrc_to_nagios() {
    rc=$1

    # Severity first ...
    case "$rc" in
        0)               code=0 ;; # Success
        201|202|203|204) code=2 ;; # Critical
        *)               code=1 ;; # Warning
    esac

    # ... then the matching summary text
    case "$rc" in
        0)   summary='success' ;;
        126) summary="executable can't be executed ($rc)" ;;
        201) summary="client error ($rc)" ;;
        202) summary="system error ($rc)" ;;
        203) summary="authorization error ($rc)" ;;
        204) summary="exit code overlap ($rc)" ;;
        *)   summary="executable failed with exit code $rc" ;;
    esac

    return $code
}

# Searches for gLExec and stores its path in the global GLEXEC_EXE.
# Search order: $GLEXEC_LOCATION/sbin/glexec (only when GLEXEC_LOCATION is
# set), then every directory in PATH, then $GLITE_LOCATION/sbin (with
# GLITE_LOCATION defaulting to /opt/glite) and finally a fixed list of
# system directories. The first existing regular file wins; when nothing
# is found GLEXEC_EXE is left untouched.
# Globals: GLEXEC_LOCATION, PATH (read);
#          GLITE_LOCATION, GLEXEC_EXE, glexloc, dir (written)
find_glexec() {
    # An explicit GLEXEC_LOCATION takes precedence
    if [ -n "$GLEXEC_LOCATION" ]; then
        log 3 "GLEXEC_LOCATION=$GLEXEC_LOCATION"
        glexloc="${GLEXEC_LOCATION}/sbin/glexec"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return
        fi
        log 2 "gLExec not found at \$GLEXEC_LOCATION"
    fi

    # Fall back to the default GLITE_LOCATION when none is given
    if [ -n "$GLITE_LOCATION" ]; then
        log 3 "GLITE_LOCATION=$GLITE_LOCATION"
    else
        GLITE_LOCATION=/opt/glite
    fi

    # Check in PATH, GLITE_LOCATION and extra dirs. The list is built by
    # word splitting, so directory names containing whitespace are not
    # supported here.
    for dir in $(echo $PATH | tr : ' ') \
               ${GLITE_LOCATION}/sbin \
               /usr/local/sbin /usr/sbin /sbin /usr/local/bin /usr/bin; do
        glexloc="${dir}/glexec"
        log 3 "Looking for gLExec at $glexloc"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return
        fi
    done
}

# Full gLExec run: locates glexec, validates the proxy files, runs
# "$GLEXEC_EXE $GLEXEC_CMD_EXE" and reports the result via nagios_status
# (which exits). Checks, in order:
#   - glexec not found:                                  code 2
#   - X509_USER_PROXY empty:                             code 3
#   - X509_USER_PROXY not a non-empty regular file:      code 3
#   - GLEXEC_CLIENT_CERT not a non-empty regular file:   code 3
# Otherwise the glexec exit value is mapped by glexecrc_to_nagios; a
# successful run that took more than WARNTIMEOUT seconds is downgraded to
# code 1. The summary is followed by perfdata with the runtime.
# Output of glexec is only shown when VERBOSE is greater than 1.
# Globals: VERBOSE, WARNTIMEOUT, CRITTIMEOUT, GLEXEC_CMD_EXE (read);
#          t1, dt, perfdata, code, summary (written);
#          GLEXEC_CLIENT_CERT, X509_USER_PROXY (exported)
run_glexec() {
    # Store start time
    t1=$(date +%s)

    # Find glexec executable
    find_glexec

    # Test executable
    if [ -z "$GLEXEC_EXE" ]; then
        nagios_status 2 'glexec command not found.'
    fi
    # Test proxy variable
    if [ -z "$X509_USER_PROXY" ]; then
        nagios_status 3 "\$X509_USER_PROXY is unset."
    fi
    # Test proxy file
    if [ ! -f "$X509_USER_PROXY" ] || [ ! -s "$X509_USER_PROXY" ]; then
        nagios_status 3 "\$X509_USER_PROXY does not point to a nonempty file."
    fi
    # Test client cert file; both tests must look at GLEXEC_CLIENT_CERT
    if [ ! -f "$GLEXEC_CLIENT_CERT" ] || [ ! -s "$GLEXEC_CLIENT_CERT" ]; then
        nagios_status 3 "\$GLEXEC_CLIENT_CERT does not point to a nonempty file."
    fi

    log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
    export GLEXEC_CLIENT_CERT X509_USER_PROXY
    # eval is deliberate: GLEXEC_CMD_EXE is a complete command line (command
    # plus arguments) supplied via -e/--execute or the built-in default.
    if [ "$VERBOSE" -gt 1 ]; then
        eval "$GLEXEC_EXE $GLEXEC_CMD_EXE" 2>&1
    else
        eval "$GLEXEC_EXE $GLEXEC_CMD_EXE" > /dev/null 2>&1
    fi
    # Sets code and summary
    glexecrc_to_nagios $?

    # Runtime in whole seconds
    dt=$(( $(date +%s) - t1 ))

    if [ "$code" -eq 0 ] && [ "$dt" -gt "$WARNTIMEOUT" ]; then
        code=1 # Warning
        summary="gLExec took long time to succeed"
    fi

    perfdata="time=${dt}s;$WARNTIMEOUT;$CRITTIMEOUT;0"
    nagios_status "$code" "$summary|$perfdata"
}

########################################################################
#
# main
#
########################################################################

# Main: everything printed inside the braces is piped through
# nagios_output, which reorders the output and determines the exit code of
# the pipeline from the status line.
{
    # Turn on jobcontrol (separate process groups for subshells), such that we
    # can kill the process group for the background processes.
    set -m

    # Parse cmdline arguments (long ones are converted in corresponding short
    # ones)
    parse_args "$@"

    # Start run_glexec in background to have control over timeout
    run_glexec &
    probe_pid=$!

    # Start watch process in background: will kill probe after timeout
    waiter "$probe_pid" &
    waiter_pid=$!

    # Wait for run_glexec: it will either end by itself or by the waiter()
    log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
    wait_func "$probe_pid"
    probe_rc=$?

    # Kill the waiter (its whole process group) if it is still there
    if [ -n "$(ps -opid= -p "$waiter_pid")" ]; then
        log 3 "Cleaning up waiter process $waiter_pid"
        kill_func "-$waiter_pid"
    fi
    # Call wait here to prevent logging of termination at end of script
    wait_func "$waiter_pid"

    # If the probe was killed, its exit value will be outside the valid
    # nagios range of 0-3; otherwise run_glexec has already printed the
    # status line and its exit value is the nagios code.
    case "$probe_rc" in
        0|1|2|3) # Normal ending of run_glexec, which has called nagios_status
            code=$probe_rc
            exit "$code"
            ;;
        137) # run_glexec ended via SIGKILL
            nagios_status 2 "probe TIMEOUT of $TIMEOUT seconds exceeded"
            ;;
        143) # run_glexec ended via SIGTERM
            nagios_status 2 "probe critical range of $CRITTIMEOUT seconds exceeded"
            ;;
        *) # run_glexec ended prematurely?!
            nagios_status 3 "background process died unexpectedly with rc=$probe_rc"
            ;;
    esac
} | nagios_output

exit $?