#!/bin/dash
#
# Copyright (C) Nikhef 2011
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author:
#     Mischa Sall\'e <msalle@nikhef.nl>
#     NIKHEF Amsterdam, the Netherlands
#
########################################################################
#
# Nagios probe to test functioning of gLExec
#
# Nagios state can be one of the following:
# - Missing glexec command:     CRITICAL
# - input proxies empty:        UNKNOWN
# - timeout exceeded:           UNKNOWN
# - gLExec exit codes:
#     0     glexec succeeded:   OK
#     201   Client error:       CRITICAL
#     202   Internal error:     CRITICAL
#     203   Auth error:         CRITICAL
#     204   Overlap:            CRITICAL
#     126   execve failed:      WARNING
#     128+n signal:             WARNING
#     !=0   rc of payload:      WARNING
#
########################################################################

# Global configuration. Everything below may be overridden by the command
# line parser (parse_args / parse_short_args).

# version
VERSION=0.1

# plugin name and version
PROG=$(basename "$0")
# Default payload: command name and arguments. The full command line is
# stored in DEF_GLEXEC_CMD_EXE once parse_args has resolved the command.
DEF_GLEXEC_CMD="id"
DEF_GLEXEC_CMD_ARGS="-a"
# Explicitly start empty so that values inherited from the environment can
# never be picked up as the payload to run.
DEF_GLEXEC_CMD_EXE=""
GLEXEC_CMD_EXE=""
# glexec command itself
GLEXEC_EXE=""
# Default verbosity
VERBOSE=0
# Default timeout
TIMEOUT=10
# Default short timeout, longer than this results in warning
SHORTTIMEOUT=5
# Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
GLEXEC_CLIENT_CERT=""

########################################################################
#
# general options
#
########################################################################

# Short usage text: prints a one-line synopsis on stdout and terminates the
# script with exit status 0.
# Globals: PROG (read)
shortusage() {
    echo "Usage: $PROG [options]"
    exit 0
}

# Long usage text: prints the complete option overview on stdout and
# terminates the script with exit status 0.
# Globals: PROG, TIMEOUT, SHORTTIMEOUT, DEF_GLEXEC_CMD_EXE (all read; used to
#          show the program name and the current defaults)
usage() {
    # Unquoted heredoc delimiter: the variables below are expanded
    cat <<EOF
Usage: $PROG [options]

Options:
 -t|--timeout <timeout>          maximum runtime for probe, default $TIMEOUT sec
 -c|--critical <range>           Nagios-style alias for --timeout
 -s|--shorttimeout <timeout>     runtime after which to warn, default $SHORTTIMEOUT sec
 -w|--warning <range>            Nagios-style alias for --shorttimeout
 -x|--x509-user-proxy <file>     set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file>  set GLEXEC_CLIENT_CERT to given file
                                 default: value of variable X509_USER_PROXY
 -e|--execute <cmd>              command to execute by gLExec
                                 default: "$DEF_GLEXEC_CMD_EXE"
 -H|--hostname <arg>             accepted but not used
 -p|--port <arg>                 accepted but not used
 -u|--url <arg>                  accepted but not used
 -v|--verbose                    be more verbose, more -v means more verbosity
 -V|--version                    print version
 -h|--help                       show this helptext
EOF
    exit 0
}

# Log function: log <level> <message> [<message> ...]
# Prints every message on its own line on stdout, but only when the global
# VERBOSE is at least <level>.
# Globals: VERBOSE (read); level, line (written, scratch)
log() {
    level=$1
    shift
    # Operands are quoted so an empty value gives a clean test error instead
    # of a syntax error in the surrounding script
    if [ "$VERBOSE" -ge "$level" ]; then
        for line in "$@"; do
            # printf instead of echo: dash's echo would interpret backslash
            # escapes contained in the message
            printf '%s\n' "$line"
        done
    fi
}

# Prints nagios status line "<stat>: <summary>" and terminates the script.
# Usage: nagios_printout <code> <summary words...>
#   <code>  Nagios state: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
# The script exits with <code>. Any other value is reported as
# "INVALID NAGIOS CODE <code>" and the script exits with 3 (UNKNOWN), so that
# the exit status always stays inside the valid Nagios range.
# Globals: code, summary, stat, exitcode (written)
nagios_printout() {
    code=$1
    shift
    summary="$*"
    exitcode=$code
    case "$code" in
        0)  stat='OK' ;;
        1)  stat='WARNING' ;;
        2)  stat='CRITICAL' ;;
        3)  stat='UNKNOWN' ;;
        *)  # Double quotes and $code: $1 has been shifted away above and
            # single quotes would print the variable name literally
            stat="INVALID NAGIOS CODE $code"
            exitcode=3
            ;;
    esac
    # Level 0 is always printed
    log 0 "$stat: $summary"
    exit "$exitcode"
}

# Parsing error function: parse_err <message> [<message> ...]
# Logs every message at verbosity level 2, then terminates the script via
# nagios_printout with state UNKNOWN (3), using the first non-empty message
# as the summary.
# Globals: code, summary, line (written)
parse_err() {
    code=3 # Unknown
    summary=""
    for line in "$@"; do
        if [ -z "$summary" ]; then
            summary="$line"
        fi
        log 2 "$line"
    done
    nagios_printout $code "$summary"
}

# don't use builtin which since it might not exist
# Usage: which_cmd <command>
# Searches the directories of PATH, in order, for an executable regular file
# named <command> and prints its full path on stdout.
# Returns 0 when found, 1 (and prints nothing) otherwise.
# Globals: PATH (read); dir, cmd, searchpath (written, scratch)
which_cmd() {
    # Split PATH on ':' using parameter expansion, so directories containing
    # whitespace or glob characters are handled correctly. The trailing ':'
    # terminates the last element.
    searchpath="$PATH:"
    while [ -n "$searchpath" ]; do
        dir=${searchpath%%:*}
        searchpath=${searchpath#*:}
        # Skip empty PATH elements
        [ -n "$dir" ] || continue
        cmd="${dir}/$1"
        # stdout is the return value of this function: log to stderr
        log 3 "Looking for $1 in $dir" >&2
        if [ -f "$cmd" ] && [ -x "$cmd" ]; then
            printf '%s\n' "$cmd"
            return 0
        fi
    done
    return 1
}

# Assumes range as input, finds upperlimit, currently @ is not understood
# see http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Usage: parse_range <range>
# Prints the upper limit of <range> on stdout: the part after the ':' or the
# whole argument when there is no ':'. The printed value may be empty (e.g.
# for "10:"), which callers treat as invalid.
# Returns 1 without output when more than one argument is given or when the
# range contains characters other than digits, '~' and ':', or when the upper
# limit is not purely numeric.
# Globals: uplimit (written)
parse_range() {
    [ $# -le 1 ] || return 1
    # invalid chars? (pattern matching on the quoted value: no glob
    # expansion of the argument and no external processes)
    case "$1" in
        *[!~:0-9]*) return 1 ;;
    esac
    # find upperlimit: everything after the first ':'
    case "$1" in
        *:*) uplimit=${1#*:} ;;
        *)   uplimit=$1 ;;
    esac
    # invalid chars? (also rejects a second ':' and a bare '~')
    case "$uplimit" in
        *[!0-9]*) return 1 ;;
    esac
    # print limit, note: might be empty
    printf '%s\n' "$uplimit"
    return 0
}

# Parses command line options
# Usage: parse_short_args <short options and their arguments...>
# Stores the results in the globals GLEXEC_CLIENT_CERT, X509_USER_PROXY,
# TIMEOUT, SHORTTIMEOUT, GLEXEC_CMD_EXE and VERBOSE. Invalid input ends the
# script through parse_err; -h and -V end it through shortusage and
# nagios_printout respectively.
# After parsing, GLEXEC_CLIENT_CERT defaults to X509_USER_PROXY and
# GLEXEC_CMD_EXE defaults to DEF_GLEXEC_CMD_EXE when they were not given.
parse_short_args() {
    # Make sure option parsing starts at the first argument
    OPTIND=1
    # Leading ':' selects silent error reporting: missing arguments and
    # unknown options are handled in the ':' and '?' branches below
    while getopts ":g:x:t:c:s:w:e:hvVH:p:u:" i; do
        case "$i" in
            g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;
            x)  X509_USER_PROXY="$OPTARG" ;;
            c|t)
                # -c (critical) is the Nagios-style alias for -t
                TIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$TIMEOUT" ]; then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "TIMEOUT set to $TIMEOUT"
                fi
                ;;
            s|w)
                # -s is the documented option, -w (warning) the Nagios-style
                # alias
                SHORTTIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$SHORTTIMEOUT" ]; then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "SHORTTIMEOUT set to $SHORTTIMEOUT"
                fi
                ;;
            e)  GLEXEC_CMD_EXE="$OPTARG" ;;
            h)  shortusage ;;
            V)  nagios_printout 0 "$PROG version $VERSION" ;;
            v)  VERBOSE=$((VERBOSE + 1)) ;;
            H)  log 2 "$PROG: option -H/--hostname is not used" ;;
            p)  log 2 "$PROG: option -p/--port is not used" ;;
            u)  log 2 "$PROG: option -u/--url is not used" ;;
            :)  parse_err "Option requires an argument -- '$OPTARG'" \
                          "Try \`$PROG -h' for more information."
                ;;
            \?) parse_err "Invalid option -- '$OPTARG'" \
                          "Try \`$PROG -h' for more information."
                ;;
        esac
    done
    # Check if we specified GLEXEC_CLIENT_CERT
    if [ -z "$GLEXEC_CLIENT_CERT" ]; then
        GLEXEC_CLIENT_CERT="$X509_USER_PROXY"
        log 2 "Using same proxy for GLEXEC_CLIENT_CERT and X509_USER_PROXY"
    fi
    # Check if we specified a command
    if [ -z "$GLEXEC_CMD_EXE" ]; then
        GLEXEC_CMD_EXE="$DEF_GLEXEC_CMD_EXE"
        log 2 "Using default payload command \"$GLEXEC_CMD_EXE\""
    fi
}

# Converts long options into short options
# Usage: parse_args "$@"
# First resolves the default payload command (DEF_GLEXEC_CMD) to a full path
# and stores it, together with DEF_GLEXEC_CMD_ARGS, in DEF_GLEXEC_CMD_EXE.
# Then every argument starting with "--" is translated into its short
# option and all other arguments are passed through unchanged; the result is
# handed to parse_short_args. --help ends the script through usage, an
# unknown long option ends it through parse_err.
# Globals: DEF_GLEXEC_CMD_EXE, cmd, arg, subarg, nargs (written)
parse_args() {
    # Find default executable
    cmd=$(which_cmd "$DEF_GLEXEC_CMD")
    if [ -n "$cmd" ]; then
        DEF_GLEXEC_CMD_EXE="$cmd $DEF_GLEXEC_CMD_ARGS"
    fi

    # Build the translated argument list behind the original positional
    # parameters and drop the originals afterwards. This keeps every
    # argument intact (whitespace, quotes, '$') and needs no eval, so
    # nothing inside an argument can ever be executed by the shell.
    # The word list of the for loop is expanded once, before the first
    # iteration, so appending inside the loop is safe.
    nargs=$#
    for arg in "$@"; do
        case "$arg" in
            --*)
                subarg=${arg#--}
                case "$subarg" in
                    x509-user-proxy)    set -- "$@" -x ;;
                    glexec-client-cert) set -- "$@" -g ;;
                    execute)            set -- "$@" -e ;;
                    timeout)            set -- "$@" -t ;;
                    # -w sets the short timeout
                    shorttimeout)       set -- "$@" -w ;;
                    verbose)            set -- "$@" -v ;;
                    version)            set -- "$@" -V ;;
                    help)               usage ;;
                    # Unused long options:
                    hostname)           set -- "$@" -H ;;
                    port)               set -- "$@" -p ;;
                    url)                set -- "$@" -u ;;
                    # Nagios-style threshold options:
                    warning)            set -- "$@" -w ;;
                    critical)           set -- "$@" -c ;;
                    *)
                        parse_err "$PROG: invalid longoption -- '$subarg'" \
                                  "Try \`$PROG -h' for more information."
                        ;;
                esac
                ;;
            *)
                set -- "$@" "$arg"
                ;;
        esac
    done
    shift "$nargs"

    # Now parse the resulting short options
    parse_short_args "$@"
}

# wait for background process to finish or timeout
# Usage: waiter <pid>      (meant to be started in the background)
# Sleeps for TIMEOUT seconds. When process <pid> still exists afterwards it
# is killed with SIGKILL and this (sub)shell exits with 3, otherwise it
# exits with 0.
# Globals: TIMEOUT (read); pid (written)
waiter() {
    pid=$1
    sleep "$TIMEOUT"

    # If process still running: kill it
    # (kill -0 only tests for existence, no signal is delivered)
    if kill -0 "$pid" 2> /dev/null; then
        # TIMEOUT exceeded: kill it
        log 2 "Child process $pid is still running after timeout $TIMEOUT"
        kill -9 "$pid"
        exit 3
    fi
    exit 0
}

########################################################################
#
# gLExec specific functions
#
########################################################################

# Converts gLExec exit values to corresponding nagios codes
# Usage: glexecrc_to_nagios <glexec exit code>
# Stores the Nagios state in the global "code" and a descriptive text in the
# global "summary"; the Nagios state is also the return value:
#   0        -> 0 (OK)
#   201-204  -> 2 (CRITICAL), errors reported by gLExec itself
#   126      -> 1 (WARNING), payload could not be executed
#   other    -> 1 (WARNING), exit code of the payload
# Globals: rc, code, summary (written)
glexecrc_to_nagios() {
    rc=$1

    case "$rc" in
        0)
            code=0 # Success
            summary='success'
            ;;
        126)
            code=1 # Warning
            summary="executable can't be executed ($rc)"
            ;;
        201)
            code=2 # Critical
            summary="client error ($rc)"
            ;;
        202)
            code=2 # Critical
            summary="system error ($rc)"
            ;;
        203)
            code=2 # Critical
            summary="authorization error ($rc)"
            ;;
        204)
            code=2 # Critical
            summary="exit code overlap ($rc)"
            ;;
        *)
            code=1 # Warning
            summary="executable failed with exit code $rc"
            ;;
    esac

    return $code
}

# Searches for gLExec and sets global GLEXEC_EXE variable
# Search order:
#   1. $GLEXEC_LOCATION/sbin/glexec      (only when GLEXEC_LOCATION is set)
#   2. every directory in PATH
#   3. $GLITE_LOCATION/sbin              (GLITE_LOCATION defaults to
#                                         /opt/glite when unset or empty)
#   4. /usr/local/sbin /usr/sbin /sbin /usr/local/bin /usr/bin
# The first regular file named "glexec" wins. GLEXEC_EXE is left untouched
# when nothing is found.
# Returns 0 when gLExec was found, 1 otherwise.
# Globals: GLEXEC_LOCATION, PATH (read); GLITE_LOCATION, GLEXEC_EXE,
#          glexloc, dir, searchpath (written)
find_glexec() {
    # First look at GLEXEC_LOCATION
    if [ -n "$GLEXEC_LOCATION" ]; then
        log 3 "GLEXEC_LOCATION=$GLEXEC_LOCATION"
        glexloc="${GLEXEC_LOCATION}/sbin/glexec"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return 0
        fi
        log 2 "gLExec not found at \$GLEXEC_LOCATION"
    fi

    # Set GLITE_LOCATION if unset
    if [ -z "$GLITE_LOCATION" ]; then
        GLITE_LOCATION=/opt/glite
    else
        log 3 "GLITE_LOCATION=$GLITE_LOCATION"
    fi

    # Check in PATH, GLITE_LOCATION and extra dirs.
    # The list is kept as one ':'-separated string and split with parameter
    # expansion, so directories containing whitespace or glob characters
    # survive. The trailing ':' terminates the last element.
    searchpath="$PATH:${GLITE_LOCATION}/sbin"
    searchpath="$searchpath:/usr/local/sbin:/usr/sbin:/sbin"
    searchpath="$searchpath:/usr/local/bin:/usr/bin:"
    while [ -n "$searchpath" ]; do
        dir=${searchpath%%:*}
        searchpath=${searchpath#*:}
        # Skip empty elements
        [ -n "$dir" ] || continue
        glexloc="${dir}/glexec"
        log 3 "Looking for gLExec at $glexloc"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return 0
        fi
    done
    return 1
}

# Full run:
# - search for glexec command
# - run glexec
# - print nagios status
# Always ends the (sub)shell through nagios_printout:
#   CRITICAL  when no glexec executable is found
#   UNKNOWN   when X509_USER_PROXY is unset, or when X509_USER_PROXY or
#             GLEXEC_CLIENT_CERT do not point to a non-empty regular file
#   otherwise the state derived from the gLExec exit code by
#             glexecrc_to_nagios; a successful run that took longer than
#             SHORTTIMEOUT seconds is downgraded to WARNING
# The status line carries the elapsed time as performance data.
# Globals: GLEXEC_CMD_EXE, VERBOSE, SHORTTIMEOUT, TIMEOUT (read);
#          X509_USER_PROXY, GLEXEC_CLIENT_CERT (read and exported);
#          GLEXEC_EXE, code, summary, t1, dt, rc, perfdata (written)
run_glexec() {
    # Store start time
    t1=$(date +%s)

    # Find glexec executable
    find_glexec

    # Test executable
    if [ -z "$GLEXEC_EXE" ]; then
        code=2 # Critical
        summary='glexec command not found.'
        nagios_printout $code "$summary"
    fi
    # Test proxy variable
    if [ -z "$X509_USER_PROXY" ]; then
        code=3 # Unknown
        summary="\$X509_USER_PROXY is unset."
        nagios_printout $code "$summary"
    fi
    # Test proxy file
    if [ ! -f "$X509_USER_PROXY" ] || [ ! -s "$X509_USER_PROXY" ]; then
        code=3 # Unknown
        summary="\$X509_USER_PROXY does not point to a nonempty file."
        nagios_printout $code "$summary"
    fi
    # Test client certificate file: both tests must look at the client
    # certificate itself, not at the user proxy
    if [ ! -f "$GLEXEC_CLIENT_CERT" ] || [ ! -s "$GLEXEC_CLIENT_CERT" ]; then
        code=3 # Unknown
        summary="\$GLEXEC_CLIENT_CERT does not point to a nonempty file."
        nagios_printout $code "$summary"
    fi

    log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
    export GLEXEC_CLIENT_CERT X509_USER_PROXY
    # eval lets the payload command line carry its own quoting. The payload
    # comes from the -e option or the built-in default, i.e. from whoever
    # configures the probe. The glexec path itself is expanded inside double
    # quotes so a path with whitespace stays one word.
    if [ "$VERBOSE" -gt 1 ]; then
        eval "\"\$GLEXEC_EXE\" $GLEXEC_CMD_EXE" 2>&1
    else
        eval "\"\$GLEXEC_EXE\" $GLEXEC_CMD_EXE" > /dev/null 2>&1
    fi
    # $? of the if statement is the exit status of the eval'ed command
    rc=$?
    # Sets the globals code and summary
    glexecrc_to_nagios "$rc"

    # Store end time
    dt=$(( $(date +%s) - t1 ))

    if [ "$code" -eq 0 ] && [ "$dt" -gt "$SHORTTIMEOUT" ]; then
        code=1 # Warning
        summary="gLExec took long time to succeed"
    fi

    # Nagios performance data: label=value;warn;crit;min
    perfdata="time=${dt}s;$SHORTTIMEOUT;$TIMEOUT;0"
    nagios_printout $code "$summary|$perfdata"
}

########################################################################
#
# main program
#
########################################################################

# Main flow: parse the command line, run the probe in the background, guard
# it with a watchdog and exit with the Nagios state of the probe.

# Parse cmdline arguments (long ones are converted in corresponding short ones)
parse_args "$@"

# Start run_glexec in background to have control over timeout
run_glexec &
probe_pid=$!

# Start watch process in background: will kill probe after timeout
waiter "$probe_pid" &
waiter_pid=$!

# Wait for run_glexec: it will either end by itself or by the waiter()
log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
if [ "$VERBOSE" -ge 1 ]; then
    wait "$probe_pid"
else
    # Hide the message the shell may print when the probe was killed
    wait "$probe_pid" 2> /dev/null
fi
# $? of the if statement is the exit status of the wait that was run
probe_rc=$?

# Kill the waiter if it is still there
# (kill -0 only tests for existence, no signal is delivered)
if kill -0 "$waiter_pid" 2> /dev/null; then
    kill "$waiter_pid" 2> /dev/null
fi

# If probe was killed, its exit value will be outside valid nagios range of 0-3
if [ "$probe_rc" -gt 3 ]; then
    # NOTE(review): the header of this file lists "timeout exceeded" as
    # UNKNOWN while CRITICAL (2) is reported here - confirm which is intended
    if [ "$VERBOSE" -eq 0 ]; then
        nagios_printout 2 "probe TIMEOUT of $TIMEOUT seconds exceeded"
    else
        nagios_printout 2 "probe TIMEOUT of $TIMEOUT seconds exceeded (rc=$probe_rc)"
    fi
fi

# run_glexec has finished and printed its own status line: exit with its
# exit value
exit "$probe_rc"