/[pdpsoft]/trunk/nagios/glexec/check_glexec.sh
ViewVC logotype

Contents of /trunk/nagios/glexec/check_glexec.sh

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2454 - (show annotations) (download) (as text)
Sun Dec 4 12:51:33 2011 UTC (10 years, 1 month ago) by msalle
File MIME type: application/x-shellscript
File size: 14197 byte(s)
New version of check_glexec, now in perl instead of sh, old version renamed
into check_glexec.sh.
- It can now properly work with a timeout and a critical time: a SIGTERM is sent
  after critical, while a SIGKILL is sent after timeout (it's the probe overall
  timeout).
- payload can be relative in which case path is used (default: id -a)
- also has sighandlers for SIGINT and SIGTERM which try to printout useful
  status output. Useful if the probe is killed by the batch system.
- nagios status is first line of output, log stack follows.

1 #!/bin/dash
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: WARNING
38 # !=0 rc of payload: WARNING
39 #
40 ########################################################################
41
# version
VERSION=0.1

# plugin name: last path component of the invoked script (no external
# basename call, safe for paths containing whitespace)
PROG=${0##*/}
# command will be set after arguments are parsed
DEF_GLEXEC_CMD="id"
DEF_GLEXEC_CMD_ARGS="-a"
# full default payload ("<path> <args>"), filled in by parse_args
DEF_GLEXEC_CMD_EXE=""
# glexec command itself
GLEXEC_EXE=""
# Default verbosity
VERBOSE=0
# Default timeout: overall probe timeout, SIGKILL is sent after this
TIMEOUT=10
# Default critical timeout: SIGTERM is sent when the probe runs longer
CRITTIMEOUT=8
# Default short timeout, longer than this results in warning
WARNTIMEOUT=5
# Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
GLEXEC_CLIENT_CERT=""
62
63 ########################################################################
64 #
65 # general options
66 #
67 ########################################################################
68
# Print the one-line usage summary on stdout and terminate with status 0.
shortusage() {
    printf '%s\n' "Usage: $PROG [options]"
    exit 0
}
74
# Print the full help text (all options with their current defaults) on
# stdout and terminate with status 0.
# Reads: PROG, TIMEOUT, WARNTIMEOUT, CRITTIMEOUT, DEF_GLEXEC_CMD_EXE
usage() {
    cat <<EOF
Usage: $PROG [options]

Options:
 -t|--timeout <timeout> maximum runtime for probe, default $TIMEOUT sec
 -w|--warning <timeout> runtime after which to warn, default $WARNTIMEOUT sec
 -c|--critical <timeout> runtime after which to probe is to be killed, default $CRITTIMEOUT sec
 -x|--x509-user-proxy <file> set X509_USER_PROXY to given file
 -g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file
 default: value of variable X509_USER_PROXY
 -e|--execute <cmd> command to execute by gLExec
 default: "$DEF_GLEXEC_CMD_EXE"
 -v|--verbose be more verbose, more -v means more verbosity
 -V|--version print version
 -h|--help show this helptext
EOF
    exit 0
}
93
# Log function: log <level> <message>...
# Prints every <message> argument on its own line on stdout, but only when
# the global VERBOSE is at least <level>.
# printf is used instead of echo so that messages such as "-n" or text
# containing backslashes are printed verbatim. Function-private variable
# names are used so that callers looping over a variable called "line" are
# not clobbered.
log() {
    log_level=$1
    shift
    if [ "${VERBOSE:-0}" -ge "$log_level" ]; then
        for log_line in "$@"; do
            printf '%s\n' "$log_line"
        done
    fi
    return 0
}
104
# Prints nagios status line: <stat>: <summary>
# Usage: nagios_status <code> <summary words...>
# Emits "NAGIOS_STATUS_LINE <code> <stat>: <summary>" through log (level 0)
# and then exits the current (sub)shell with <code>.
# An unknown <code> is reported as "INVALID NAGIOS CODE <code>".
nagios_status() {
    code=$1
    if [ $# -gt 0 ]; then
        shift
    fi
    summary="$*"
    case "$code" in
        0) stat='OK' ;;
        1) stat='WARNING' ;;
        2) stat='CRITICAL' ;;
        3) stat='UNKNOWN' ;;
        # double quotes and $code: $1 no longer holds the code after shift
        *) stat="INVALID NAGIOS CODE $code" ;;
    esac
    log 0 "NAGIOS_STATUS_LINE $code $stat: $summary"
    exit "$code"
}
120
# Reorders the probe output read from stdin: the nagios status line is
# printed first, followed by all other (log) lines in their original order.
# The status line is recognised by its "NAGIOS_STATUS_LINE <code> <text>"
# form; when several are present the last one wins. Exits with <code>
# (0 when no status line was seen, 3/UNKNOWN when <code> is not a number).
# Input is treated purely as data: it is never used as a printf format, so
# '%' and backslashes in log lines are reproduced verbatim.
nagios_output() {
    logstring=""
    nagiosline=""
    code=0
    # "|| [ -n ... ]" also handles a final line without trailing newline
    while IFS= read -r line || [ -n "$line" ]; do
        case "$line" in
            "NAGIOS_STATUS_LINE "*)
                line2=${line#NAGIOS_STATUS_LINE }
                code=${line2%% *}
                nagiosline=${line2#* }
                ;;
            *)
                if [ -z "$logstring" ]; then
                    logstring=$line
                else
                    logstring="${logstring}
${line}"
                fi
                ;;
        esac
    done
    # A malformed status line must not make exit itself fail
    case "$code" in
        ''|*[!0-9]*) code=3 ;;
    esac
    if [ -n "$nagiosline" ]; then
        printf '%s\n' "$nagiosline"
    fi
    if [ -n "$logstring" ]; then
        printf '%s\n' "$logstring"
    fi
    exit "$code"
}
147
# Report a command line parsing problem: every argument is logged at
# verbosity level 2 and the first non-empty argument becomes the summary of
# an UNKNOWN (3) nagios status, which is emitted via nagios_status.
parse_err() {
    summary=""
    code=3 # Unknown
    while [ $# -gt 0 ]; do
        line=$1
        shift
        if [ -z "$summary" ]; then
            summary="$line"
        fi
        log 2 "$line"
    done
    nagios_status "$code" "$summary"
}
158
# don't use builtin which since it might not exist
# Usage: which_cmd <name>
# Walks the directories of $PATH in order and prints the full path of the
# first regular, executable file called <name>. Returns 0 when found,
# 1 otherwise (nothing is printed then).
# PATH is split on ':' with parameter expansion only, so directories
# containing whitespace or glob characters are handled; empty PATH
# components are skipped. Debug logging goes to stderr because callers
# capture stdout as the result.
which_cmd() {
    [ -n "$1" ] || return 1
    wc_rest="${PATH}:"
    while [ -n "$wc_rest" ]; do
        wc_dir=${wc_rest%%:*}
        wc_rest=${wc_rest#*:}
        [ -n "$wc_dir" ] || continue
        log 3 "Looking for $1 in $wc_dir" >&2
        wc_cmd="${wc_dir}/$1"
        if [ -f "$wc_cmd" ] && [ -x "$wc_cmd" ]; then
            printf '%s\n' "$wc_cmd"
            return 0
        fi
    done
    return 1
}
170
# Assumes range as input, finds upperlimit, currently @ is not understood
# see http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Usage: parse_range <range>
# Prints the upper limit of <range> ("10" -> 10, "5:20" -> 20, "~:7" -> 7)
# and returns 0. The printed limit is empty for an open-ended range such as
# "10:". Returns 1 without output when more than one argument is given or
# when the range contains anything but digits, '~' and a single ':'.
# Pure shell pattern matching is used, so option-like input ("-n") cannot
# be swallowed by echo.
parse_range() {
    if [ $# -gt 1 ]; then
        return 1
    fi
    # invalid chars?
    case "$1" in
        *[!~:0-9]*) return 1 ;;
    esac
    # find upperlimit: everything after the first ':' or the whole value
    case "$1" in
        *:*) uplimit=${1#*:} ;;
        *) uplimit=$1 ;;
    esac
    # invalid chars? (also rejects a second ':' and a '~' upper limit)
    case "$uplimit" in
        *[!0-9]*) return 1 ;;
    esac
    # print limit, note: might be empty
    printf '%s\n' "$uplimit"
    return 0
}
187
# Parses command line options (short form only, see parse_args for the
# long option translation).
# Sets the globals GLEXEC_CLIENT_CERT, X509_USER_PROXY, TIMEOUT,
# CRITTIMEOUT, WARNTIMEOUT, GLEXEC_CMD_EXE and VERBOSE. Invalid input is
# reported through parse_err; -h and -V terminate via shortusage and
# nagios_status respectively.
# After option parsing, GLEXEC_CLIENT_CERT defaults to X509_USER_PROXY and
# GLEXEC_CMD_EXE defaults to DEF_GLEXEC_CMD_EXE.
parse_short_args() {
    # getopts keeps its position in the global OPTIND: reset it so that
    # every call starts parsing at the first argument
    OPTIND=1
    while getopts ":g:x:t:c:w:e:hvVH:p:u:" i ; do
        case "$i" in
            g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;
            x)  X509_USER_PROXY="$OPTARG" ;;
            t)  TIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$TIMEOUT" ]; then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "TIMEOUT set to $TIMEOUT"
                fi
                ;;
            c)  CRITTIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$CRITTIMEOUT" ]; then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "CRITTIMEOUT set to $CRITTIMEOUT"
                fi
                ;;
            w)  WARNTIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$WARNTIMEOUT" ]; then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "WARNTIMEOUT set to $WARNTIMEOUT"
                fi
                ;;
            e)  GLEXEC_CMD_EXE="$OPTARG" ;;
            h)  shortusage ;;
            V)  nagios_status 0 "$PROG version $VERSION" ;;
            v)  VERBOSE=$((VERBOSE + 1)) ;;
            # Standard nagios options that this probe accepts but ignores
            H)  log 2 "$PROG: option -H/--hostname is not used" ;;
            p)  log 2 "$PROG: option -p/--port is not used" ;;
            u)  log 2 "$PROG: option -u/--url is not used" ;;
            :)  parse_err "Option requires an argument -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
            \?) parse_err "Invalid option -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
        esac
    done
    # Check if we specified GLEXEC_CLIENT_CERT
    if [ -z "$GLEXEC_CLIENT_CERT" ]; then
        GLEXEC_CLIENT_CERT="$X509_USER_PROXY"
        log 2 "Using same proxy for GLEXEC_CLIENT_CERT and X509_USER_PROXY"
    fi
    # Check if we specified a command
    if [ -z "$GLEXEC_CMD_EXE" ]; then
        GLEXEC_CMD_EXE="$DEF_GLEXEC_CMD_EXE"
        log 2 "Using default payload command \"$GLEXEC_CMD_EXE\""
    fi
}
241
# Converts long options into short options and hands the result to
# parse_short_args.
# Also determines the default payload: when DEF_GLEXEC_CMD is found by
# which_cmd, DEF_GLEXEC_CMD_EXE is set to "<path> DEF_GLEXEC_CMD_ARGS".
# --help prints the long usage; an unknown long option is reported through
# parse_err.
# The translated argument list is rebuilt in the positional parameters
# instead of a string passed through eval, so option values keep their
# whitespace and quotes and are never evaluated by the shell.
parse_args() {
    # Find default executable
    cmd=$(which_cmd "$DEF_GLEXEC_CMD")
    if [ -n "$cmd" ]; then
        DEF_GLEXEC_CMD_EXE="$cmd $DEF_GLEXEC_CMD_ARGS"
    fi
    # Translated arguments are appended behind the original ones, which are
    # dropped again after the loop. The loop's word list is expanded once,
    # before the first iteration, so appending is safe.
    pa_count=$#
    for arg in "$@" ; do
        case "$arg" in
            --*)
                subarg=${arg#--}
                case "$subarg" in
                    x509-user-proxy)    set -- "$@" -x ;;
                    glexec-client-cert) set -- "$@" -g ;;
                    execute)            set -- "$@" -e ;;
                    timeout)            set -- "$@" -t ;;
                    critical)           set -- "$@" -c ;;
                    warning)            set -- "$@" -w ;;
                    verbose)            set -- "$@" -v ;;
                    version)            set -- "$@" -V ;;
                    help)               usage ;;
                    # Unused long options:
                    hostname)           set -- "$@" -H ;;
                    port)               set -- "$@" -p ;;
                    url)                set -- "$@" -u ;;
                    *)
                        parse_err "$PROG: invalid longoption -- '$subarg'" \
                            "Try \`$PROG -h' for more information."
                        ;;
                esac
                ;;
            *)
                set -- "$@" "$arg"
                ;;
        esac
    done
    shift "$pa_count"
    # Now parse the resulting short options
    parse_short_args "$@"
}
279
# wait wrapper: wait_func <pid>
# Waits for the given background process and returns its exit status.
# Messages that wait writes to stderr are merged into stdout when VERBOSE
# is at least 1 and discarded otherwise. Returns 1 (after a level 2 log
# message) unless exactly one argument is given.
wait_func() {
    [ $# -eq 1 ] || {
        log 2 "wait_func needs exactly one argument"
        return 1
    }
    # The status of the if statement is the status of the wait that ran
    if [ $VERBOSE -ge 1 ]; then
        wait $1 2>&1
    else
        wait $1 2> /dev/null
    fi
}
293
# kill wrapper: kill_func [<signal>] <pid>
# Sends <signal> (default -TERM) to <pid>; a <pid> of the form -N addresses
# process group N. Returns the exit status of kill, or 1 (after a level 2
# log message) when called without arguments.
# kill's error output is shown only when VERBOSE is greater than 1.
# Function-private variable names are used on purpose: callers keep the
# process id they work with in a global called "pid", which must not be
# overwritten here.
kill_func() {
    if [ $# -eq 0 ]; then
        log 2 "kill_func needs at least one argument"
        return 1
    fi
    if [ $# -eq 1 ]; then
        kf_signo="-TERM"
        kf_target=$1
    else
        kf_signo=$1
        kf_target=$2
    fi
    # Process listing is debugging aid only: it used to run unconditionally
    # and before the target was known, flooding the probe output
    if [ "${VERBOSE:-0}" -ge 3 ]; then
        ps -fjA | grep -- "${kf_target#-}"
    fi
    log 3 "About to send $kf_signo to $kf_target"
    # Prefer the external kill; fall back to the shell's own kill when
    # /bin/kill is not available
    if [ -x /bin/kill ]; then
        kf_kill=/bin/kill
    else
        kf_kill=kill
    fi
    if [ "${VERBOSE:-0}" -gt 1 ]; then
        "$kf_kill" "$kf_signo" "$kf_target" 2>&1
    else
        "$kf_kill" "$kf_signo" "$kf_target" 2> /dev/null
    fi
}
316
# wait for background process to finish or timeout: waiter <pid>
# Watchdog meant to run in the background next to the probe process <pid>:
# - when CRITTIMEOUT is smaller than TIMEOUT: after CRITTIMEOUT seconds a
#   still running probe gets SIGTERM (sent to its process group, -<pid>);
# - after TIMEOUT seconds in total a still running probe gets SIGKILL.
# The process id is kept in a variable private to this function, because
# kill_func assigns its own target to the global "pid"; reusing that name
# here made the SIGKILL step look at "-<pid>" instead of <pid>.
waiter() {
    w_pid=$1
    if [ "$CRITTIMEOUT" -lt "$TIMEOUT" ]; then
        sleep "$CRITTIMEOUT"
        if [ -n "$(ps -opid= -p "$w_pid")" ]; then
            log 2 "Child process $w_pid is running after critical range $CRITTIMEOUT sec, sending SIGTERM"
            kill_func "-$w_pid"
        fi
        sleep "$((TIMEOUT - CRITTIMEOUT))"
    else
        sleep "$TIMEOUT"
    fi

    # If process still running: kill it
    if [ -n "$(ps -opid= -p "$w_pid")" ]; then
        # TIMEOUT exceeded: kill it
        log 2 "Child process $w_pid is running after timeout $TIMEOUT sec," \
            "sending SIGKILL"
        kill_func -9 "-$w_pid"
    fi
}
340
341 ########################################################################
342 #
343 # gLExec specific functions
344 #
345 ########################################################################
346
# Converts gLExec exit values to corresponding nagios codes
# Usage: glexecrc_to_nagios <rc>
# Sets the globals rc, code (nagios code) and summary (status text) and
# returns code: 0 for success, 2 (critical) for the gLExec error codes
# 201-204, 1 (warning) for 126 and every other value.
glexecrc_to_nagios() {
    rc=$1

    # Status text
    case "$rc" in
        0)   summary='success' ;;
        126) summary="executable can't be executed ($rc)" ;;
        201) summary="client error ($rc)" ;;
        202) summary="system error ($rc)" ;;
        203) summary="authorization error ($rc)" ;;
        204) summary="exit code overlap ($rc)" ;;
        *)   summary="executable failed with exit code $rc" ;;
    esac

    # Nagios severity
    case "$rc" in
        0)               code=0 ;; # Success
        201|202|203|204) code=2 ;; # Critical
        *)               code=1 ;; # Warning
    esac

    return $code
}
384
# Searches for gLExec and sets global GLEXEC_EXE variable
# Search order:
#   1. $GLEXEC_LOCATION/sbin/glexec (when GLEXEC_LOCATION is set)
#   2. every directory of $PATH
#   3. $GLITE_LOCATION/sbin (GLITE_LOCATION defaults to /opt/glite and is
#      set globally when it was empty), followed by a fixed list of system
#      directories
# Returns 0 when gLExec was found, 1 otherwise (GLEXEC_EXE is left alone
# then). The directory list is split on ':' with parameter expansion, so
# PATH entries containing whitespace are searched correctly; empty entries
# are skipped.
find_glexec() {
    # First look at GLEXEC_LOCATION
    if [ -n "$GLEXEC_LOCATION" ]; then
        log 3 "GLEXEC_LOCATION=$GLEXEC_LOCATION"
        glexloc="${GLEXEC_LOCATION}/sbin/glexec"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return 0
        else
            log 2 "gLExec not found at \$GLEXEC_LOCATION"
        fi
    fi

    # Set GLITE_LOCATION if unset
    if [ -z "$GLITE_LOCATION" ]; then
        GLITE_LOCATION=/opt/glite
    else
        log 3 "GLITE_LOCATION=$GLITE_LOCATION"
    fi

    # Check in PATH, GLITE_LOCATION and extra dirs
    fg_rest="${PATH}:${GLITE_LOCATION}/sbin"
    fg_rest="${fg_rest}:/usr/local/sbin:/usr/sbin:/sbin:/usr/local/bin:/usr/bin:"
    while [ -n "$fg_rest" ]; do
        fg_dir=${fg_rest%%:*}
        fg_rest=${fg_rest#*:}
        [ -n "$fg_dir" ] || continue
        glexloc="${fg_dir}/glexec"
        log 3 "Looking for gLExec at $glexloc"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return 0
        fi
    done
    return 1
}
421
# Full gLExec run including finding the command and printing nagios status
# when successful. In case of timeout, nagios status will come from the
# main part of the script.
# Steps:
#   1. locate gLExec (find_glexec), CRITICAL when it is missing
#   2. verify that X509_USER_PROXY and GLEXEC_CLIENT_CERT each name a
#      non-empty regular file, UNKNOWN otherwise
#   3. run "<glexec> <payload>", showing its output only when VERBOSE > 1
#   4. translate the exit status with glexecrc_to_nagios; a successful run
#      that took longer than WARNTIMEOUT seconds is downgraded to WARNING
#   5. emit the status including perfdata via nagios_status (which exits)
run_glexec() {
    # Store start time
    t1=$(date +%s)

    # Find glexec executable
    find_glexec

    # Test executable
    if [ -z "$GLEXEC_EXE" ]; then
        nagios_status 2 'glexec command not found.'
    fi
    # Test proxy variable
    if [ -z "$X509_USER_PROXY" ]; then
        nagios_status 3 '$X509_USER_PROXY is unset.'
    fi
    # Test proxy file
    if [ ! -f "$X509_USER_PROXY" ] || [ ! -s "$X509_USER_PROXY" ]; then
        nagios_status 3 '$X509_USER_PROXY does not point to a nonempty file.'
    fi
    # Test client certificate file (both tests on GLEXEC_CLIENT_CERT)
    if [ ! -f "$GLEXEC_CLIENT_CERT" ] || [ ! -s "$GLEXEC_CLIENT_CERT" ]; then
        nagios_status 3 '$GLEXEC_CLIENT_CERT does not point to a nonempty file.'
    fi

    log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
    export GLEXEC_CLIENT_CERT X509_USER_PROXY
    # The payload is a command line string (command plus arguments, given
    # by the probe's operator), hence eval; the gLExec path itself is
    # expanded quoted so that it survives whitespace.
    if [ "$VERBOSE" -gt 1 ]; then
        eval '"$GLEXEC_EXE"' "$GLEXEC_CMD_EXE" 2>&1
        rg_rc=$?
    else
        eval '"$GLEXEC_EXE"' "$GLEXEC_CMD_EXE" > /dev/null 2>&1
        rg_rc=$?
    fi
    # Sets the globals code and summary
    glexecrc_to_nagios "$rg_rc"

    # Elapsed time in seconds
    dt=$(( $(date +%s) - t1 ))

    if [ "$code" -eq 0 ] && [ "$dt" -gt "$WARNTIMEOUT" ]; then
        code=1 # Warning
        summary="gLExec took long time to succeed"
    fi

    perfdata="time=${dt}s;$WARNTIMEOUT;$CRITTIMEOUT;0"
    nagios_status "$code" "$summary|$perfdata"
}
476
477 ########################################################################
478 #
479 # main
480 #
481 ########################################################################
482
# Everything the probe prints goes through nagios_output, which moves the
# nagios status line to the front and determines the final exit status.
{
    # Turn on jobcontrol (separate process groups for subshells), such that we
    # can kill the process group for the background processes.
    set -m

    # Parse cmdline arguments (long ones are converted in corresponding short
    # ones)
    parse_args "$@"

    # Start run_glexec in background to have control over timeout
    run_glexec &
    probe_pid=$!

    # Start watch process in background: will kill probe after timeout
    waiter "$probe_pid" &
    waiter_pid=$!

    # Wait for run_glexec: it will either end by itself or by the waiter()
    log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
    wait_func "$probe_pid"
    probe_rc=$?

    # Kill the waiter if it is still there (-<pid> addresses its whole
    # process group, i.e. including the sleep it is blocked in)
    if [ -n "$(ps -opid= -p "$waiter_pid")" ]; then
        log 3 "Cleaning up waiter process $waiter_pid"
        kill_func "-$waiter_pid"
    fi
    # Call wait here to prevent logging of termination at end of script
    wait_func "$waiter_pid"

    # If the probe was killed, its exit value will be outside the valid
    # nagios range of 0-3 (128 + signal number); otherwise run_glexec has
    # already printed the status line and exited with the nagios code.
    case "$probe_rc" in
        0|1|2|3)    # Normal ending of run_glexec, which has called nagios_status
            code=$probe_rc; exit "$code"
            ;;
        137)        # run_glexec ended via SIGKILL
            nagios_status 2 "probe TIMEOUT of $TIMEOUT seconds exceeded"
            ;;
        143)        # run_glexec ended via SIGTERM
            nagios_status 2 "probe critical range of $CRITTIMEOUT seconds exceeded"
            ;;
        *)          # run_glexec ended prematurely?!
            nagios_status 3 "background process died unexpectedly with rc=$probe_rc"
            ;;
    esac
} | nagios_output

exit $?

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28