/[pdpsoft]/trunk/nagios/glexec/check_glexec.sh
ViewVC logotype

Annotation of /trunk/nagios/glexec/check_glexec.sh

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2454 - (hide annotations) (download) (as text)
Sun Dec 4 12:51:33 2011 UTC (10 years, 5 months ago) by msalle
File MIME type: application/x-shellscript
File size: 14197 byte(s)
New version of check_glexec, now in perl instead of sh; the old version has been
renamed to check_glexec.sh.
- It can now properly work with a timeout and a critical time: a SIGTERM is sent
  after the critical time, while a SIGKILL is sent after the timeout (the
  probe's overall timeout).
- payload can be relative in which case path is used (default: id -a)
- also has sighandlers for SIGINT and SIGTERM which try to printout useful
  status output. Useful if the probe is killed by the batch system.
- nagios status is first line of output, log stack follows.

1 msalle 2454 #!/bin/dash
2 msalle 2451 #
3     # Copyright (C) Nikhef 2011
4     #
5     # Licensed under the Apache License, Version 2.0 (the "License");
6     # you may not use this file except in compliance with the License.
7     # You may obtain a copy of the License at
8     #
9     # http://www.apache.org/licenses/LICENSE-2.0
10     #
11     # Unless required by applicable law or agreed to in writing, software
12     # distributed under the License is distributed on an "AS IS" BASIS,
13     # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14     # See the License for the specific language governing permissions and
15     # limitations under the License.
16     #
17     # Author:
18     # Mischa Sall\'e <msalle@nikhef.nl>
19     # NIKHEF Amsterdam, the Netherlands
20     #
21     ########################################################################
22     #
23     # Nagios probe to test functioning of gLExec
24     #
25     # Nagios state can be one of the following:
26     # - Missing glexec command: CRITICAL
27     # - input proxies empty: UNKNOWN
28 msalle 2452 # - short timeout exceeded: WARNING
29     # - timeout exceeded: CRITICAL
30 msalle 2451 # - gLExec exit codes:
31     # 0 glexec succeeded: OK
32     # 201 Client error: CRITICAL
33     # 202 Internal error: CRITICAL
34     # 203 Auth error: CRITICAL
35     # 204 Overlap: CRITICAL
36     # 126 execve failed: WARNING
37     # 128+n signal: WARNING
38     # !=0 rc of payload: WARNING
39     #
40     ########################################################################
41    
# Probe version, reported by -V|--version
VERSION=0.1

# Plugin name: last path component of $0 (quoted-safe, no external basename)
PROG=${0##*/}
# Default payload command and its arguments; the full default command line
# (DEF_GLEXEC_CMD_EXE) is resolved in parse_args
DEF_GLEXEC_CMD="id"
DEF_GLEXEC_CMD_ARGS="-a"
# Path of the glexec command itself, filled in by find_glexec
GLEXEC_EXE=""
# Default verbosity
VERBOSE=0
# Default overall timeout in seconds: the probe is sent SIGKILL after this
TIMEOUT=10
# Default critical time in seconds: the probe is sent SIGTERM after this
CRITTIMEOUT=8
# Default warning time in seconds: a successful run taking longer than this
# results in a WARNING
WARNTIMEOUT=5
# Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
GLEXEC_CLIENT_CERT=""
62    
63     ########################################################################
64     #
65     # general options
66     #
67     ########################################################################
68    
# One-line usage summary; terminates the shell with status 0.
shortusage() {
    printf '%s\n' "Usage: $PROG [options]"
    exit 0
}
74    
# Long usage text: prints the full option overview (including the current
# default values of TIMEOUT, WARNTIMEOUT, CRITTIMEOUT and DEF_GLEXEC_CMD_EXE)
# on stdout and terminates the shell with status 0.
usage() {
    printf '%s\n' \
        "Usage: $PROG [options]" \
        "" \
        "Options:" \
        " -t|--timeout <timeout> maximum runtime for probe, default $TIMEOUT sec" \
        " -w|--warning <timeout> runtime after which to warn, default $WARNTIMEOUT sec" \
        " -c|--critical <timeout> runtime after which the probe is to be killed, default $CRITTIMEOUT sec" \
        " -x|--x509-user-proxy <file> set X509_USER_PROXY to given file" \
        " -g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file" \
        " default: value of variable X509_USER_PROXY" \
        " -e|--execute <cmd> command to execute by gLExec" \
        " default: \"$DEF_GLEXEC_CMD_EXE\"" \
        " -v|--verbose be more verbose, more -v means more verbosity" \
        " -V|--version print version" \
        " -h|--help show this helptext"
    exit 0
}
93    
# Log function: log <level> <message>...
# Prints every message argument on its own line on stdout when the global
# VERBOSE is at least <level>; prints nothing otherwise.
# printf is used instead of echo so that messages such as "-n" or ones
# containing backslashes are printed verbatim.
log() {
    level=$1
    shift
    if [ "$VERBOSE" -ge "$level" ]; then
        for line in "$@"; do
            printf '%s\n' "$line"
        done
    fi
}
104    
# Prints the nagios status line and exits: nagios_status <code> <summary>...
# The line is emitted through log at level 0 in the form
#   NAGIOS_STATUS_LINE <code> <stat>: <summary>
# where <stat> is the textual name of <code>. The shell then exits with
# <code>. Sets the globals code, summary and stat.
nagios_status() {
    code=$1
    shift
    summary="$*"
    case "$code" in
        0) stat='OK' ;;
        1) stat='WARNING' ;;
        2) stat='CRITICAL' ;;
        3) stat='UNKNOWN' ;;
        # Double quotes and $code: $1 has already been shifted away
        *) stat="INVALID NAGIOS CODE $code" ;;
    esac
    log 0 "NAGIOS_STATUS_LINE $code $stat: $summary"
    exit "$code"
}
120    
# Reorders the probe output read from stdin: the status line is printed
# first, followed by all other (log) lines in their original order.
# The status line is the last input line of the form
#   NAGIOS_STATUS_LINE <exitcode> <text>
# of which only <text> is printed; the shell then exits with <exitcode>
# (0 when no such line was seen).
# All text is passed to printf as data, never as the format string, so '%'
# and backslashes in log lines are printed verbatim.
nagios_output() {
    logstring=""
    nagiosline=""
    code=0
    newline='
'
    # "|| [ -n ... ]" also handles a final line without trailing newline
    while read -r line || [ -n "$line" ]; do
        case "$line" in
            "NAGIOS_STATUS_LINE "*)
                line2=${line#NAGIOS_STATUS_LINE }
                # first word is the exit code, the rest is the status text
                code=${line2%% *}
                nagiosline=${line2#* }
                ;;
            *)
                if [ -z "$logstring" ]; then
                    logstring=$line
                else
                    logstring="${logstring}${newline}${line}"
                fi
                ;;
        esac
    done
    if [ -z "$nagiosline" ]; then
        printf '%s' "$logstring"
    else
        printf '%s\n%s' "$nagiosline" "$logstring"
    fi
    exit "$code"
}
147    
# Argument-parsing error: parse_err <message>...
# Every message is logged at level 2; the first non-empty message becomes
# the summary handed to nagios_status together with code 3 (UNKNOWN).
parse_err() {
    summary=""
    for msg in "$@"; do
        if [ -z "$summary" ]; then
            summary=$msg
        fi
        log 2 "$msg"
    done
    code=3 # Unknown
    nagios_status $code "$summary"
}
158    
# Minimal replacement for which(1), which might not exist:
#   which_cmd <name>
# Prints the path of the first executable regular file called <name> found
# in the directories of $PATH; prints nothing when there is none. Always
# returns 0.
# PATH is split on ':' with parameter expansion only, so directories
# containing whitespace or glob characters are handled correctly. Empty PATH
# components are skipped.
# NOTE: log writes to stdout as well, so a caller capturing the output also
# captures the level-3 log lines when VERBOSE is 3 or more.
which_cmd() {
    remaining=$PATH
    while [ -n "$remaining" ]; do
        # split off the first component
        dir=${remaining%%:*}
        case "$remaining" in
            *:*) remaining=${remaining#*:} ;;
            *) remaining="" ;;
        esac
        [ -n "$dir" ] || continue
        cmd="${dir}/$1"
        log 3 "Looking for $1 in $dir"
        if [ -f "$cmd" ] && [ -x "$cmd" ]; then
            printf '%s\n' "$cmd"
            return 0
        fi
    done
    return 0
}
170    
# Extracts the upper limit from a nagios range: parse_range <range>
# Prints the upper limit, which is empty for an open range such as "10:".
# Returns 1 without printing anything when more than one argument is given
# or when the range contains unsupported characters. The '@' prefix is
# currently not understood.
# See http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Only shell pattern matching on the quoted argument is used, so the input
# is never subject to word splitting or filename expansion.
parse_range() {
    if [ $# -gt 1 ]; then
        return 1
    fi
    # invalid chars?
    case "$1" in
        *[!~:0-9]*) return 1 ;;
    esac
    # find upperlimit: everything after the first ':' if there is one
    case "$1" in
        *:*) uplimit=${1#*:} ;;
        *) uplimit=$1 ;;
    esac
    # invalid chars?
    case "$uplimit" in
        *[!0-9]*) return 1 ;;
    esac
    # print limit, note: might be empty
    printf '%s\n' "$uplimit"
    return 0
}
187    
# Handles the (short) command line options given as arguments and stores the
# results in the globals GLEXEC_CLIENT_CERT, X509_USER_PROXY, TIMEOUT,
# CRITTIMEOUT, WARNTIMEOUT, GLEXEC_CMD_EXE and VERBOSE. Invalid input is
# reported via parse_err. Afterwards the defaults for GLEXEC_CLIENT_CERT
# (the value of X509_USER_PROXY) and GLEXEC_CMD_EXE (DEF_GLEXEC_CMD_EXE) are
# filled in when they were not specified.
parse_short_args() {
    while getopts ":g:x:t:c:w:e:hvVH:p:u:" opt; do
        case "$opt" in
            g)
                GLEXEC_CLIENT_CERT="$OPTARG"
                ;;
            x)
                X509_USER_PROXY="$OPTARG"
                ;;
            t)
                TIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$TIMEOUT" ]; then
                    log 2 "TIMEOUT set to $TIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            c)
                CRITTIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$CRITTIMEOUT" ]; then
                    log 2 "CRITTIMEOUT set to $CRITTIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            w)
                WARNTIMEOUT=$(parse_range "$OPTARG")
                if [ -n "$WARNTIMEOUT" ]; then
                    log 2 "WARNTIMEOUT set to $WARNTIMEOUT"
                else
                    parse_err "Not a valid timeout: \"$OPTARG\""
                fi
                ;;
            e)
                GLEXEC_CMD_EXE="$OPTARG"
                ;;
            h)
                shortusage
                ;;
            V)
                nagios_status 0 "$PROG version $VERSION"
                ;;
            v)
                VERBOSE=$((VERBOSE + 1))
                ;;
            # Options accepted for compatibility but otherwise ignored
            H)
                log 2 "$PROG: option -H/--hostname is not used"
                ;;
            p)
                log 2 "$PROG: option -p/--port is not used"
                ;;
            u)
                log 2 "$PROG: option -u/--url is not used"
                ;;
            :)
                parse_err "Option requires an argument -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
            \?)
                parse_err "Invalid option -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
        esac
    done

    # No explicit client cert: fall back to the user proxy
    if [ -z "$GLEXEC_CLIENT_CERT" ]; then
        GLEXEC_CLIENT_CERT="$X509_USER_PROXY"
        log 2 "Using same proxy for GLEXEC_CLIENT_CERT and X509_USER_PROXY"
    fi
    # No explicit payload: fall back to the default command
    if [ -z "$GLEXEC_CMD_EXE" ]; then
        GLEXEC_CMD_EXE="$DEF_GLEXEC_CMD_EXE"
        log 2 "Using default payload command \"$GLEXEC_CMD_EXE\""
    fi
}
241    
# Converts long options into short options and hands the result to
# parse_short_args. Before that, the default payload command line
# DEF_GLEXEC_CMD_EXE is set when DEF_GLEXEC_CMD is found via which_cmd.
# --help prints the long usage immediately; unknown long options are
# reported via parse_err.
# The argument list is rebuilt in the positional parameters instead of being
# pasted into a string for eval, so arguments are passed on exactly as
# given (whitespace, quotes, '$' and glob characters included) and are
# never interpreted as shell code.
parse_args() {
    # Find default executable
    cmd=$(which_cmd "$DEF_GLEXEC_CMD")
    if [ -n "$cmd" ]; then
        DEF_GLEXEC_CMD_EXE="$cmd $DEF_GLEXEC_CMD_ARGS"
    fi
    # Append the converted arguments after the original ones, then drop the
    # originals. The word list of "for" is expanded once, before the first
    # iteration, so changing "$@" inside the loop is safe.
    nargs=$#
    for arg in "$@"; do
        case "$arg" in
            --x509-user-proxy) set -- "$@" -x ;;
            --glexec-client-cert) set -- "$@" -g ;;
            --execute) set -- "$@" -e ;;
            --timeout) set -- "$@" -t ;;
            --critical) set -- "$@" -c ;;
            --warning) set -- "$@" -w ;;
            --verbose) set -- "$@" -v ;;
            --version) set -- "$@" -V ;;
            --help) usage ;;
            # Unused long options:
            --hostname) set -- "$@" -H ;;
            --port) set -- "$@" -p ;;
            --url) set -- "$@" -u ;;
            --*)
                parse_err "$PROG: invalid longoption -- '${arg#--}'" \
                    "Try \`$PROG -h' for more information."
                ;;
            *) set -- "$@" "$arg" ;;
        esac
    done
    shift "$nargs"
    # Now parse the resulting short options
    parse_short_args "$@"
}
279    
# Wrapper around wait: wait_func <pid>
# Waits for <pid> and returns the status of wait. Messages that wait writes
# to stderr are merged into stdout when VERBOSE is at least 1 and discarded
# otherwise. Returns 1 when not called with exactly one argument.
wait_func() {
    if [ $# -ne 1 ]; then
        log 2 "wait_func needs exactly one argument"
        return 1
    fi
    if [ $VERBOSE -ge 1 ]; then
        wait $1 2>&1
        wait_rc=$?
    else
        wait $1 2> /dev/null
        wait_rc=$?
    fi
    return $wait_rc
}
293    
294     # kill wrapper
295     kill_func() {
296     if [ $# -eq 0 ];then
297     log 2 "kill_func needs at least one argument"
298     return 1
299     fi
300     ps -fjA|grep "${pid##-}"
301     if [ $# -eq 1 ];then
302     signo="-TERM"
303     pid=$1
304     else
305     signo=$1
306     pid=$2
307     fi
308     log 3 "About to send $signo to $pid"
309     if [ $VERBOSE -gt 1 ];then
310     /bin/kill $signo $pid 2>&1
311     else
312     /bin/kill $signo $pid 2> /dev/null
313     fi
314     return $?
315     }
316    
# Watchdog for a background process: waiter <pid>
# When CRITTIMEOUT is smaller than TIMEOUT: sleeps CRITTIMEOUT seconds and,
# if <pid> is still running, sends SIGTERM to -<pid> via kill_func, then
# sleeps the remainder up to TIMEOUT. Otherwise it just sleeps TIMEOUT
# seconds. In both cases, if <pid> is still running afterwards, SIGKILL is
# sent to -<pid>.
# The pid is kept in a variable of its own: kill_func may assign the global
# "pid", which would otherwise turn the later checks into checks on a
# negative id.
waiter() {
    waiter_target=$1
    if [ "$CRITTIMEOUT" -lt "$TIMEOUT" ]; then
        sleep "$CRITTIMEOUT"
        if [ -n "$(ps -opid= -p "$waiter_target")" ]; then
            log 2 "Child process $waiter_target is running after critical range $CRITTIMEOUT sec, sending SIGTERM"
            kill_func "-$waiter_target"
        fi
        sleep $((TIMEOUT - CRITTIMEOUT))
    else
        sleep "$TIMEOUT"
    fi

    # If process still running: kill it
    if [ -n "$(ps -opid= -p "$waiter_target")" ]; then
        # TIMEOUT exceeded: kill it
        log 2 "Child process $waiter_target is running after timeout $TIMEOUT sec," \
            "sending SIGKILL"
        kill_func -9 "-$waiter_target"
    fi
}
340    
341     ########################################################################
342     #
343     # gLExec specific functions
344     #
345     ########################################################################
346    
# Maps a gLExec exit value to a nagios code: glexecrc_to_nagios <rc>
# Sets the globals rc, code and summary and returns code:
#   0        -> 0 (OK)
#   201..204 -> 2 (CRITICAL)
#   others   -> 1 (WARNING)
glexecrc_to_nagios() {
    rc=$1

    # Severity first ...
    case "$rc" in
        0) code=0 ;;                 # Success
        201|202|203|204) code=2 ;;   # Critical
        *) code=1 ;;                 # Warning
    esac

    # ... then the matching description
    case "$rc" in
        0) summary='success' ;;
        126) summary="executable can't be executed ($rc)" ;;
        201) summary="client error ($rc)" ;;
        202) summary="system error ($rc)" ;;
        203) summary="authorization error ($rc)" ;;
        204) summary="exit code overlap ($rc)" ;;
        *) summary="executable failed with exit code $rc" ;;
    esac

    return $code
}
384    
# Helper for find_glexec: checks whether <dir>/glexec is a regular file.
# If so, stores its path in the global GLEXEC_EXE and returns 0; returns 1
# otherwise.
_find_glexec_in() {
    glexloc="${1}/glexec"
    log 3 "Looking for gLExec at $glexloc"
    if [ -f "$glexloc" ]; then
        log 2 "gLExec found at $glexloc"
        GLEXEC_EXE=$glexloc
        return 0
    fi
    return 1
}

# Searches for gLExec and sets the global GLEXEC_EXE variable when found.
# Search order: $GLEXEC_LOCATION/sbin/glexec (when GLEXEC_LOCATION is set),
# the directories in $PATH, $GLITE_LOCATION/sbin (GLITE_LOCATION is set to
# /opt/glite when unset) and a list of standard system directories.
# GLEXEC_EXE is left untouched when nothing is found. Always returns 0.
# PATH is split on ':' with parameter expansion only, so directories
# containing whitespace or glob characters are handled correctly.
find_glexec() {
    # First look at GLEXEC_LOCATION
    if [ -n "$GLEXEC_LOCATION" ]; then
        log 3 "GLEXEC_LOCATION=$GLEXEC_LOCATION"
        glexloc="${GLEXEC_LOCATION}/sbin/glexec"
        if [ -f "$glexloc" ]; then
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return 0
        else
            log 2 "gLExec not found at \$GLEXEC_LOCATION"
        fi
    fi

    # Set GLITE_LOCATION if unset
    if [ -z "$GLITE_LOCATION" ]; then
        GLITE_LOCATION=/opt/glite
    else
        log 3 "GLITE_LOCATION=$GLITE_LOCATION"
    fi

    # Check in PATH ...
    remaining=$PATH
    while [ -n "$remaining" ]; do
        dir=${remaining%%:*}
        case "$remaining" in
            *:*) remaining=${remaining#*:} ;;
            *) remaining="" ;;
        esac
        # empty PATH components are skipped
        [ -n "$dir" ] || continue
        if _find_glexec_in "$dir"; then
            return 0
        fi
    done

    # ... then in GLITE_LOCATION and extra dirs
    for dir in "${GLITE_LOCATION}/sbin" \
        /usr/local/sbin /usr/sbin /sbin /usr/local/bin /usr/bin; do
        if _find_glexec_in "$dir"; then
            return 0
        fi
    done
    return 0
}
421    
# Full gLExec run: finds the glexec command, validates the proxy settings,
# runs "$GLEXEC_EXE $GLEXEC_CMD_EXE" and reports the result via
# nagios_status, which ends the (sub)shell.
# Reported states:
#   - glexec command not found: CRITICAL (2)
#   - X509_USER_PROXY unset, or X509_USER_PROXY / GLEXEC_CLIENT_CERT not
#     pointing to a non-empty regular file: UNKNOWN (3)
#   - otherwise the code from glexecrc_to_nagios, where a successful run
#     taking longer than WARNTIMEOUT seconds is turned into WARNING (1)
# The summary is followed by perfdata "time=<dt>s;<warn>;<crit>;0".
# Output of the payload is shown only when VERBOSE is greater than 1.
run_glexec() {
    # Store start time
    t1=$(date +%s)

    # Find glexec executable
    find_glexec

    # Test executable
    if [ -z "$GLEXEC_EXE" ]; then
        code=2 # Critical
        summary='glexec command not found.'
        nagios_status "$code" "$summary"
    fi
    # Test proxy variable
    if [ -z "$X509_USER_PROXY" ]; then
        code=3 # Unknown
        summary="\$X509_USER_PROXY is unset."
        nagios_status "$code" "$summary"
    fi
    # Test proxy file
    if [ ! -f "$X509_USER_PROXY" ] || [ ! -s "$X509_USER_PROXY" ]; then
        code=3 # Unknown
        summary="\$X509_USER_PROXY does not point to a nonempty file."
        nagios_status "$code" "$summary"
    fi
    # Test client cert file (both tests must look at GLEXEC_CLIENT_CERT)
    if [ ! -f "$GLEXEC_CLIENT_CERT" ] || [ ! -s "$GLEXEC_CLIENT_CERT" ]; then
        code=3 # Unknown
        summary="\$GLEXEC_CLIENT_CERT does not point to a nonempty file."
        nagios_status "$code" "$summary"
    fi

    log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
    export GLEXEC_CLIENT_CERT X509_USER_PROXY
    # eval is intentional: GLEXEC_CMD_EXE is a command line (command plus
    # arguments) given as a single string
    if [ "$VERBOSE" -gt 1 ]; then
        eval $GLEXEC_EXE $GLEXEC_CMD_EXE 2>&1
        glexec_rc=$?
    else
        eval $GLEXEC_EXE $GLEXEC_CMD_EXE > /dev/null 2>&1
        glexec_rc=$?
    fi
    # Sets the globals code and summary
    glexecrc_to_nagios "$glexec_rc"

    # Elapsed time in seconds
    dt=$(( $(date +%s) - t1 ))

    if [ "$code" -eq 0 ] && [ "$dt" -gt "$WARNTIMEOUT" ]; then
        code=1 # Warning
        summary="gLExec took long time to succeed"
    fi

    perfdata="time=${dt}s;$WARNTIMEOUT;$CRITTIMEOUT;0"
    nagios_status "$code" "$summary|$perfdata"
}
476    
477     ########################################################################
478     #
479 msalle 2454 # main
480 msalle 2451 #
481     ########################################################################
482    
{
    # Job control on: background jobs get their own process group, which
    # allows signalling the whole group of a background process.
    set -m

    # Command line handling (long options are mapped onto short ones)
    parse_args "$@"

    # The actual probe runs in the background so that its runtime can be
    # limited
    run_glexec &
    probe_pid=$!

    # Watchdog, also in the background: signals the probe when it runs too
    # long
    waiter $probe_pid &
    waiter_pid=$!

    # Block until run_glexec is gone, either by itself or through waiter()
    log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
    wait_func $probe_pid
    probe_rc=$?

    # The watchdog is no longer needed: remove it when it is still around
    if [ -n "$(ps -opid= -p $waiter_pid)" ]; then
        log 3 "Cleaning up waiter process $waiter_pid"
        kill_func -$waiter_pid
    fi
    # Reap the watchdog here to prevent logging of its termination at the
    # end of the script
    wait_func $waiter_pid

    # A probe that ended through nagios_status exits with 0-3; a probe that
    # was killed by a signal has an exit value outside that range.
    case "$probe_rc" in
        137)
            # run_glexec ended via SIGKILL
            nagios_status 2 "probe TIMEOUT of $TIMEOUT seconds exceeded"
            ;;
        143)
            # run_glexec ended via SIGTERM
            nagios_status 2 "probe critical range of $CRITTIMEOUT seconds exceeded"
            ;;
        0|1|2|3)
            # Normal ending of run_glexec, which has called nagios_status
            code=$probe_rc
            exit $code
            ;;
        *)
            # run_glexec ended prematurely?!
            nagios_status 3 "background process died unexpectly with rc=$probe_rc"
            ;;
    esac
} | nagios_output

exit $?

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28