/[pdpsoft]/trunk/nagios/glexec/check_glexec
ViewVC logotype

Contents of /trunk/nagios/glexec/check_glexec

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2453 - (show annotations) (download)
Tue Nov 29 16:22:24 2011 UTC (10 years, 1 month ago) by msalle
File size: 11845 byte(s)
Explicitly call wait on waiter subprocess after kill to prevent logging of error
on stderr at end of script.

1 #!/bin/sh
2 #
3 # Copyright (C) Nikhef 2011
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #
17 # Author:
18 # Mischa Sall\'e <msalle@nikhef.nl>
19 # NIKHEF Amsterdam, the Netherlands
20 #
21 ########################################################################
22 #
23 # Nagios probe to test functioning of gLExec
24 #
25 # Nagios state can be one of the following:
26 # - Missing glexec command: CRITICAL
27 # - input proxies empty: UNKNOWN
28 # - short timeout exceeded: WARNING
29 # - timeout exceeded: CRITICAL
30 # - gLExec exit codes:
31 # 0 glexec succeeded: OK
32 # 201 Client error: CRITICAL
33 # 202 Internal error: CRITICAL
34 # 203 Auth error: CRITICAL
35 # 204 Overlap: CRITICAL
36 # 126 execve failed: WARNING
37 # 128+n signal: WARNING
38 # !=0 rc of payload: WARNING
39 #
40 ########################################################################
41
# version
VERSION=0.1

# plugin name: strip the directory part with a parameter expansion instead of
# an unquoted `basename $0`, which is word-split when the path holds whitespace
PROG=${0##*/}
# default payload command and its arguments; parse_args combines them into
# DEF_GLEXEC_CMD_EXE once the command has been located in PATH
DEF_GLEXEC_CMD="id"
DEF_GLEXEC_CMD_ARGS="-a"
# glexec command itself, filled in by find_glexec
GLEXEC_EXE=""
# Default verbosity
VERBOSE=0
# Default timeout (seconds)
TIMEOUT=10
# Default short timeout (seconds), longer than this results in warning
SHORTTIMEOUT=5
# Default GLEXEC_CLIENT_CERT will be set after arguments are parsed
GLEXEC_CLIENT_CERT=""
60
61 ########################################################################
62 #
63 # general options
64 #
65 ########################################################################
66
# One-line synopsis; prints it on stdout and terminates the script with 0.
# Reads global PROG.
shortusage() {
    printf '%s\n' "Usage: $PROG [options]"
    exit 0
}
72
# Full help text: prints synopsis plus option overview on stdout and
# terminates the script with 0.
# Reads globals PROG, TIMEOUT, SHORTTIMEOUT and DEF_GLEXEC_CMD_EXE.
usage() {
    printf '%s\n' \
        "Usage: $PROG [options]" \
        "" \
        "Options:" \
        " -t|--timeout <timeout> maximum runtime for probe, default $TIMEOUT sec" \
        " -s|--shorttimeout <timeout> runtime after which to warn, default $SHORTTIMEOUT sec" \
        " -x|--x509-user-proxy <file> set X509_USER_PROXY to given file" \
        " -g|--glexec-client-cert <file> set GLEXEC_CLIENT_CERT to given file" \
        " default: value of variable X509_USER_PROXY" \
        " -e|--execute <cmd> command to execute by gLExec" \
        " default: \"$DEF_GLEXEC_CMD_EXE\"" \
        " -v|--verbose be more verbose, more -v means more verbosity" \
        " -V|--version print version" \
        " -h|--help show this helptext"
    exit 0
}
90
# log <level> <message>...
# Prints every message argument on its own line on stdout, but only when the
# global VERBOSE is at least <level>; otherwise stays silent.
log() {
    min_verbosity=$1
    shift
    # Below the requested verbosity: nothing to print
    [ "$VERBOSE" -ge "$min_verbosity" ] || return 0
    for msg in "$@" ; do
        echo "$msg"
    done
}
101
# nagios_printout <code> <summary>...
# Prints the nagios status line "<stat>: <summary>" via log (level 0, i.e.
# always) and terminates the script with <code> as exit status.
#   0 -> OK, 1 -> WARNING, 2 -> CRITICAL, 3 -> UNKNOWN
# Any other code is reported as "INVALID NAGIOS CODE <code>".
# Sets globals: code, summary, stat.
nagios_printout() {
    code=$1
    shift
    summary="$*"
    case "$code" in
        0)  stat='OK' ;;
        1)  stat='WARNING' ;;
        2)  stat='CRITICAL' ;;
        3)  stat='UNKNOWN' ;;
        # Double quotes and $code: $1 has been shifted away by now and single
        # quotes would print the variable name literally
        *)  stat="INVALID NAGIOS CODE $code" ;;
    esac
    log 0 "$stat: $summary"
    # exit needs a number; treat anything else as UNKNOWN instead of letting
    # the shell abort with its own error status
    case "$code" in
        ''|*[!0-9]*) exit 3 ;;
    esac
    exit "$code"
}
117
# parse_err <line>...
# Reports a command line parsing problem: every line is logged at verbosity
# level 2, and the first non-empty one becomes the summary of an UNKNOWN (3)
# nagios status, with which the script terminates.
parse_err() {
    summary=""
    for line in "$@" ; do
        if [ -z "$summary" ];then
            summary="$line"
        fi
        log 2 "$line"
    done
    nagios_printout 3 "$summary"
}
128
# which_cmd <name>
# Minimal replacement for which(1), which might not exist: walks the
# directories in PATH and prints the full path of the first regular,
# executable file called <name>.
# Output:  the path on stdout, nothing when not found
# Returns: 0 when found, 1 otherwise
which_cmd() {
    # Walk PATH with parameter expansion rather than `echo $PATH|tr : ' '`,
    # so directories containing whitespace stay in one piece
    rest="${PATH}:"
    while [ -n "$rest" ] ; do
        dir=${rest%%:*}
        rest=${rest#*:}
        # Empty PATH entries are skipped
        [ -n "$dir" ] || continue
        cmd="${dir}/$1"
        # Callers capture stdout as the result: keep diagnostics out of it
        log 3 "Looking for $1 in $dir" >&2
        if [ -f "$cmd" ] && [ -x "$cmd" ];then
            printf '%s\n' "$cmd"
            return 0
        fi
    done
    return 1
}
140
# parse_range <range>
# Takes a nagios threshold range and prints its upper limit; the @ prefix is
# currently not understood.
# see http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Output:  the upper limit on stdout; an empty line when the range has no
#          upper limit (e.g. "10:")
# Returns: 0 for a range that could be parsed, 1 otherwise (nothing printed)
# Sets global: uplimit
parse_range() {
    [ $# -gt 1 ] && return 1
    # Pattern matching instead of `echo $1 | grep`: the unquoted echo was
    # subject to globbing and swallowed values such as "-n" as options
    case "$1" in
        *[!~:0-9]*) return 1 ;;
    esac
    # find upperlimit: everything after the first colon, or the whole value
    case "$1" in
        *:*) uplimit=${1#*:} ;;
        *)   uplimit=$1 ;;
    esac
    # the limit itself must consist of digits only
    case "$uplimit" in
        *[!0-9]*) return 1 ;;
    esac
    # print limit, note: might be empty
    printf '%s\n' "$uplimit"
    return 0
}
157
# parse_short_args <short option>...
# Parses the (short) command line options and stores the result in the
# globals GLEXEC_CLIENT_CERT, X509_USER_PROXY, TIMEOUT, SHORTTIMEOUT,
# GLEXEC_CMD_EXE and VERBOSE. Invalid input ends the script via parse_err.
#   -c and -t both set the (critical) timeout,
#   -w and -s both set the short (warning) timeout,
#   -H, -p and -u are accepted but ignored.
parse_short_args() {
    # Start at the first argument, also when getopts has been used before
    OPTIND=1
    # "s:" is part of the option string: -s is what usage documents and what
    # parse_args generates for --shorttimeout
    while getopts ":g:x:t:c:w:s:e:hvVH:p:u:" i ; do
        case "$i" in
            g)  GLEXEC_CLIENT_CERT="$OPTARG" ;;
            x)  X509_USER_PROXY="$OPTARG" ;;
            c|t)
                TIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$TIMEOUT" ];then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "TIMEOUT set to $TIMEOUT"
                fi
                ;;
            w|s)
                SHORTTIMEOUT=$(parse_range "$OPTARG")
                if [ -z "$SHORTTIMEOUT" ];then
                    parse_err "Not a valid timeout: \"$OPTARG\""
                else
                    log 2 "SHORTTIMEOUT set to $SHORTTIMEOUT"
                fi
                ;;
            e)  GLEXEC_CMD_EXE="$OPTARG" ;;
            h)  shortusage ;;
            V)  nagios_printout 0 "$PROG version $VERSION" ;;
            v)  VERBOSE=$((VERBOSE + 1)) ;;
            H)  log 2 "$PROG: option -H/--hostname is not used" ;;
            p)  log 2 "$PROG: option -p/--port is not used" ;;
            u)  log 2 "$PROG: option -u/--url is not used" ;;
            :)  parse_err "Option requires an argument -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
            \?) parse_err "Invalid option -- '$OPTARG'" \
                    "Try \`$PROG -h' for more information."
                ;;
        esac
    done
    # Check if we specified GLEXEC_CLIENT_CERT
    if [ -z "$GLEXEC_CLIENT_CERT" ];then
        GLEXEC_CLIENT_CERT="$X509_USER_PROXY"
        log 2 "Using same proxy for GLEXEC_CLIENT_CERT and X509_USER_PROXY"
    fi
    # Check if we specified a command
    if [ -z "$GLEXEC_CMD_EXE" ];then
        GLEXEC_CMD_EXE="$DEF_GLEXEC_CMD_EXE"
        log 2 "Using default payload command \"$GLEXEC_CMD_EXE\""
    fi
}
204
# parse_args <command line argument>...
# Converts long options into the corresponding short options and hands the
# resulting list to parse_short_args. Also determines the default payload
# command DEF_GLEXEC_CMD_EXE by looking up DEF_GLEXEC_CMD in PATH.
# --help prints the long usage text and exits; an unknown long option ends
# the script via parse_err.
parse_args() {
    # Find default executable
    cmd=$(which_cmd "$DEF_GLEXEC_CMD")
    if [ -n "$cmd" ];then
        DEF_GLEXEC_CMD_EXE="$cmd $DEF_GLEXEC_CMD_ARGS"
    fi
    # The converted arguments are appended to the positional parameters and
    # the originals shifted away afterwards. This keeps every argument intact
    # as a single word without passing user input through eval.
    nargs=$#
    for arg in "$@" ; do
        case "$arg" in
            --x509-user-proxy)    set -- "$@" -x ;;
            # -g, not -c: -c is the critical timeout
            --glexec-client-cert) set -- "$@" -g ;;
            --execute)            set -- "$@" -e ;;
            --timeout)            set -- "$@" -t ;;
            --shorttimeout)       set -- "$@" -s ;;
            --verbose)            set -- "$@" -v ;;
            --version)            set -- "$@" -V ;;
            --help)               usage ;;
            # Unused long options:
            --hostname)           set -- "$@" -H ;;
            --port)               set -- "$@" -p ;;
            --url)                set -- "$@" -u ;;
            --warning)            set -- "$@" -w ;;
            --critical)           set -- "$@" -c ;;
            --*)
                subarg=${arg#--}
                parse_err "$PROG: invalid longoption -- '$subarg'" \
                    "Try \`$PROG -h' for more information."
                ;;
            *)                    set -- "$@" "$arg" ;;
        esac
    done
    shift "$nargs"
    # Now parse the resulting short options
    parse_short_args "$@"
}
243
# waiter <pid>
# Watchdog for a background process: sleeps for TIMEOUT seconds and then
# looks whether <pid> still shows up in ps. When it does, the process is
# killed with SIGKILL and the shell exits with 3; when it has already gone,
# the shell exits with 0. Meant to be started in the background itself.
waiter() {
    pid=$1
    sleep $TIMEOUT

    # Process has finished in the meantime: nothing left to do
    if [ -z "`ps -opid= -p $pid`" ];then
        exit 0
    fi

    # TIMEOUT exceeded: kill it
    log 2 "Child process $pid is still running after timeout $TIMEOUT"
    kill -9 $pid
    exit 3
}
259
260 ########################################################################
261 #
262 # gLExec specific functions
263 #
264 ########################################################################
265
# glexecrc_to_nagios <glexec exit value>
# Translates a gLExec exit value into a nagios code plus a summary text.
# Sets globals: rc, code, summary
# Returns:      the nagios code (0 OK, 1 WARNING, 2 CRITICAL)
glexecrc_to_nagios() {
    rc=$1

    # Severity: success is OK, gLExec's own errors (201-204) are CRITICAL,
    # everything else, including 126, is only a WARNING
    case "$rc" in
        0)               code=0 ;;
        201|202|203|204) code=2 ;;
        *)               code=1 ;;
    esac

    # Matching human readable summary
    case "$rc" in
        0)   summary='success' ;;
        126) summary="executable can't be executed ($rc)" ;;
        201) summary="client error ($rc)" ;;
        202) summary="system error ($rc)" ;;
        203) summary="authorization error ($rc)" ;;
        204) summary="exit code overlap ($rc)" ;;
        *)   summary="executable failed with exit code $rc" ;;
    esac

    return $code
}
303
# Locates the glexec executable and stores its path in the global
# GLEXEC_EXE; GLEXEC_EXE is left untouched when nothing is found.
# Search order: $GLEXEC_LOCATION/sbin, the directories in PATH,
# $GLITE_LOCATION/sbin (GLITE_LOCATION is set to /opt/glite when empty) and
# finally a fixed list of system directories.
find_glexec() {
    # An explicitly configured GLEXEC_LOCATION is tried first
    if [ -n "$GLEXEC_LOCATION" ];then
        log 3 "GLEXEC_LOCATION=$GLEXEC_LOCATION"
        glexloc="${GLEXEC_LOCATION}/sbin/glexec"
        if [ ! -f "$glexloc" ];then
            log 2 "gLExec not found at \$GLEXEC_LOCATION"
        else
            log 2 "gLExec found at $glexloc"
            GLEXEC_EXE=$glexloc
            return
        fi
    fi

    # Fall back on the default gLite installation directory
    if [ -n "$GLITE_LOCATION" ];then
        log 3 "GLITE_LOCATION=$GLITE_LOCATION"
    else
        GLITE_LOCATION=/opt/glite
    fi

    # Candidate directories: PATH, GLITE_LOCATION and extra dirs
    searchdirs="$(echo $PATH|tr : ' ') ${GLITE_LOCATION}/sbin"
    searchdirs="$searchdirs /usr/local/sbin /usr/sbin /sbin /usr/local/bin /usr/bin"
    for dir in $searchdirs ; do
        glexloc="${dir}/glexec"
        log 3 "Looking for gLExec at $glexloc"
        [ -f "$glexloc" ] || continue
        log 2 "gLExec found at $glexloc"
        GLEXEC_EXE=$glexloc
        return
    done
}
340
# Full run:
# - search for glexec command
# - check the proxy and client certificate files
# - run glexec
# - print nagios status
# Never returns: every path ends in nagios_printout, which exits with the
# nagios code:
#   CRITICAL when glexec cannot be found,
#   UNKNOWN  when X509_USER_PROXY or GLEXEC_CLIENT_CERT is not a nonempty file,
#   otherwise the code from glexecrc_to_nagios, where a successful run that
#   took longer than SHORTTIMEOUT seconds is downgraded to WARNING.
run_glexec() {
    # Store start time
    t1=$(date +%s)

    # Find glexec executable
    find_glexec

    # Test executable
    if [ -z "$GLEXEC_EXE" ] ; then
        nagios_printout 2 'glexec command not found.'
    fi
    # Test proxy variable
    if [ -z "$X509_USER_PROXY" ] ; then
        nagios_printout 3 "\$X509_USER_PROXY is unset."
    fi
    # Test proxy file
    if [ ! -f "$X509_USER_PROXY" ] || [ ! -s "$X509_USER_PROXY" ] ; then
        nagios_printout 3 "\$X509_USER_PROXY does not point to a nonempty file."
    fi
    # Test client certificate file; the size check has to look at
    # GLEXEC_CLIENT_CERT itself, not at the proxy
    if [ ! -f "$GLEXEC_CLIENT_CERT" ] || [ ! -s "$GLEXEC_CLIENT_CERT" ] ; then
        nagios_printout 3 "\$GLEXEC_CLIENT_CERT does not point to a nonempty file."
    fi

    log 2 "Running $GLEXEC_EXE $GLEXEC_CMD_EXE"
    export GLEXEC_CLIENT_CERT X509_USER_PROXY
    # The payload command is a command line and still goes through eval; the
    # glexec path is kept as a single word. Output is only shown when verbose.
    if [ "$VERBOSE" -gt 1 ];then
        eval "\"\$GLEXEC_EXE\" $GLEXEC_CMD_EXE" 2>&1
        glexec_rc=$?
    else
        eval "\"\$GLEXEC_EXE\" $GLEXEC_CMD_EXE" > /dev/null 2>&1
        glexec_rc=$?
    fi
    # Sets code and summary
    glexecrc_to_nagios $glexec_rc

    # Elapsed time in seconds
    dt=$(( $(date +%s) - t1 ))

    if [ "$code" -eq 0 ] && [ "$dt" -gt "$SHORTTIMEOUT" ];then
        code=1 # Warning
        summary="gLExec took long time to succeed"
    fi

    perfdata="time=${dt}s;$SHORTTIMEOUT;$TIMEOUT;0"
    nagios_printout $code "$summary|$perfdata"
}
397
398 ########################################################################
399 #
400 # main program
401 #
402 ########################################################################
403
# Parse cmdline arguments (long ones are converted in corresponding short ones)
parse_args "$@"

# The probe itself runs in the background, so that its runtime can be bounded
run_glexec &
probe_pid=$!

# Watchdog, also in the background: kills the probe once TIMEOUT has passed
waiter $probe_pid &
waiter_pid=$!

# Block until the probe ends, either by itself or at the hands of the
# watchdog; the shell's own message about that is only shown when verbose
log 3 "Waiting at most $TIMEOUT seconds for probe $probe_pid to finish"
if [ $VERBOSE -lt 1 ];then
    wait $probe_pid 2> /dev/null
else
    wait $probe_pid
fi
probe_rc=$?

# The watchdog is no longer needed: stop it when it is still around
if [ -n "$(ps -opid= -p $waiter_pid)" ];then
    kill $waiter_pid 2> /dev/null
    # Call wait here to prevent logging of termination at end of script
    wait $waiter_pid 2> /dev/null
fi

# A probe that was killed has an exit value outside the valid nagios range
# of 0-3: report that as a timeout (nagios_printout exits the script)
if [ $probe_rc -gt 3 ];then
    code=2 # Critical
    timeout_msg="probe TIMEOUT of $TIMEOUT seconds exceeded"
    if [ $VERBOSE -ne 0 ];then
        timeout_msg="$timeout_msg (rc=$probe_rc)"
    fi
    nagios_printout $code "$timeout_msg"
fi

# run_glexec has finished by itself: its exit value is the nagios code
code=$probe_rc
exit $probe_rc

Properties

Name Value
svn:executable *

grid.support@nikhef.nl
ViewVC Help
Powered by ViewVC 1.1.28